summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- cmd/rescribe/gbook.go 209
-rw-r--r-- cmd/rescribe/gui.go 39
2 files changed, 239 insertions, 9 deletions
diff --git a/cmd/rescribe/gbook.go b/cmd/rescribe/gbook.go
new file mode 100644
index 0000000..b1308cf
--- /dev/null
+++ b/cmd/rescribe/gbook.go
@@ -0,0 +1,209 @@
+// Copyright 2022 Nick White.
+// Use of this source code is governed by the GPLv3
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net/http"
+ "os"
+ "os/exec"
+ "path"
+ "strings"
+ "unicode"
+
+ "rescribe.xyz/bookpipeline/internal/pipeline"
+)
+
+// maxPartLength is the maximum length, in bytes, of the author
+// and title strings produced by formatAuthors and formatTitle.
+const maxPartLength = 48
+
+// formatAuthors reduces a list of authors to a single token:
+// the last whitespace-separated word of the first author,
+// uppercased, with every non-letter rune removed and limited
+// to maxPartLength bytes. It returns "" for an empty list.
+func formatAuthors(authors []string) string {
+	if len(authors) == 0 {
+		return ""
+	}
+
+	s := authors[0]
+	if parts := strings.Fields(s); len(parts) > 1 {
+		s = parts[len(parts)-1]
+	}
+
+	// Strip before truncating, so the length budget is not spent
+	// on characters that would be removed anyway.
+	s = strings.Map(stripNonLetters, strings.ToUpper(s))
+
+	if len(s) > maxPartLength {
+		// Cut at the last rune start at or before maxPartLength
+		// so a multi-byte letter is never split in half. Ranging
+		// over a string yields the byte index of each rune start.
+		cut := 0
+		for i := range s {
+			if i > maxPartLength {
+				break
+			}
+			cut = i
+		}
+		s = s[:cut]
+	}
+
+	return s
+}
+
+// stripNonLetters is a mapping function for strings.Map: it
+// keeps letters as they are and returns -1 for every other
+// rune, which tells strings.Map to drop that rune.
+func stripNonLetters(r rune) rune {
+	if unicode.IsLetter(r) {
+		return r
+	}
+	return -1
+}
+
+// formatTitle formats a title to our preferences: every rune
+// that is not a letter (spaces, punctuation, digits) is removed
+// and the result is limited to maxPartLength bytes.
+func formatTitle(title string) string {
+	s := strings.Map(stripNonLetters, title)
+	if len(s) <= maxPartLength {
+		return s
+	}
+
+	// Cut at the last rune start at or before maxPartLength so a
+	// multi-byte letter is never split, which would leave invalid
+	// UTF-8 in the result. Ranging over a string yields the byte
+	// index of each rune start.
+	cut := 0
+	for i := range s {
+		if i > maxPartLength {
+			break
+		}
+		cut = i
+	}
+	return s[:cut]
+}
+
+// getMetadata queries the Google Books volumes API for the book
+// with the given id and returns, in order, the formatted author
+// (see formatAuthors), the formatted title (see formatTitle) and
+// the published date exactly as the API reports it.
+//
+// An error is returned if the request fails, the server answers
+// with anything other than 200 OK, or the response body cannot
+// be read or parsed; the three strings are empty in that case.
+func getMetadata(id string) (string, string, string, error) {
+	var author, title, year string
+	url := fmt.Sprintf("https://www.googleapis.com/books/v1/volumes/%s", id)
+
+	// designed to be unmarshalled by encoding/json's Unmarshal(),
+	// which matches JSON keys to field names case-insensitively
+	type bookInfo struct {
+		VolumeInfo struct {
+			Title         string
+			Authors       []string
+			PublishedDate string
+		}
+	}
+
+	resp, err := http.Get(url)
+	if err != nil {
+		return author, title, year, fmt.Errorf("Error downloading metadata %s: %v", url, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// err is nil on this path, so report the HTTP status
+		// rather than printing "<nil>".
+		return author, title, year, fmt.Errorf("Error downloading metadata %s: %s", url, resp.Status)
+	}
+
+	b, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return author, title, year, fmt.Errorf("Error reading metadata %s: %v", url, err)
+	}
+
+	v := bookInfo{}
+	err = json.Unmarshal(b, &v)
+	if err != nil {
+		return author, title, year, fmt.Errorf("Error parsing metadata %s: %v", url, err)
+	}
+
+	author = formatAuthors(v.VolumeInfo.Authors)
+	title = formatTitle(v.VolumeInfo.Title)
+	year = v.VolumeInfo.PublishedDate
+
+	return author, title, year, nil
+}
+
+// moveFile moves a file by copying its contents to the
+// destination and then deleting the original. It does not use
+// os.Rename, as that can fail if crossing filesystem boundaries.
+//
+// The original is only removed once the copy has been written
+// and the destination closed without error, so a failed move
+// never loses the source file.
+func moveFile(from string, to string) error {
+	src, err := os.Open(from)
+	if err != nil {
+		return err
+	}
+	// Covers the early returns; the explicit Close further down
+	// makes this a harmless second close on the success path.
+	defer src.Close()
+
+	dst, err := os.Create(to)
+	if err != nil {
+		return err
+	}
+
+	if _, err = io.Copy(dst, src); err != nil {
+		dst.Close()
+		return err
+	}
+
+	// Close can report a deferred write error, so it has to be
+	// checked before the original is deleted.
+	if err = dst.Close(); err != nil {
+		return err
+	}
+
+	// Close the source before removing it; removing an open
+	// file fails on some platforms.
+	if err = src.Close(); err != nil {
+		return err
+	}
+
+	return os.Remove(from)
+}
+
+// getGoogleBook downloads all images of a book to a directory
+// named YEAR_AUTHORSURNAME_Title_bookid inside basedir, returning
+// the directory path.
+//
+// The metadata for the name comes from getMetadata. The images
+// are fetched by running the external getgbook command in a
+// temporary directory, and are then moved into the final
+// directory. The temporary directory is removed on every return
+// path. Cancelling ctx stops the getgbook command.
+//
+// If an error occurs after the final directory has been created,
+// its path is returned together with the error.
+func getGoogleBook(ctx context.Context, id string, basedir string) (string, error) {
+	author, title, year, err := getMetadata(id)
+	if err != nil {
+		return "", err
+	}
+
+	tmproot, err := ioutil.TempDir("", "bookpipeline")
+	if err != nil {
+		return "", fmt.Errorf("Error setting up temporary directory: %v", err)
+	}
+	// Best-effort cleanup for the error returns below. The success
+	// path removes the directory explicitly and reports failure,
+	// after which this call is a no-op.
+	defer os.RemoveAll(tmproot)
+
+	// TODO: use embedded version if necessary
+	cmd := exec.CommandContext(ctx, "getgbook", id)
+	// NOTE(review): presumably suppresses a console window for the
+	// child process on some platforms; confirm in internal/pipeline.
+	pipeline.HideCmd(cmd)
+	cmd.Dir = tmproot
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	err = cmd.Run()
+	if err != nil {
+		return "", fmt.Errorf("Error running getgbook %s: %v", id, err)
+	}
+
+	// The command may have finished cleanly just as the context
+	// was cancelled; don't go on to create the final directory.
+	if err = ctx.Err(); err != nil {
+		return "", err
+	}
+
+	// getgbook downloads into id directory, so move files out of
+	// there directly into dir
+	dldir := path.Join(tmproot, id)
+	f, err := os.Open(dldir)
+	if err != nil {
+		return "", fmt.Errorf("Failed to open %s to move files: %v", dldir, err)
+	}
+	files, err := f.Readdir(0)
+	// Close the handle straight away: it is no longer needed, and
+	// an open handle can stop the directory being removed later.
+	f.Close()
+	if err != nil {
+		return "", fmt.Errorf("Failed to readdir %s to move files: %v", dldir, err)
+	}
+
+	d := fmt.Sprintf("%s_%s_%s_%s", year, author, title, id)
+	dir := path.Join(basedir, d)
+	err = os.MkdirAll(dir, 0755)
+	if err != nil {
+		return "", fmt.Errorf("Couldn't create directory %s: %v", dir, err)
+	}
+
+	for _, v := range files {
+		orig := path.Join(dldir, v.Name())
+		dest := path.Join(dir, v.Name())
+		err = moveFile(orig, dest)
+		if err != nil {
+			return dir, fmt.Errorf("Failed to move %s to %s: %v", orig, dest, err)
+		}
+	}
+
+	// Remove the whole temporary tree, not just the id directory
+	// inside it.
+	err = os.RemoveAll(tmproot)
+	if err != nil {
+		return dir, fmt.Errorf("Failed to remove temporary directory %s: %v", tmproot, err)
+	}
+
+	return dir, nil
+}
diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go
index b8449b5..fde660a 100644
--- a/cmd/rescribe/gui.go
+++ b/cmd/rescribe/gui.go
@@ -26,6 +26,8 @@ import (
)
var progressPoints = map[float64]string{
+ 0.11: "Downloading",
+ 0.12: "Processing PDF",
0.2: "Preprocessing",
0.5: "OCRing",
0.9: "Analysing",
@@ -327,7 +329,7 @@ func startGui(log log.Logger, cmd string, training string, tessdir string) error
if dirEntry.Text == "" {
dirEntry.SetText(homeDir)
}
- dir.SetText(fmt.Sprintf("Google Book: %s Savedir: %s", id, dirEntry.Text))
+ dir.SetText(fmt.Sprintf("Google Book: %s Save to: %s", id, dirEntry.Text))
dirIcon.SetResource(theme.SearchIcon())
myWindow.SetContent(fullContent)
gobtn.Enable()
@@ -452,18 +454,37 @@ func startGui(log log.Logger, cmd string, training string, tessdir string) error
progressBar.SetValue(0.1)
- if strings.HasPrefix(dir.Text, "Google Book:") {
- // TODO
- dialog.ShowError(errors.New("Coming soon"), myWindow)
- progressBar.SetValue(0.0)
- for _, v := range []fyne.Disableable{folderBtn, pdfBtn, gbookBtn, trainingOpts, gobtn} {
- v.Enable()
+ if strings.HasPrefix(dir.Text, "Google Book: ") {
+ progressBar.SetValue(0.11)
+ start := len("Google Book: ")
+ bookname = dir.Text[start:start+12]
+
+ start = start + 12 + len(" Save to: ")
+ bookdir = dir.Text[start:]
+ savedir = bookdir
+
+ fmt.Printf("Downloading Google Book\n")
+ d, err := getGoogleBook(ctx, bookname, bookdir)
+ if err != nil {
+ if !strings.HasSuffix(err.Error(), "signal: killed") {
+ msg := fmt.Sprintf("Error downloading Google Book %s: %v\n", bookname, err)
+ dialog.ShowError(errors.New(msg), myWindow)
+ fmt.Fprintf(os.Stderr, msg)
+ }
+ progressBar.SetValue(0.0)
+ for _, v := range []fyne.Disableable{folderBtn, pdfBtn, gbookBtn, trainingOpts, gobtn} {
+ v.Enable()
+ }
+ abortbtn.Disable()
+ return
}
- abortbtn.Disable()
- return
+ bookdir = d
+ savedir = d
+ bookname = filepath.Base(d)
}
if strings.HasSuffix(dir.Text, ".pdf") && !f.IsDir() {
+ progressBar.SetValue(0.12)
bookdir, err = extractPdfImgs(ctx, bookdir)
if err != nil {
if !strings.HasSuffix(err.Error(), "context canceled") {