// Copyright 2022 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.

// maxPartLength is the maximum length, in bytes, of the author and
// title parts produced by formatAuthors and formatTitle.
const maxPartLength = 48

// truncateAtRune shortens s to at most limit bytes without cutting
// through the middle of a multi-byte UTF-8 sequence. Strings that
// already fit are returned unchanged.
func truncateAtRune(s string, limit int) string {
	if len(s) <= limit {
		return s
	}

	// Ranging over a string yields the byte offset at which each rune
	// starts, so the last offset not exceeding limit is a safe cut point.
	cut := 0
	for i := range s {
		if i > limit {
			break
		}
		cut = i
	}
	return s[:cut]
}

// formatAuthors formats a list of authors by just selecting
// the first one listed, and returning the uppercased final
// name with anything that is not a letter removed. The result
// is at most maxPartLength bytes long. An empty list gives an
// empty string.
func formatAuthors(authors []string) string {
	if len(authors) == 0 {
		return ""
	}

	s := authors[0]

	// Use the last whitespace-separated word as the surname.
	if parts := strings.Fields(s); len(parts) > 1 {
		s = parts[len(parts)-1]
	}

	// Strip before truncating, as formatTitle does, so that removed
	// characters do not count towards the length limit.
	s = strings.Map(stripNonLetters, strings.ToUpper(s))

	return truncateAtRune(s, maxPartLength)
}

// stripNonLetters is a function for strings.Map which drops
// every rune that is not a letter (by returning -1 for it) and
// keeps letters unchanged.
func stripNonLetters(r rune) rune {
	if !unicode.IsLetter(r) {
		return -1
	}
	return r
}

// formatTitle formats a title to our preferences, notably
// by stripping spaces and punctuation characters. The result
// is at most maxPartLength bytes long and is never cut in the
// middle of a multi-byte character.
func formatTitle(title string) string {
	s := strings.Map(stripNonLetters, title)
	return truncateAtRune(s, maxPartLength)
}
+func getMetadata(id string) (string, string, string, error) { + var author, title, year string + url := fmt.Sprintf("https://www.googleapis.com/books/v1/volumes/%s", id) + + // designed to be unmarshalled by encoding/json's Unmarshal() + type bookInfo struct { + VolumeInfo struct { + Title string + Authors []string + PublishedDate string + } + } + + resp, err := http.Get(url) + if err != nil { + return author, title, year, fmt.Errorf("Error downloading metadata %s: %v", url, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return author, title, year, fmt.Errorf("Error downloading metadata %s: %v", url, err) + } + + b, err := ioutil.ReadAll(resp.Body) + if err != nil { + return author, title, year, fmt.Errorf("Error reading metadata %s: %v", url, err) + } + + v := bookInfo{} + err = json.Unmarshal(b, &v) + if err != nil { + return author, title, year, fmt.Errorf("Error parsing metadata %s: %v", url, err) + } + + author = formatAuthors(v.VolumeInfo.Authors) + title = formatTitle(v.VolumeInfo.Title) + year = v.VolumeInfo.PublishedDate + + return author, title, year, nil +} + +// moveFile just copies a file to the destination without +// using os.Rename, as that can fail if crossing filesystem +// boundaries +func moveFile(from string, to string) error { + ffrom, err := os.Open(from) + if err != nil { + return err + } + defer ffrom.Close() + + fto, err := os.Create(to) + if err != nil { + return err + } + defer fto.Close() + + _, err = io.Copy(fto, ffrom) + if err != nil { + return err + } + + ffrom.Close() + err = os.Remove(from) + if err != nil { + return err + } + + return nil +} + +// getGoogleBook downloads all images of a book to a directory +// named YEAR_AUTHORSURNAME_Title_bookid inside basedir, returning +// the directory path +func getGoogleBook(ctx context.Context, id string, basedir string) (string, error) { + author, title, year, err := getMetadata(id) + if err != nil { + return "", err + } + + tmpdir, err := 
ioutil.TempDir("", "bookpipeline") + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + + // TODO: use embedded version if necessary + cmd := exec.CommandContext(ctx, "getgbook", id) + pipeline.HideCmd(cmd) + cmd.Dir = tmpdir + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err = cmd.Run() + if err != nil { + return "", fmt.Errorf("Error running getgbook %s: %v", id, err) + } + + select { + case <-ctx.Done(): + _ = os.Remove(tmpdir) + return "", ctx.Err() + default: + } + + // getgbook downloads into id directory, so move files out of + // there directly into dir + tmpdir = path.Join(tmpdir, id) + f, err := os.Open(tmpdir) + if err != nil { + return "", fmt.Errorf("Failed to open %s to move files: %v", tmpdir, err) + } + files, err := f.Readdir(0) + if err != nil { + return "", fmt.Errorf("Failed to readdir %s to move files: %v", tmpdir, err) + } + + d := fmt.Sprintf("%s_%s_%s_%s", year, author, title, id) + dir := path.Join(basedir, d) + err = os.MkdirAll(dir, 0755) + if err != nil { + return "", fmt.Errorf("Couldn't create directory %s: %v", dir, err) + } + + for _, v := range files { + orig := path.Join(tmpdir, v.Name()) + new := path.Join(dir, v.Name()) + err = moveFile(orig, new) + if err != nil { + return dir, fmt.Errorf("Failed to move %s to %s: %v", orig, new, err) + } + } + + err = os.Remove(tmpdir) + if err != nil { + return dir, fmt.Errorf("Failed to remove temporary directory %s: %v", tmpdir, err) + } + + return dir, nil +} diff --git a/cmd/rescribe/gui.go b/cmd/rescribe/gui.go index b8449b5..fde660a 100644 --- a/cmd/rescribe/gui.go +++ b/cmd/rescribe/gui.go @@ -26,6 +26,8 @@ import ( ) var progressPoints = map[float64]string{ + 0.11: "Downloading", + 0.12: "Processing PDF", 0.2: "Preprocessing", 0.5: "OCRing", 0.9: "Analysing", @@ -327,7 +329,7 @@ func startGui(log log.Logger, cmd string, training string, tessdir string) error if dirEntry.Text == "" { dirEntry.SetText(homeDir) } - 
dir.SetText(fmt.Sprintf("Google Book: %s Savedir: %s", id, dirEntry.Text)) + dir.SetText(fmt.Sprintf("Google Book: %s Save to: %s", id, dirEntry.Text)) dirIcon.SetResource(theme.SearchIcon()) myWindow.SetContent(fullContent) gobtn.Enable() @@ -452,18 +454,37 @@ func startGui(log log.Logger, cmd string, training string, tessdir string) error progressBar.SetValue(0.1) - if strings.HasPrefix(dir.Text, "Google Book:") { - // TODO - dialog.ShowError(errors.New("Coming soon"), myWindow) - progressBar.SetValue(0.0) - for _, v := range []fyne.Disableable{folderBtn, pdfBtn, gbookBtn, trainingOpts, gobtn} { - v.Enable() + if strings.HasPrefix(dir.Text, "Google Book: ") { + progressBar.SetValue(0.11) + start := len("Google Book: ") + bookname = dir.Text[start:start+12] + + start = start + 12 + len(" Save to: ") + bookdir = dir.Text[start:] + savedir = bookdir + + fmt.Printf("Downloading Google Book\n") + d, err := getGoogleBook(ctx, bookname, bookdir) + if err != nil { + if !strings.HasSuffix(err.Error(), "signal: killed") { + msg := fmt.Sprintf("Error downloading Google Book %s: %v\n", bookname, err) + dialog.ShowError(errors.New(msg), myWindow) + fmt.Fprintf(os.Stderr, msg) + } + progressBar.SetValue(0.0) + for _, v := range []fyne.Disableable{folderBtn, pdfBtn, gbookBtn, trainingOpts, gobtn} { + v.Enable() + } + abortbtn.Disable() + return } - abortbtn.Disable() - return + bookdir = d + savedir = d + bookname = filepath.Base(d) } if strings.HasSuffix(dir.Text, ".pdf") && !f.IsDir() { + progressBar.SetValue(0.12) bookdir, err = extractPdfImgs(ctx, bookdir) if err != nil { if !strings.HasSuffix(err.Error(), "context canceled") { -- cgit v1.2.1-24-ge1ad