diff options
Diffstat (limited to 'cmd/rescribe/gbook.go')
-rw-r--r-- | cmd/rescribe/gbook.go | 259 |
1 files changed, 259 insertions, 0 deletions
// Copyright 2022 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.

// maxPartLength is the maximum number of characters kept for
// each of the author and title parts produced by formatAuthors
// and formatTitle.
const maxPartLength = 48

// truncateRunes returns s cut down to at most n runes. It only
// ever slices at the start of a rune, so a multi-byte character
// is never split in half.
func truncateRunes(s string, n int) string {
	seen := 0
	for i := range s {
		if seen == n {
			return s[:i]
		}
		seen++
	}
	return s
}

// stripNonLetters is a function for strings.Map which drops
// every rune that is not a letter (spaces, digits, punctuation
// and so on) and passes letters through unchanged.
func stripNonLetters(r rune) rune {
	if !unicode.IsLetter(r) {
		return -1
	}
	return r
}

// formatAuthors formats a list of authors by just selecting
// the first one listed, and returning the uppercased final
// name, with anything that isn't a letter removed and the
// result limited to maxPartLength characters. It returns an
// empty string if there are no authors.
func formatAuthors(authors []string) string {
	if len(authors) == 0 {
		return ""
	}

	name := authors[0]
	if parts := strings.Fields(name); len(parts) > 1 {
		name = parts[len(parts)-1]
	}

	// Strip before truncating, exactly as formatTitle does, so
	// that characters which are about to be removed anyway don't
	// count towards the length limit.
	name = strings.Map(stripNonLetters, strings.ToUpper(name))

	return truncateRunes(name, maxPartLength)
}

// formatTitle formats a title to our preferences, notably
// by stripping spaces and punctuation characters, and limiting
// the result to maxPartLength characters.
func formatTitle(title string) string {
	return truncateRunes(strings.Map(stripNonLetters, title), maxPartLength)
}

// getMetadata queries Google Books for metadata we care about
// and returns it formatted as we need it: the author, the title
// and the publication date, in that order.
+func getMetadata(id string) (string, string, string, error) { + var author, title, year string + url := fmt.Sprintf("https://www.googleapis.com/books/v1/volumes/%s", id) + + // designed to be unmarshalled by encoding/json's Unmarshal() + type bookInfo struct { + VolumeInfo struct { + Title string + Authors []string + PublishedDate string + } + } + + resp, err := http.Get(url) + if err != nil { + return author, title, year, fmt.Errorf("Error downloading metadata %s: %v", url, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return author, title, year, fmt.Errorf("Error downloading metadata %s: %v", url, err) + } + + b, err := ioutil.ReadAll(resp.Body) + if err != nil { + return author, title, year, fmt.Errorf("Error reading metadata %s: %v", url, err) + } + + v := bookInfo{} + err = json.Unmarshal(b, &v) + if err != nil { + return author, title, year, fmt.Errorf("Error parsing metadata %s: %v", url, err) + } + + author = formatAuthors(v.VolumeInfo.Authors) + title = formatTitle(v.VolumeInfo.Title) + year = v.VolumeInfo.PublishedDate + + return author, title, year, nil +} + +// moveFile just copies a file to the destination without +// using os.Rename, as that can fail if crossing filesystem +// boundaries +func moveFile(from string, to string) error { + ffrom, err := os.Open(from) + if err != nil { + return err + } + defer ffrom.Close() + + fto, err := os.Create(to) + if err != nil { + return err + } + defer fto.Close() + + _, err = io.Copy(fto, ffrom) + if err != nil { + return err + } + + ffrom.Close() + err = os.Remove(from) + if err != nil { + return err + } + + return nil +} + +// getGoogleBook downloads all images of a book to a directory +// named YEAR_AUTHORSURNAME_Title_bookid inside basedir, returning +// the directory path +func getGoogleBook(ctx context.Context, gbookcmd string, id string, basedir string) (string, error) { + author, title, year, err := getMetadata(id) + if err != nil { + return "", err + } + + tmpdir, err 
:= ioutil.TempDir("", "bookpipeline") + if err != nil { + return "", fmt.Errorf("Error setting up temporary directory: %v", err) + } + + cmd := exec.CommandContext(ctx, gbookcmd, id) + pipeline.HideCmd(cmd) + cmd.Dir = tmpdir + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err = cmd.Run() + if err != nil { + return "", fmt.Errorf("Error running getgbook %s: %v", id, err) + } + + select { + case <-ctx.Done(): + _ = os.Remove(tmpdir) + return "", ctx.Err() + default: + } + + // getgbook downloads into id directory, so move files out of + // there directly into dir + tmpdir = path.Join(tmpdir, id) + f, err := os.Open(tmpdir) + if err != nil { + return "", fmt.Errorf("Failed to open %s to move files: %v", tmpdir, err) + } + files, err := f.Readdir(0) + if err != nil { + return "", fmt.Errorf("Failed to readdir %s to move files: %v", tmpdir, err) + } + + d := fmt.Sprintf("%s_%s_%s_%s", year, author, title, id) + dir := path.Join(basedir, d) + err = os.MkdirAll(dir, 0755) + if err != nil { + return "", fmt.Errorf("Couldn't create directory %s: %v", dir, err) + } + + for _, v := range files { + orig := path.Join(tmpdir, v.Name()) + new := path.Join(dir, v.Name()) + err = moveFile(orig, new) + if err != nil { + return dir, fmt.Errorf("Failed to move %s to %s: %v", orig, new, err) + } + } + + err = os.Remove(tmpdir) + if err != nil { + return dir, fmt.Errorf("Failed to remove temporary directory %s: %v", tmpdir, err) + } + + return dir, nil +} + +// getBookIdFromUrl returns a 12 character Google Book ID from +// a Google URL, or an error if one can't be found. 
// Example URLs:
// https://books.google.it/books?id=QjQepCuN8JYC
// https://www.google.it/books/edition/_/VJbr-Oe2au0C
//
// A string which is exactly 12 characters long and contains
// none of "?/:" is taken to already be a book ID and is
// returned unchanged. The scheme and host are matched without
// regard to case, and the ID is always returned exactly as it
// was written in url. Text taken from a URL is rejected if it
// contains a URL delimiter, as that means the ID in the URL
// was shorter than 12 characters.
func getBookIdFromUrl(url string) (string, error) {
	if len(url) == bookIDLength && !strings.ContainsAny(url, "?/:") {
		return url, nil
	}

	// Offsets found in lurl are used to slice url, so the two
	// must have identical byte lengths. strings.ToLower does
	// not guarantee that for non-ASCII text.
	lurl := lowerASCII(url)

	start := -1
	switch {
	case strings.HasPrefix(lurl, "https://books.google"):
		const key = "?id="
		start = strings.Index(lurl, key)
		if start == -1 {
			start = strings.Index(lurl, "&id=")
		}
		if start >= 0 {
			start += len(key)
		}
	case gbookEditionURL.MatchString(lurl):
		const marker = "edition/_/"
		start = strings.Index(lurl, marker)
		if start >= 0 {
			start += len(marker)
		}
	}

	if start < 0 || len(url)-start < bookIDLength {
		return "", fmt.Errorf("Could not find book ID in URL")
	}

	id := url[start : start+bookIDLength]
	if strings.ContainsAny(id, "?/:&=#") {
		// we have run past the end of a too-short ID and into
		// the rest of the URL
		return "", fmt.Errorf("Could not find book ID in URL")
	}

	return id, nil
}

// bookIDLength is the length in bytes of a Google Book ID.
const bookIDLength = 12

// gbookEditionURL matches the start of a lowercased
// https://www.google.TLD/books/ URL. It is compiled once
// here rather than on every call to getBookIdFromUrl.
var gbookEditionURL = regexp.MustCompile(`^https://www\.google\.[^/]*/books/`)

// lowerASCII returns s with the ASCII letters A-Z lowercased
// and every other byte left untouched, so the result always
// has the same byte length as s.
func lowerASCII(s string) string {
	b := []byte(s)
	for i, c := range b {
		if 'A' <= c && c <= 'Z' {
			b[i] = c + ('a' - 'A')
		}
	}
	return string(b)
}