summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2021-11-23 15:59:06 +0000
committerNick White <git@njw.name>2021-11-23 15:59:06 +0000
commit3230f43a88729440ae7408b5b2914b186be95a84 (patch)
treeaeac5810c94e1a880691ebc64566f4f444450c48
parentecc7e3e9e49d58c9bc22784a96279c4d5ba814a2 (diff)
rescribe: Improve pdf consumption by ensuring only jpg or png are saved to upload
-rw-r--r--cmd/rescribe/main.go49
1 files changed, 47 insertions, 2 deletions
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index cd0b955..0724c88 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -15,6 +15,8 @@ import (
_ "embed"
"flag"
"fmt"
+ "image/jpeg"
+ "image/png"
"io"
"io/ioutil"
"log"
@@ -325,7 +327,8 @@ func extractPdfImgs(path string) (string, error) {
}
fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum)
- w, err := os.Create(filepath.Join(tempdir, fn))
+ path := filepath.Join(tempdir, fn)
+ w, err := os.Create(path)
defer w.Close()
if err != nil {
return tempdir, fmt.Errorf("Error creating file to extract PDF image: %v\n", err)
@@ -339,7 +342,10 @@ func extractPdfImgs(path string) (string, error) {
w.Close()
r.Close()
- // TODO: check that what we've written is actually a JPEG
+ err = rmIfNotImage(path)
+ if err != nil {
+ return tempdir, fmt.Errorf("Error removing extracted image %s from PDF: %v\n", fn, err)
+ }
}
}
// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
@@ -347,6 +353,45 @@ func extractPdfImgs(path string) (string, error) {
return tempdir, nil
}
+// rmIfNotImage attempts to decode a given file as an image. If it is
+// decode-able as PNG, then rename file extension from .jpg to .png,
+// if it fails to be read as PNG or JPEG it will be deleted.
+func rmIfNotImage(f string) error {
+ r, err := os.Open(f)
+ defer r.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to open image %s: %v\n", f, err)
+ }
+ _, err = png.Decode(r)
+ r.Close()
+ if err == nil {
+ b := strings.TrimSuffix(f, ".jpg")
+ fmt.Printf("%s is PNG; renaming\n", f)
+ err = os.Rename(f, b + ".png")
+ if err != nil {
+ return fmt.Errorf("Error renaming %s to %s: %v", f, b + ".png", err)
+ }
+ return nil
+ }
+
+ r, err = os.Open(f)
+ defer r.Close()
+ if err != nil {
+ return fmt.Errorf("Failed to open image %s: %v\n", f, err)
+ }
+ _, err = jpeg.Decode(r)
+ if err != nil {
+ r.Close()
+ fmt.Printf("%s is not PNG or JPEG; removing\n", f)
+ err = os.Remove(f)
+ if err != nil {
+ return fmt.Errorf("Failed to remove invalid image %s: %v", f, err)
+ }
+ }
+
+ return nil
+}
+
func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {
_, err := exec.Command(tessCommand, "--help").Output()
if err != nil {