diff options
| author | Nick White <git@njw.name> | 2021-11-23 15:59:06 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2021-11-23 15:59:06 +0000 | 
| commit | 3230f43a88729440ae7408b5b2914b186be95a84 (patch) | |
| tree | aeac5810c94e1a880691ebc64566f4f444450c48 /cmd/rescribe | |
| parent | ecc7e3e9e49d58c9bc22784a96279c4d5ba814a2 (diff) | |
rescribe: Improve pdf consumption by ensuring only jpg or png are saved to upload
Diffstat (limited to 'cmd/rescribe')
| -rw-r--r-- | cmd/rescribe/main.go | 49 | 
1 file changed, 47 insertions, 2 deletions
```diff
diff --git a/cmd/rescribe/main.go b/cmd/rescribe/main.go
index cd0b955..0724c88 100644
--- a/cmd/rescribe/main.go
+++ b/cmd/rescribe/main.go
@@ -15,6 +15,8 @@ import (
 	_ "embed"
 	"flag"
 	"fmt"
+	"image/jpeg"
+	"image/png"
 	"io"
 	"io/ioutil"
 	"log"
@@ -325,7 +327,8 @@ func extractPdfImgs(path string) (string, error) {
 			}
 
 			fn := fmt.Sprintf("%s-%04d.jpg", k, pgnum)
-			w, err := os.Create(filepath.Join(tempdir, fn))
+			path := filepath.Join(tempdir, fn)
+			w, err := os.Create(path)
 			defer w.Close()
 			if err != nil {
 				return tempdir, fmt.Errorf("Error creating file to extract PDF image: %v\n", err)
@@ -339,7 +342,10 @@ func extractPdfImgs(path string) (string, error) {
 			w.Close()
 			r.Close()
 
-			// TODO: check that what we've written is actually a JPEG
+			err = rmIfNotImage(path)
+			if err != nil {
+				return tempdir, fmt.Errorf("Error removing extracted image %s from PDF: %v\n", fn, err)
+			}
 		}
 	}
 	// TODO: check for places where there are multiple images per page, and only keep largest ones where that's the case
@@ -347,6 +353,45 @@ func extractPdfImgs(path string) (string, error) {
 	return tempdir, nil
 }
 
+// rmIfNotImage attempts to decode a given file as an image. If it is
+// decode-able as PNG, then rename file extension from .jpg to .png,
+// if it fails to be read as PNG or JPEG it will be deleted.
+func rmIfNotImage(f string) error {
+	r, err := os.Open(f)
+	defer r.Close()
+	if err != nil {
+		return fmt.Errorf("Failed to open image %s: %v\n", f, err)
+	}
+	_, err = png.Decode(r)
+	r.Close()
+	if err == nil {
+		b := strings.TrimSuffix(f, ".jpg")
+		fmt.Printf("%s is PNG; renaming\n", f)
+		err = os.Rename(f, b + ".png")
+		if err != nil {
+			return fmt.Errorf("Error renaming %s to %s: %v", f, b + ".png", err)
+		}
+		return nil
+	}
+
+	r, err = os.Open(f)
+	defer r.Close()
+	if err != nil {
+		return fmt.Errorf("Failed to open image %s: %v\n", f, err)
+	}
+	_, err = jpeg.Decode(r)
+	if err != nil {
+		r.Close()
+		fmt.Printf("%s is not PNG or JPEG; removing\n", f)
+		err = os.Remove(f)
+		if err != nil {
+			return fmt.Errorf("Failed to remove invalid image %s: %v", f, err)
+		}
+	}
+
+	return nil
+}
+
 func startProcess(logger log.Logger, tessCommand string, bookdir string, bookname string, trainingName string, systess bool, savedir string, tessdir string) error {
 	_, err := exec.Command(tessCommand, "--help").Output()
 	if err != nil {
```
