diff options
| author | Nick White <git@njw.name> | 2019-10-31 11:47:31 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-10-31 11:47:31 +0000 | 
| commit | 99ebbef45d19d14636049ec8d863abb38305c6c5 (patch) | |
| tree | 778874c497508539e3fe24c37c96b621c8688081 /cmd/pdfbook | |
| parent | 459e7939910213035a432ee3ce7986b6d48c604b (diff) | |
Many improvements to pdfbook; basically working now
Diffstat (limited to 'cmd/pdfbook')
| -rw-r--r-- | cmd/pdfbook/main.go | 129 | 
1 files changed, 82 insertions, 47 deletions
| diff --git a/cmd/pdfbook/main.go b/cmd/pdfbook/main.go index 09202e0..45bbc4f 100644 --- a/cmd/pdfbook/main.go +++ b/cmd/pdfbook/main.go @@ -1,11 +1,17 @@  package main  import ( +	"errors"  	"flag"  	"fmt" +	"html" +	"image" +	_ "image/jpeg" +	_ "image/png"  	"io/ioutil"  	"log"  	"os" +	"path"  	"path/filepath"  	"strings" @@ -13,54 +19,92 @@ import (  	"rescribe.xyz/utils/pkg/hocr"  ) -// see notebook for rationale; experimental +const pageWidth = 5 // pageWidth in inches + +// pxToPt converts a pixel value into a pt value (72 pts per inch) +// This uses pageWidth to determine the appropriate value  func pxToPt(i int) float64 { -	return float64(i) / 5 +	return float64(i) / pageWidth  } -func lineText(l hocr.OcrLine) string { -	// TODO: handle cases of OcrLine being where the text is, and OcrChar being where the text is -	var t string -	for _, w := range l.Words { -		if len(t) > 0 { -			t += " " +// setupPdf creates a new PDF with appropriate settings and fonts +// TODO: this will go in pdf.go in due course +// TODO: find a font that's closer to the average dimensions of the +//       text we're dealing with, and put it somewhere sensible +func setupPdf() *gofpdf.Fpdf { +	pdf := gofpdf.New("P", "pt", "A4", "") +	// Even though it's invisible, we need to add a font which can do +	// UTF-8 so that text renders correctly. +	pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf") +	pdf.SetFont("dejavu", "", 10) +	pdf.SetAutoPageBreak(false, float64(0)) +	return pdf +} + +// addPage adds a page to the pdf with an image and (invisible) +// text from an hocr file +func addPage(pdf *gofpdf.Fpdf, imgpath string, hocrpath string) error { +	file, err := ioutil.ReadFile(hocrpath) +	if err != nil { +		return errors.New(fmt.Sprintf("Could not read file %s: %v", hocrpath, err)) +	} +	// TODO: change hocr.Parse to take a Reader rather than []byte +	h, err := hocr.Parse(file) +	if err != nil { +		return errors.New(fmt.Sprintf("Could not parse hocr in file %s: %v", hocrpath, err)) +	} + +	f, err := os.Open(imgpath) +	defer f.Close() +	if err != nil { +		return errors.New(fmt.Sprintf("Could not open file %s: %v", imgpath, err)) +	} +	img, _, err := image.Decode(f) +	if err != nil { +		return errors.New(fmt.Sprintf("Could not decode image: %v", err)) +	} +	b := img.Bounds() +	pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(b.Dx()), Ht: pxToPt(b.Dy())}) + +	// TODO: check for errors in pdf as going through + +	_ = pdf.RegisterImageOptions(imgpath, gofpdf.ImageOptions{}) +	pdf.ImageOptions(imgpath, 0, 0, pxToPt(b.Dx()), pxToPt(b.Dy()), false, gofpdf.ImageOptions{}, 0, "") + +	pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible) + +	for _, l := range h.Lines { +		coords, err := hocr.BoxCoords(l.Title) +		if err != nil { +			continue  		} -		t += w.Text +		pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1])) +		pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), html.UnescapeString(hocr.LineText(l)), "", 0, "T", false, 0, "")  	} -	return t +	return nil +} + +func savePdf(pdf *gofpdf.Fpdf, p string) error { +	return pdf.OutputFileAndClose(p)  }  func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc { -	return func(path string, info os.FileInfo, err error) error { +	return func(fpath string, info os.FileInfo, err error) error {  		if info.IsDir() {  			return nil  		} -		if !strings.HasSuffix(path, ".hocr") { +		if !strings.HasSuffix(fpath, ".hocr") {  			return nil  		} -		// TODO: have errors returned include the file path of the error -		file, err := ioutil.ReadFile(path) -		if err != nil { -			return err -		} -		h, err := hocr.Parse(file) -		if err != nil { -			return err -		} -		// TODO: get page dimensions from image dimensions -		pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(1414), Ht: pxToPt(2500)}) -		//pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible) -		// TODO: add page image -		for _, l := range h.Lines { -			coords, err := hocr.BoxCoords(l.Title) -			if err != nil { -				return err -			} -			pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1])) -			// TODO: html escape text -			pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), hocr.LineText(l), "", 0, "T", false, 0, "") +		// TODO: handle jpg or binarised versions according to a flag +		imgpath := "" +		p := strings.SplitN(path.Base(fpath), "_bin", 2) +		if len(p) > 1 { +			imgpath = path.Join(path.Dir(fpath), p[0] + ".jpg") +		} else { +			imgpath = strings.TrimSuffix(fpath, ".hocr") + ".png"  		} -		return nil +		return addPage(pdf, imgpath, fpath)  	}  } @@ -79,24 +123,15 @@ func main() {  		return  	} -	// TODO: this will go in pdf.go in due course, potentially with a -	//       type which covers gofpdf.Fpdf, and an interface, so that -	//       the backend can be switched out like aws.go -	pdf := gofpdf.New("P", "pt", "A4", "") -	// Even though it's invisible, we need to add a font which can do UTF-8 so text is correctly rendered -	// TODO: find a font that's closer to the average dimensions of the -	//       text we're dealing with, and put it somewhere sensible -	pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf") -	pdf.SetFont("dejavu", "", 10) -	pdf.SetAutoPageBreak(false, float64(0)) +	pdf := setupPdf()  	err := filepath.Walk(flag.Arg(0), walker(pdf))  	if err != nil {  		log.Fatalln("Failed to walk", flag.Arg(0), err) -        } +	} -	err = pdf.OutputFileAndClose(flag.Arg(1)) +	err = savePdf(pdf, flag.Arg(1))  	if err != nil {  		log.Fatalln("Failed to save", flag.Arg(1), err) -        } +	}  } | 
