diff options
-rw-r--r-- | cmd/pdfbook/main.go | 85 | ||||
-rw-r--r-- | pdf.go | 90 |
2 files changed, 102 insertions, 73 deletions
diff --git a/cmd/pdfbook/main.go b/cmd/pdfbook/main.go index 45bbc4f..3146865 100644 --- a/cmd/pdfbook/main.go +++ b/cmd/pdfbook/main.go @@ -1,24 +1,23 @@ package main import ( - "errors" "flag" "fmt" - "html" - "image" - _ "image/jpeg" - _ "image/png" - "io/ioutil" "log" "os" "path" "path/filepath" "strings" - "rescribe.xyz/gofpdf" - "rescribe.xyz/utils/pkg/hocr" + "rescribe.xyz/bookpipeline" ) +type Pdfer interface { + Setup() error + AddPage(imgpath, hocrpath string) error + Save(path string) error +} + const pageWidth = 5 // pageWidth in inches // pxToPt converts a pixel value into a pt value (72 pts per inch) @@ -27,68 +26,7 @@ func pxToPt(i int) float64 { return float64(i) / pageWidth } -// setupPdf creates a new PDF with appropriate settings and fonts -// TODO: this will go in pdf.go in due course -// TODO: find a font that's closer to the average dimensions of the -// text we're dealing with, and put it somewhere sensible -func setupPdf() *gofpdf.Fpdf { - pdf := gofpdf.New("P", "pt", "A4", "") - // Even though it's invisible, we need to add a font which can do - // UTF-8 so that text renders correctly. - pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf") - pdf.SetFont("dejavu", "", 10) - pdf.SetAutoPageBreak(false, float64(0)) - return pdf -} - -// addPage adds a page to the pdf with an image and (invisible) -// text from an hocr file -func addPage(pdf *gofpdf.Fpdf, imgpath string, hocrpath string) error { - file, err := ioutil.ReadFile(hocrpath) - if err != nil { - return errors.New(fmt.Sprintf("Could not read file %s: %v", hocrpath, err)) - } - // TODO: change hocr.Parse to take a Reader rather than []byte - h, err := hocr.Parse(file) - if err != nil { - return errors.New(fmt.Sprintf("Could not parse hocr in file %s: %v", hocrpath, err)) - } - - f, err := os.Open(imgpath) - defer f.Close() - if err != nil { - return errors.New(fmt.Sprintf("Could not open file %s: %v", imgpath, err)) - } - img, _, err := image.Decode(f) - if err != nil { - return errors.New(fmt.Sprintf("Could not decode image: %v", err)) - } - b := img.Bounds() - pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(b.Dx()), Ht: pxToPt(b.Dy())}) - - // TODO: check for errors in pdf as going through - - _ = pdf.RegisterImageOptions(imgpath, gofpdf.ImageOptions{}) - pdf.ImageOptions(imgpath, 0, 0, pxToPt(b.Dx()), pxToPt(b.Dy()), false, gofpdf.ImageOptions{}, 0, "") - - pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible) - - for _, l := range h.Lines { - coords, err := hocr.BoxCoords(l.Title) - if err != nil { - continue - } - pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1])) - pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), html.UnescapeString(hocr.LineText(l)), "", 0, "T", false, 0, "") - } - return nil -} - -func savePdf(pdf *gofpdf.Fpdf, p string) error { - return pdf.OutputFileAndClose(p) -} - -func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc { +func walker(pdf Pdfer) filepath.WalkFunc { return func(fpath string, info os.FileInfo, err error) error { if info.IsDir() { return nil @@ -104,7 +42,7 @@ func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc { } else { imgpath = strings.TrimSuffix(fpath, ".hocr") + ".png" } - return addPage(pdf, imgpath, fpath) + return pdf.AddPage(imgpath, fpath) } } @@ -123,14 +61,15 @@ func main() { return } - pdf := setupPdf() + pdf := new(bookpipeline.Fpdf) + pdf.Setup() err := filepath.Walk(flag.Arg(0), walker(pdf)) if err != nil { log.Fatalln("Failed to walk", flag.Arg(0), err) } - err = savePdf(pdf, flag.Arg(1)) + err = pdf.Save(flag.Arg(1)) if err != nil { log.Fatalln("Failed to save", flag.Arg(1), err) } @@ -0,0 +1,90 @@ +package bookpipeline + +import ( + "errors" + "fmt" + "html" + "image" + _ "image/jpeg" + _ "image/png" + "io/ioutil" + "os" + + //"rescribe.xyz/gofpdf@addtextrenderingmode" + "rescribe.xyz/gofpdf" + "rescribe.xyz/utils/pkg/hocr" +) + +const pageWidth = 5 // pageWidth in inches + +// pxToPt converts a pixel value into a pt value (72 pts per inch) +// This uses pageWidth to determine the appropriate value +func pxToPt(i int) float64 { + return float64(i) / pageWidth +} + +type Fpdf struct { + fpdf *gofpdf.Fpdf +} + +// Setup creates a new PDF with appropriate settings and fonts +// TODO: find a font that's closer to the average dimensions of the +// text we're dealing with +// TODO: once we have a good font, embed it in the binary as bytes +func (p *Fpdf) Setup() error { + p.fpdf = gofpdf.New("P", "pt", "A4", "") + // Even though it's invisible, we need to add a font which can do + // UTF-8 so that text renders correctly. + p.fpdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf") + p.fpdf.SetFont("dejavu", "", 10) + p.fpdf.SetAutoPageBreak(false, float64(0)) + return p.fpdf.Error() +} + +// AddPage adds a page to the pdf with an image and (invisible) +// text from an hocr file +func (p *Fpdf) AddPage(imgpath, hocrpath string) error { + file, err := ioutil.ReadFile(hocrpath) + if err != nil { + return errors.New(fmt.Sprintf("Could not read file %s: %v", hocrpath, err)) + } + // TODO: change hocr.Parse to take a Reader rather than []byte + h, err := hocr.Parse(file) + if err != nil { + return errors.New(fmt.Sprintf("Could not parse hocr in file %s: %v", hocrpath, err)) + } + + f, err := os.Open(imgpath) + defer f.Close() + if err != nil { + return errors.New(fmt.Sprintf("Could not open file %s: %v", imgpath, err)) + } + img, _, err := image.Decode(f) + if err != nil { + return errors.New(fmt.Sprintf("Could not decode image: %v", err)) + } + b := img.Bounds() + p.fpdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(b.Dx()), Ht: pxToPt(b.Dy())}) + + // TODO: check for errors in pdf as going through + + _ = p.fpdf.RegisterImageOptions(imgpath, gofpdf.ImageOptions{}) + p.fpdf.ImageOptions(imgpath, 0, 0, pxToPt(b.Dx()), pxToPt(b.Dy()), false, gofpdf.ImageOptions{}, 0, "") + + p.fpdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible) + + for _, l := range h.Lines { + coords, err := hocr.BoxCoords(l.Title) + if err != nil { + continue + } + p.fpdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1])) + p.fpdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), html.UnescapeString(hocr.LineText(l)), "", 0, "T", false, 0, "") + } + return p.fpdf.Error() +} + +// Save saves the PDF to the file at path +func (p *Fpdf) Save(path string) error { + return p.fpdf.OutputFileAndClose(path) +} |