summary refs log tree commit diff
path: root/cmd/pdfbook
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-10-31 11:47:31 +0000
committer Nick White <git@njw.name> 2019-10-31 11:47:31 +0000
commit 99ebbef45d19d14636049ec8d863abb38305c6c5 (patch)
tree 778874c497508539e3fe24c37c96b621c8688081 /cmd/pdfbook
parent 459e7939910213035a432ee3ce7986b6d48c604b (diff)
Many improvements to pdfbook; basically working now
Diffstat (limited to 'cmd/pdfbook')
-rw-r--r-- cmd/pdfbook/main.go 129
1 files changed, 82 insertions, 47 deletions
diff --git a/cmd/pdfbook/main.go b/cmd/pdfbook/main.go
index 09202e0..45bbc4f 100644
--- a/cmd/pdfbook/main.go
+++ b/cmd/pdfbook/main.go
@@ -1,11 +1,17 @@
package main
import (
+ "errors"
"flag"
"fmt"
+ "html"
+ "image"
+ _ "image/jpeg"
+ _ "image/png"
"io/ioutil"
"log"
"os"
+ "path"
"path/filepath"
"strings"
@@ -13,54 +19,92 @@ import (
"rescribe.xyz/utils/pkg/hocr"
)
-// see notebook for rationale; experimental
+const pageWidth = 5 // pageWidth in inches
+
+// pxToPt converts a pixel value into a pt value (72 pts per inch)
+// This uses pageWidth to determine the appropriate value
func pxToPt(i int) float64 {
- return float64(i) / 5
+ return float64(i) / pageWidth
}
-func lineText(l hocr.OcrLine) string {
- // TODO: handle cases of OcrLine being where the text is, and OcrChar being where the text is
- var t string
- for _, w := range l.Words {
- if len(t) > 0 {
- t += " "
+// setupPdf creates a new PDF with appropriate settings and fonts
+// TODO: this will go in pdf.go in due course
+// TODO: find a font that's closer to the average dimensions of the
+// text we're dealing with, and put it somewhere sensible
+func setupPdf() *gofpdf.Fpdf {
+ pdf := gofpdf.New("P", "pt", "A4", "")
+ // Even though it's invisible, we need to add a font which can do
+ // UTF-8 so that text renders correctly.
+ pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf")
+ pdf.SetFont("dejavu", "", 10)
+ pdf.SetAutoPageBreak(false, float64(0))
+ return pdf
+}
+
+// addPage adds a page to the pdf with an image and (invisible)
+// text from an hocr file
+func addPage(pdf *gofpdf.Fpdf, imgpath string, hocrpath string) error {
+ file, err := ioutil.ReadFile(hocrpath)
+ if err != nil {
+ return errors.New(fmt.Sprintf("Could not read file %s: %v", hocrpath, err))
+ }
+ // TODO: change hocr.Parse to take a Reader rather than []byte
+ h, err := hocr.Parse(file)
+ if err != nil {
+ return errors.New(fmt.Sprintf("Could not parse hocr in file %s: %v", hocrpath, err))
+ }
+
+ f, err := os.Open(imgpath)
+ defer f.Close()
+ if err != nil {
+ return errors.New(fmt.Sprintf("Could not open file %s: %v", imgpath, err))
+ }
+ img, _, err := image.Decode(f)
+ if err != nil {
+ return errors.New(fmt.Sprintf("Could not decode image: %v", err))
+ }
+ b := img.Bounds()
+ pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(b.Dx()), Ht: pxToPt(b.Dy())})
+
+ // TODO: check for errors in pdf as going through
+
+ _ = pdf.RegisterImageOptions(imgpath, gofpdf.ImageOptions{})
+ pdf.ImageOptions(imgpath, 0, 0, pxToPt(b.Dx()), pxToPt(b.Dy()), false, gofpdf.ImageOptions{}, 0, "")
+
+ pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible)
+
+ for _, l := range h.Lines {
+ coords, err := hocr.BoxCoords(l.Title)
+ if err != nil {
+ continue
}
- t += w.Text
+ pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1]))
+ pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), html.UnescapeString(hocr.LineText(l)), "", 0, "T", false, 0, "")
}
- return t
+ return nil
+}
+
+func savePdf(pdf *gofpdf.Fpdf, p string) error {
+ return pdf.OutputFileAndClose(p)
}
func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc {
- return func(path string, info os.FileInfo, err error) error {
+ return func(fpath string, info os.FileInfo, err error) error {
if info.IsDir() {
return nil
}
- if !strings.HasSuffix(path, ".hocr") {
+ if !strings.HasSuffix(fpath, ".hocr") {
return nil
}
- // TODO: have errors returned include the file path of the error
- file, err := ioutil.ReadFile(path)
- if err != nil {
- return err
- }
- h, err := hocr.Parse(file)
- if err != nil {
- return err
- }
- // TODO: get page dimensions from image dimensions
- pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(1414), Ht: pxToPt(2500)})
- //pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible)
- // TODO: add page image
- for _, l := range h.Lines {
- coords, err := hocr.BoxCoords(l.Title)
- if err != nil {
- return err
- }
- pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1]))
- // TODO: html escape text
- pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), hocr.LineText(l), "", 0, "T", false, 0, "")
+ // TODO: handle jpg or binarised versions according to a flag
+ imgpath := ""
+ p := strings.SplitN(path.Base(fpath), "_bin", 2)
+ if len(p) > 1 {
+ imgpath = path.Join(path.Dir(fpath), p[0] + ".jpg")
+ } else {
+ imgpath = strings.TrimSuffix(fpath, ".hocr") + ".png"
}
- return nil
+ return addPage(pdf, imgpath, fpath)
}
}
@@ -79,24 +123,15 @@ func main() {
return
}
- // TODO: this will go in pdf.go in due course, potentially with a
- // type which covers gofpdf.Fpdf, and an interface, so that
- // the backend can be switched out like aws.go
- pdf := gofpdf.New("P", "pt", "A4", "")
- // Even though it's invisible, we need to add a font which can do UTF-8 so text is correctly rendered
- // TODO: find a font that's closer to the average dimensions of the
- // text we're dealing with, and put it somewhere sensible
- pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf")
- pdf.SetFont("dejavu", "", 10)
- pdf.SetAutoPageBreak(false, float64(0))
+ pdf := setupPdf()
err := filepath.Walk(flag.Arg(0), walker(pdf))
if err != nil {
log.Fatalln("Failed to walk", flag.Arg(0), err)
- }
+ }
- err = pdf.OutputFileAndClose(flag.Arg(1))
+ err = savePdf(pdf, flag.Arg(1))
if err != nil {
log.Fatalln("Failed to save", flag.Arg(1), err)
- }
+ }
}