From 459e7939910213035a432ee3ce7986b6d48c604b Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Thu, 31 Oct 2019 10:41:23 +0000
Subject: Add work in progress PDF producer

---
 cmd/pdfbook/main.go | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 cmd/pdfbook/main.go

diff --git a/cmd/pdfbook/main.go b/cmd/pdfbook/main.go
new file mode 100644
index 0000000..09202e0
--- /dev/null
+++ b/cmd/pdfbook/main.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"rescribe.xyz/gofpdf"
+	"rescribe.xyz/utils/pkg/hocr"
+)
+
+// pxToPt converts a length in pixels to points by dividing it by 5; experimental, see notebook for rationale
+func pxToPt(i int) float64 {
+	return float64(i) / 5.0
+}
+
+// lineText joins the text of each word in l with single spaces. TODO: handle cases of OcrLine being where the text is, and OcrChar being where the text is
+func lineText(l hocr.OcrLine) string {
+	var b strings.Builder // avoids re-copying the accumulated text for every word, as string += does
+	for _, w := range l.Words {
+		if b.Len() > 0 { // no separator is written while the result is still empty
+			b.WriteByte(' ')
+		}
+		b.WriteString(w.Text)
+	}
+	return b.String()
+}
+
+// walker returns a filepath.WalkFunc that adds a page to pdf for every .hocr file it visits, with one text cell per hOCR line.
+func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc {
+	return func(path string, info os.FileInfo, err error) error {
+		if err != nil { // Walk could not stat or list path; info may be nil here, so stop before using it
+			return err
+		}
+		if info.IsDir() || !strings.HasSuffix(path, ".hocr") {
+			return nil
+		}
+		file, err := ioutil.ReadFile(path)
+		if err != nil {
+			return err
+		}
+		h, err := hocr.Parse(file)
+		if err != nil {
+			return fmt.Errorf("parsing %s: %v", path, err)
+		}
+		// TODO: get page dimensions from image dimensions
+		pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(1414), Ht: pxToPt(2500)})
+		//pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible)
+		// TODO: add page image
+		for _, l := range h.Lines {
+			coords, err := hocr.BoxCoords(l.Title) // presumably the hOCR bbox "x0 y0 x1 y1"; verify against hocr.BoxCoords
+			if err != nil {
+				return fmt.Errorf("parsing line coordinates in %s: %v", path, err)
+			}
+			pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1]))
+			// TODO: html escape text
+			pdf.CellFormat(pxToPt(coords[2]-coords[0]), pxToPt(coords[3]-coords[1]), hocr.LineText(l), "", 0, "T", false, 0, "")
+		}
+		return nil
+	}
+}
+
+// main takes two arguments, hocrdir and out.pdf: it walks hocrdir with walker to fill a PDF
+// and saves that PDF to out.pdf. A failed walk or save is logged and ends the program.
+func main() {
+	// TODO: handle best
+	// TODO: take flags to do colour or binarised
+	// TODO: probably also take flags to resize / change quality in due course
+	flag.Usage = func() {
+		fmt.Fprintln(flag.CommandLine.Output(), "Usage: pdfbook hocrdir out.pdf")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	if flag.NArg() != 2 {
+		flag.Usage()
+		os.Exit(1) // a wrong argument count is an error, so don't report success to the caller
+	}
+
+	// TODO: this will go in pdf.go in due course, potentially with a
+	//       type which covers gofpdf.Fpdf, and an interface, so that
+	//       the backend can be switched out like aws.go
+	pdf := gofpdf.New("P", "pt", "A4", "")
+	// Even though it's invisible, we need to add a font which can do UTF-8 so text is correctly rendered
+	// TODO: find a font that's closer to the average dimensions of the
+	//       text we're dealing with, and put it somewhere sensible
+	pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf")
+	pdf.SetFont("dejavu", "", 10)
+	pdf.SetAutoPageBreak(false, float64(0))
+
+	if err := filepath.Walk(flag.Arg(0), walker(pdf)); err != nil {
+		log.Fatalln("Failed to walk", flag.Arg(0), err)
+	}
+
+	if err := pdf.OutputFileAndClose(flag.Arg(1)); err != nil {
+		log.Fatalln("Failed to save", flag.Arg(1), err)
+	}
+}
-- 
cgit v1.2.1-24-ge1ad