summaryrefslogtreecommitdiff
path: root/cmd
diff options
context:
space:
mode:
authorNick White <git@njw.name>2019-10-31 10:41:23 +0000
committerNick White <git@njw.name>2019-10-31 10:41:23 +0000
commit459e7939910213035a432ee3ce7986b6d48c604b (patch)
tree1c30db40a93d765539f57a65ae10382df8e8b56b /cmd
parent9789666a4f6d30b07ce0c1ec3b06987c5a920d7b (diff)
Add work in progress PDF producer
Diffstat (limited to 'cmd')
-rw-r--r--cmd/pdfbook/main.go102
1 files changed, 102 insertions, 0 deletions
diff --git a/cmd/pdfbook/main.go b/cmd/pdfbook/main.go
new file mode 100644
index 0000000..09202e0
--- /dev/null
+++ b/cmd/pdfbook/main.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "rescribe.xyz/gofpdf"
+ "rescribe.xyz/utils/pkg/hocr"
+)
+
+// pxToPt converts a length in pixels to the unit used for PDF coordinates
+// by dividing it by a fixed factor of 5. The factor is experimental; see
+// notebook for rationale.
+func pxToPt(i int) float64 {
+	const pxPerPt = 5
+	px := float64(i)
+	return px / pxPerPt
+}
+
+// lineText returns the Text of every word in l, joined by single spaces.
+// A separator is only written once some text has been accumulated, so
+// leading words with empty Text do not produce leading spaces.
+func lineText(l hocr.OcrLine) string {
+	// TODO: handle cases of OcrLine being where the text is, and OcrChar being where the text is
+	// A strings.Builder avoids re-copying the accumulated text for every
+	// word, which repeated string concatenation would do.
+	var b strings.Builder
+	for _, w := range l.Words {
+		if b.Len() > 0 {
+			b.WriteByte(' ')
+		}
+		b.WriteString(w.Text)
+	}
+	return b.String()
+}
+
+// walker returns a filepath.WalkFunc which adds one page to pdf for each
+// file whose path ends in ".hocr", and writes the text of every line of
+// that hOCR file into a cell positioned at the line's bounding box.
+// Directories and files with any other suffix are skipped. The first
+// error encountered stops the walk and is returned with the path it
+// relates to.
+func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc {
+	return func(path string, info os.FileInfo, err error) error {
+		// filepath.Walk reports failures to stat a file or read a
+		// directory through err; info may be nil in that case, so it
+		// must be checked before info is used.
+		if err != nil {
+			return fmt.Errorf("walking %s: %v", path, err)
+		}
+		if info.IsDir() || !strings.HasSuffix(path, ".hocr") {
+			return nil
+		}
+		file, err := ioutil.ReadFile(path)
+		if err != nil {
+			return fmt.Errorf("reading %s: %v", path, err)
+		}
+		h, err := hocr.Parse(file)
+		if err != nil {
+			return fmt.Errorf("parsing %s: %v", path, err)
+		}
+		// TODO: get page dimensions from image dimensions
+		pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(1414), Ht: pxToPt(2500)})
+		//pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible)
+		// TODO: add page image
+		for _, l := range h.Lines {
+			coords, err := hocr.BoxCoords(l.Title)
+			if err != nil {
+				return fmt.Errorf("getting line coordinates in %s: %v", path, err)
+			}
+			pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1]))
+			// CellFormat takes a width and a height, not the far corner
+			// of the box, so the extents are computed from the corners.
+			// NOTE(review): assumes BoxCoords returns the hOCR bbox
+			// order x0, y0, x1, y1 - confirm against the hocr package.
+			width := pxToPt(coords[2] - coords[0])
+			height := pxToPt(coords[3] - coords[1])
+			// TODO: html escape text
+			pdf.CellFormat(width, height, hocr.LineText(l), "", 0, "T", false, 0, "")
+		}
+		return nil
+	}
+}
+
+// main expects two arguments: a directory to search for hOCR files and
+// the path of the PDF to write. It walks the directory with walker,
+// which adds a page per hOCR file, and then saves the PDF. If the
+// argument count is wrong it prints the usage message and returns.
+func main() {
+	// TODO: handle best
+	// TODO: take flags to do colour or binarised
+	// TODO: probably also take flags to resize / change quality in due course
+	flag.Usage = func() {
+		out := flag.CommandLine.Output()
+		fmt.Fprintln(out, "Usage: pdfbook hocrdir out.pdf")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) != 2 {
+		flag.Usage()
+		return
+	}
+	hocrDir, outPath := args[0], args[1]
+
+	// TODO: this will go in pdf.go in due course, potentially with a
+	//       type which covers gofpdf.Fpdf, and an interface, so that
+	//       the backend can be switched out like aws.go
+	doc := gofpdf.New("P", "pt", "A4", "")
+	// The text is meant to end up invisible, but a font which can do
+	// UTF-8 is still needed so that it is correctly rendered.
+	// TODO: find a font that's closer to the average dimensions of the
+	//       text we're dealing with, and put it somewhere sensible
+	doc.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf")
+	doc.SetFont("dejavu", "", 10)
+	doc.SetAutoPageBreak(false, 0)
+
+	if err := filepath.Walk(hocrDir, walker(doc)); err != nil {
+		log.Fatalln("Failed to walk", hocrDir, err)
+	}
+
+	if err := doc.OutputFileAndClose(outPath); err != nil {
+		log.Fatalln("Failed to save", outPath, err)
+	}
+}