summaryrefslogtreecommitdiff
path: root/cmd/pdfbook/main.go
blob: 09202e0badb03aa543a86f407b026bf005bd9d91 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"strings"

	"rescribe.xyz/gofpdf"
	"rescribe.xyz/utils/pkg/hocr"
)

// pxToPt converts a measurement in pixels to PDF points by dividing it
// by a fixed factor of 5. The factor is experimental; see notebook for
// the rationale.
func pxToPt(i int) float64 {
	const pxPerPt = 5
	px := float64(i)
	return px / pxPerPt
}

// lineText returns the text of every word in l joined into one string.
// A single space is written before a word only once some text has
// already been accumulated, so the result never starts with a space.
func lineText(l hocr.OcrLine) string {
	// TODO: handle cases of OcrLine being where the text is, and OcrChar being where the text is
	var joined strings.Builder
	for _, word := range l.Words {
		if joined.Len() > 0 {
			joined.WriteByte(' ')
		}
		joined.WriteString(word.Text)
	}
	return joined.String()
}

// walker returns a filepath.WalkFunc which adds one page to pdf for each
// visited file whose path ends in ".hocr", writing the text of every
// line of the parsed hOCR into a cell positioned from that line's box
// coordinates. Directories and files with any other suffix are skipped.
//
// An error reported by the walk itself, or raised while reading or
// parsing a file or its line coordinates, stops the walk and is returned
// with the offending path included in its message.
func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc {
	return func(path string, info os.FileInfo, err error) error {
		// filepath.Walk passes a non-nil err when it failed to stat or
		// list path, and info may be nil in that case, so err has to be
		// checked before info is used.
		if err != nil {
			return fmt.Errorf("walking %s: %v", path, err)
		}
		if info.IsDir() {
			return nil
		}
		if !strings.HasSuffix(path, ".hocr") {
			return nil
		}
		file, err := ioutil.ReadFile(path)
		if err != nil {
			return fmt.Errorf("reading %s: %v", path, err)
		}
		h, err := hocr.Parse(file)
		if err != nil {
			return fmt.Errorf("parsing %s: %v", path, err)
		}
		// TODO: get page dimensions from image dimensions
		pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(1414), Ht: pxToPt(2500)})
		//pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible)
		// TODO: add page image
		for _, l := range h.Lines {
			coords, err := hocr.BoxCoords(l.Title)
			if err != nil {
				return fmt.Errorf("getting line coordinates in %s: %v", path, err)
			}
			pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1]))
			// NOTE(review): coords[2] and coords[3] are passed as the
			// cell width and height. If BoxCoords returns the hOCR bbox
			// corners (x0 y0 x1 y1) these presumably ought to be
			// coords[2]-coords[0] and coords[3]-coords[1] - confirm.
			// TODO: html escape text
			pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), hocr.LineText(l), "", 0, "T", false, 0, "")
		}
		return nil
	}
}

// main expects exactly two positional arguments, a directory of hOCR
// files and an output path. It walks the directory adding pages to a
// new PDF with walker, then saves the PDF to the output path. With any
// other number of arguments it prints the usage message and returns.
func main() {
	// TODO: handle best
	// TODO: take flags to do colour or binarised
	// TODO: probably also take flags to resize / change quality in due course
	flag.Usage = func() {
		out := flag.CommandLine.Output()
		fmt.Fprintln(out, "Usage: pdfbook hocrdir out.pdf")
		flag.PrintDefaults()
	}
	flag.Parse()

	if flag.NArg() != 2 {
		flag.Usage()
		return
	}
	hocrDir := flag.Arg(0)
	outPath := flag.Arg(1)

	// TODO: this will go in pdf.go in due course, potentially with a
	//       type which covers gofpdf.Fpdf, and an interface, so that
	//       the backend can be switched out like aws.go
	doc := gofpdf.New("P", "pt", "A4", "")
	// The text is meant to end up invisible, but a UTF-8 capable font is
	// still needed for it to be rendered correctly.
	// TODO: find a font that's closer to the average dimensions of the
	//       text we're dealing with, and put it somewhere sensible
	doc.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf")
	doc.SetFont("dejavu", "", 10)
	doc.SetAutoPageBreak(false, 0.0)

	if err := filepath.Walk(hocrDir, walker(doc)); err != nil {
		log.Fatalln("Failed to walk", hocrDir, err)
	}

	if err := doc.OutputFileAndClose(outPath); err != nil {
		log.Fatalln("Failed to save", outPath, err)
	}
}