summaryrefslogtreecommitdiff
path: root/cmd/pdfbook/main.go
blob: 45bbc4f637cf97c6ae3d9e9f84f92c63e0f150fc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
package main

import (
	"errors"
	"flag"
	"fmt"
	"html"
	"image"
	_ "image/jpeg"
	_ "image/png"
	"io/ioutil"
	"log"
	"os"
	"path"
	"path/filepath"
	"strings"

	"rescribe.xyz/gofpdf"
	"rescribe.xyz/utils/pkg/hocr"
)

// pageWidth is the divisor pxToPt applies to pixel measurements.
// NOTE(review): described as the page width in inches; confirm the
// intended unit, as pxToPt simply divides by this value.
const pageWidth = 5

// pxToPt scales a pixel measurement to the value used for positions
// and sizes in the PDF, by dividing it by pageWidth.
func pxToPt(i int) float64 {
	px := float64(i)
	return px / pageWidth
}

// setupPdf builds the gofpdf document that pages are later added to.
// It is created with the "P", "pt" and "A4" settings, has the DejaVu
// Sans Condensed font selected at size 10, and has automatic page
// breaks switched off.
// TODO: this will go in pdf.go in due course
// TODO: find a font that's closer to the average dimensions of the
//       text we're dealing with, and put it somewhere sensible
func setupPdf() *gofpdf.Fpdf {
	doc := gofpdf.New("P", "pt", "A4", "")

	// The text layer is invisible, but a font capable of UTF-8 is
	// still needed for it to render correctly.
	const fontFamily = "dejavu"
	doc.AddUTF8Font(fontFamily, "", "DejaVuSansCondensed.ttf")
	doc.SetFont(fontFamily, "", 10)

	doc.SetAutoPageBreak(false, 0)
	return doc
}

// addPage adds a page to the pdf containing the image at imgpath, with
// the text of each line of the hocr file at hocrpath placed invisibly
// over it.
//
// The page and image are sized from the image's pixel dimensions via
// pxToPt. Lines whose bounding box cannot be parsed are skipped. An
// error is returned if the hocr file cannot be read or parsed, if the
// image cannot be opened or decoded, or if the pdf has recorded an
// error by the time the page has been built.
func addPage(pdf *gofpdf.Fpdf, imgpath string, hocrpath string) error {
	file, err := ioutil.ReadFile(hocrpath)
	if err != nil {
		return errors.New(fmt.Sprintf("Could not read file %s: %v", hocrpath, err))
	}
	// TODO: change hocr.Parse to take a Reader rather than []byte
	h, err := hocr.Parse(file)
	if err != nil {
		return errors.New(fmt.Sprintf("Could not parse hocr in file %s: %v", hocrpath, err))
	}

	f, err := os.Open(imgpath)
	if err != nil {
		return errors.New(fmt.Sprintf("Could not open file %s: %v", imgpath, err))
	}
	// Only schedule the Close once the Open is known to have succeeded.
	defer f.Close()

	img, _, err := image.Decode(f)
	if err != nil {
		return errors.New(fmt.Sprintf("Could not decode image: %v", err))
	}
	b := img.Bounds()
	width, height := pxToPt(b.Dx()), pxToPt(b.Dy())
	pdf.AddPageFormat("P", gofpdf.SizeType{Wd: width, Ht: height})

	_ = pdf.RegisterImageOptions(imgpath, gofpdf.ImageOptions{})
	pdf.ImageOptions(imgpath, 0, 0, width, height, false, gofpdf.ImageOptions{}, 0, "")

	pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible)

	for _, l := range h.Lines {
		coords, err := hocr.BoxCoords(l.Title)
		if err != nil {
			continue
		}
		// An hOCR bbox is "x0 y0 x1 y1" (two corners), whereas CellFormat
		// takes a width and height, so the cell size is the difference
		// between the corners rather than the second corner itself.
		x0, y0, x1, y1 := coords[0], coords[1], coords[2], coords[3]
		pdf.SetXY(pxToPt(x0), pxToPt(y0))
		pdf.CellFormat(pxToPt(x1-x0), pxToPt(y1-y0), html.UnescapeString(hocr.LineText(l)), "", 0, "T", false, 0, "")
	}

	// The gofpdf calls above do not return errors; any failure is held
	// in the pdf itself, so report it here with the page it relates to
	// rather than leaving it to surface only when the pdf is saved.
	if err := pdf.Error(); err != nil {
		return errors.New(fmt.Sprintf("Could not add page for %s to pdf: %v", imgpath, err))
	}
	return nil
}

// savePdf writes the pdf out to the file at path p and closes it,
// passing back any error gofpdf reports while doing so.
func savePdf(pdf *gofpdf.Fpdf, p string) error {
	if err := pdf.OutputFileAndClose(p); err != nil {
		return err
	}
	return nil
}

// walker returns a filepath.WalkFunc which adds a page to pdf for each
// file it is given whose name ends in ".hocr". Directories and other
// files are ignored.
//
// The image used for a page is chosen from the hocr file's name: if the
// base name contains "_bin", the part before it plus ".jpg" in the same
// directory is used; otherwise the hocr path with ".hocr" replaced by
// ".png" is used.
//
// Any error passed in by filepath.Walk, or returned by addPage, is
// returned and so stops the walk.
func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc {
	return func(fpath string, info os.FileInfo, err error) error {
		// filepath.Walk reports a failure to stat or read an entry through
		// err, and info can be nil when it does (for instance when the
		// root does not exist), so check it before touching info.
		if err != nil {
			return err
		}
		if info.IsDir() || !strings.HasSuffix(fpath, ".hocr") {
			return nil
		}
		// TODO: handle jpg or binarised versions according to a flag
		var imgpath string
		p := strings.SplitN(path.Base(fpath), "_bin", 2)
		if len(p) > 1 {
			imgpath = path.Join(path.Dir(fpath), p[0]+".jpg")
		} else {
			imgpath = strings.TrimSuffix(fpath, ".hocr") + ".png"
		}
		return addPage(pdf, imgpath, fpath)
	}
}

// main implements the pdfbook command: "pdfbook hocrdir out.pdf".
// It walks hocrdir adding a page to a new pdf for each hocr file found,
// then saves the pdf to out.pdf. A wrong number of arguments prints the
// usage message and exits with status 2; a failure while walking or
// saving is logged and exits with status 1 (via log.Fatalln).
func main() {
	// TODO: handle best
	// TODO: take flags to do colour or binarised
	// TODO: probably also take flags to resize / change quality in due course
	flag.Usage = func() {
		fmt.Fprintln(flag.CommandLine.Output(), "Usage: pdfbook hocrdir out.pdf")
		flag.PrintDefaults()
	}
	flag.Parse()

	if flag.NArg() != 2 {
		flag.Usage()
		// Exit non-zero so that a bad invocation is not mistaken for
		// success by a calling script.
		os.Exit(2)
	}

	pdf := setupPdf()

	if err := filepath.Walk(flag.Arg(0), walker(pdf)); err != nil {
		log.Fatalln("Failed to walk", flag.Arg(0), err)
	}

	if err := savePdf(pdf, flag.Arg(1)); err != nil {
		log.Fatalln("Failed to save", flag.Arg(1), err)
	}
}