1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
package bookpipeline
import (
"errors"
"fmt"
"html"
"image"
_ "image/jpeg"
_ "image/png"
"io/ioutil"
"os"
//"rescribe.xyz/gofpdf@addtextrenderingmode"
"rescribe.xyz/gofpdf"
"rescribe.xyz/utils/pkg/hocr"
)
// pageWidth is the divisor pxToPt applies to pixel measurements. This
// file describes it as the page width in inches.
const pageWidth = 5

// pxToPt turns a pixel measurement into the pt measurement used when
// laying out the PDF. The result is simply the pixel count divided by
// pageWidth, so the scale depends only on that constant.
func pxToPt(i int) float64 {
	px := float64(i)
	return px / pageWidth
}
// Fpdf wraps a gofpdf.Fpdf document; the methods defined on it in this
// file (Setup, AddPage, Save) all operate on that wrapped document.
type Fpdf struct {
// fpdf is the underlying gofpdf document. It is nil until Setup
// assigns a new document to it.
fpdf *gofpdf.Fpdf
}
// Setup initialises p with a fresh portrait A4 document measured in
// points, registers and selects the DejaVu Sans Condensed font at size
// 10, and switches automatic page breaks off. It returns the result of
// the document's Error method once that configuration has been applied.
// TODO: find a font that's closer to the average dimensions of the
// text we're dealing with
// TODO: once we have a good font, embed it in the binary as bytes
func (p *Fpdf) Setup() error {
	const fontFamily = "dejavu"

	p.fpdf = gofpdf.New("P", "pt", "A4", "")
	doc := p.fpdf

	// The text added to pages is rendered invisibly, but a font that
	// can handle UTF-8 is still required for it to render correctly.
	doc.AddUTF8Font(fontFamily, "", "DejaVuSansCondensed.ttf")
	doc.SetFont(fontFamily, "", 10)
	doc.SetAutoPageBreak(false, 0)

	return doc.Error()
}
// AddPage appends one page to the PDF. The image at imgpath is drawn
// over the whole page, and the text of every line in the hOCR file at
// hocrpath is placed on top of it in invisible text rendering mode.
//
// The page is sized from the image's pixel dimensions via pxToPt.
// Setup must have been called first; if it has not, an error is
// returned instead of dereferencing a nil document. Lines whose
// bounding box cannot be parsed are skipped. Errors from reading or
// parsing either file wrap the underlying error (so errors.Is and
// errors.As work on them), and whatever error the PDF document itself
// has recorded is returned at the end.
func (p *Fpdf) AddPage(imgpath, hocrpath string) error {
	if p.fpdf == nil {
		return errors.New("Could not add page: Setup has not been called")
	}

	file, err := ioutil.ReadFile(hocrpath)
	if err != nil {
		return fmt.Errorf("Could not read file %s: %w", hocrpath, err)
	}
	// TODO: change hocr.Parse to take a Reader rather than []byte
	h, err := hocr.Parse(file)
	if err != nil {
		return fmt.Errorf("Could not parse hocr in file %s: %w", hocrpath, err)
	}

	f, err := os.Open(imgpath)
	if err != nil {
		return fmt.Errorf("Could not open file %s: %w", imgpath, err)
	}
	// Defer the close only once the open is known to have succeeded.
	defer f.Close()

	// Only the dimensions are needed here, so read the image header
	// rather than decoding every pixel of the page.
	cfg, _, err := image.DecodeConfig(f)
	if err != nil {
		return fmt.Errorf("Could not decode image: %w", err)
	}
	pageW, pageH := pxToPt(cfg.Width), pxToPt(cfg.Height)

	p.fpdf.AddPageFormat("P", gofpdf.SizeType{Wd: pageW, Ht: pageH})
	// TODO: check for errors in pdf as going through
	_ = p.fpdf.RegisterImageOptions(imgpath, gofpdf.ImageOptions{})
	p.fpdf.ImageOptions(imgpath, 0, 0, pageW, pageH, false, gofpdf.ImageOptions{}, 0, "")

	p.fpdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible)

	for _, l := range h.Lines {
		coords, err := hocr.BoxCoords(l.Title)
		if err != nil {
			// Best effort: a line without a usable bounding box is
			// left out of the text layer rather than failing the page.
			continue
		}
		// An hOCR bbox is "x0 y0 x1 y1" (two corners); this assumes
		// BoxCoords returns the values in that order. CellFormat wants
		// a width and a height, so take the differences rather than
		// passing the bottom-right corner through unchanged.
		x0, y0, x1, y1 := coords[0], coords[1], coords[2], coords[3]
		p.fpdf.SetXY(pxToPt(x0), pxToPt(y0))
		p.fpdf.CellFormat(pxToPt(x1-x0), pxToPt(y1-y0), html.UnescapeString(hocr.LineText(l)), "", 0, "T", false, 0, "")
	}

	return p.fpdf.Error()
}
// Save writes the PDF to the file at path by calling
// OutputFileAndClose on the underlying document, and hands back the
// error that call returns.
func (p *Fpdf) Save(path string) error {
	err := p.fpdf.OutputFileAndClose(path)
	return err
}
|