summary refs log tree commit diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2021-08-09 15:51:50 +0100
committer Nick White <git@njw.name> 2021-08-09 15:51:50 +0100
commit 69eeb41a33f6a764fc6baf1a95e629a6482b67ea (patch)
tree 0e28eef52d0eef3405c16d9ee853e43977e33c4f
parent 1f2a05e466c195dde83effd82c96d4329259d249 (diff)
pdf: significantly improve character coordinates
A few good changes to make word coordinate lookups significantly more accurate:
- Set font size dynamically based on the line height (previously it was fixed as size 10)
- Correct height and width of word boxes (previously they were way too large, which probably didn't make a difference in the general case, but now they're correct)
- Set word box margin to zero

Also change PDF size to A5 paper, as that's closer to an average book page size.
-rw-r--r-- pdf.go 17
1 files changed, 12 insertions, 5 deletions
diff --git a/pdf.go b/pdf.go
index 64a8654..a0c6a63 100644
--- a/pdf.go
+++ b/pdf.go
@@ -1,4 +1,4 @@
-// Copyright 2019 Nick White.
+// Copyright 2021 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
@@ -22,7 +22,7 @@ import (
)
// TODO: maybe set this in Fpdf struct
-const pageWidth = 5 // pageWidth in inches
+const pageWidth = 5.8 // pageWidth in inches - 5.8" is A5
const scaleSmaller = 3 // amount the width and height are divided by
// pxToPt converts a pixel value into a pt value (72 pts per inch)
@@ -38,7 +38,7 @@ type Fpdf struct {
// Setup creates a new PDF with appropriate settings and fonts
func (p *Fpdf) Setup() error {
- p.fpdf = gofpdf.New("P", "pt", "A4", "")
+ p.fpdf = gofpdf.New("P", "pt", "A5", "")
// Even though it's invisible, we need to add a font which can do
// UTF-8 so that text renders correctly.
@@ -108,13 +108,20 @@ func (p *Fpdf) AddPage(imgpath, hocrpath string, smaller bool) error {
p.fpdf.SetTextRenderingMode(3)
for _, l := range h.Lines {
+ coords, err := hocr.BoxCoords(l.Title)
+ if err != nil {
+ continue
+ }
+ lineheight := coords[3] - coords[1]
for _, w := range l.Words {
- coords, err := hocr.BoxCoords(w.Title)
+ coords, err = hocr.BoxCoords(w.Title)
if err != nil {
continue
}
p.fpdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1]))
- p.fpdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), html.UnescapeString(w.Text)+" ", "", 0, "T", false, 0, "")
+ p.fpdf.SetCellMargin(0)
+ p.fpdf.SetFontSize(pxToPt(lineheight))
+ p.fpdf.CellFormat(pxToPt(coords[2] - coords[0]), pxToPt(coords[3] - coords[1]), html.UnescapeString(w.Text)+" ", "", 0, "T", false, 0, "")
}
}
return p.fpdf.Error()