diff options
author | Nick White <git@njw.name> | 2021-08-09 15:51:50 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2021-08-09 15:51:50 +0100 |
commit | 69eeb41a33f6a764fc6baf1a95e629a6482b67ea (patch) | |
tree | 0e28eef52d0eef3405c16d9ee853e43977e33c4f | |
parent | 1f2a05e466c195dde83effd82c96d4329259d249 (diff) |
pdf: significantly improve character coordinates
A few good changes to make word coordinate lookups significantly
more accurate:
- Set font size dynamically based on the line height (previously it was
fixed at size 10)
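- Correct the height and width of word boxes. Previously the right and
bottom coordinates were passed as the width and height, so the boxes
were far too large; that probably made little difference in the general
case, but the sizes are now computed as right minus left and bottom
minus top.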
- Set word box margin to zero
Also change PDF size to A5 paper, as that's closer to an average book page size.
-rw-r--r-- | pdf.go | 17 |
1 file changed, 12 insertions, 5 deletions
@@ -1,4 +1,4 @@ -// Copyright 2019 Nick White. +// Copyright 2021 Nick White. // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. @@ -22,7 +22,7 @@ import ( ) // TODO: maybe set this in Fpdf struct -const pageWidth = 5 // pageWidth in inches +const pageWidth = 5.8 // pageWidth in inches - 5.8" is A5 const scaleSmaller = 3 // amount the width and height are divided by // pxToPt converts a pixel value into a pt value (72 pts per inch) @@ -38,7 +38,7 @@ type Fpdf struct { // Setup creates a new PDF with appropriate settings and fonts func (p *Fpdf) Setup() error { - p.fpdf = gofpdf.New("P", "pt", "A4", "") + p.fpdf = gofpdf.New("P", "pt", "A5", "") // Even though it's invisible, we need to add a font which can do // UTF-8 so that text renders correctly. @@ -108,13 +108,20 @@ func (p *Fpdf) AddPage(imgpath, hocrpath string, smaller bool) error { p.fpdf.SetTextRenderingMode(3) for _, l := range h.Lines { + coords, err := hocr.BoxCoords(l.Title) + if err != nil { + continue + } + lineheight := coords[3] - coords[1] for _, w := range l.Words { - coords, err := hocr.BoxCoords(w.Title) + coords, err = hocr.BoxCoords(w.Title) if err != nil { continue } p.fpdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1])) - p.fpdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), html.UnescapeString(w.Text)+" ", "", 0, "T", false, 0, "") + p.fpdf.SetCellMargin(0) + p.fpdf.SetFontSize(pxToPt(lineheight)) + p.fpdf.CellFormat(pxToPt(coords[2] - coords[0]), pxToPt(coords[3] - coords[1]), html.UnescapeString(w.Text)+" ", "", 0, "T", false, 0, "") } } return p.fpdf.Error() |