summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2021-08-17 13:39:09 +0100
committer Nick White <git@njw.name> 2021-08-17 13:39:09 +0100
commit 767b60db23311adaf1035e821bc189877d63b7f0 (patch)
tree 16f20e400c2c259488622a98b71d199cad4f62e7
parent 48c68cfa7f18992b26765c7b67c52c11687ad74a (diff)
pdf: Stretch words to fit in their boxes, for more perfect embedding
- Words are stretched to fit their boxes, which means the accuracy is now very high indeed. This was done by modifying gofpdf to add the SetCellStretchToFit function, which will hopefully be upstreamed in due course.
- Copy-pasting from a PDF works well, with lines rarely if ever being erroneously broken by the PDF reader. It took quite a bit of trial and error to improve this; stretching the text and adding a space after each word in CellFormat gave the best results (and also preserves the accuracy of word and character locations).
-rw-r--r-- go.mod 2
-rw-r--r-- go.sum 4
-rw-r--r-- pdf.go 15
3 files changed, 14 insertions, 7 deletions
diff --git a/go.mod b/go.mod
index 796618d..ec8afcc 100644
--- a/go.mod
+++ b/go.mod
@@ -4,7 +4,7 @@ go 1.16
require (
github.com/aws/aws-sdk-go v1.40.6
- github.com/phpdave11/gofpdf v1.4.2
+ github.com/nickjwhite/gofpdf v1.12.7-0.20210817123627-3cbaeb9797ef
github.com/wcharczuk/go-chart/v2 v2.1.0
golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d
rescribe.xyz/preproc v0.4.2
diff --git a/go.sum b/go.sum
index e802e09..45867e6 100644
--- a/go.sum
+++ b/go.sum
@@ -10,8 +10,8 @@ github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHW
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
-github.com/phpdave11/gofpdf v1.4.2 h1:KPKiIbfwbvC/wOncwhrpRdXVj2CZTCFlw4wnoyjtHfQ=
-github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY=
+github.com/nickjwhite/gofpdf v1.12.7-0.20210817123627-3cbaeb9797ef h1:Pq7OiIQ0gWrNQr12kRIvAH7qEr3i6hfSfd0smfwGgC0=
+github.com/nickjwhite/gofpdf v1.12.7-0.20210817123627-3cbaeb9797ef/go.mod h1:HHhz41M4FT6ogJi7KMknlu/sEJd6halot657/rJwzVM=
github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
diff --git a/pdf.go b/pdf.go
index be7f8a9..eb5f021 100644
--- a/pdf.go
+++ b/pdf.go
@@ -16,7 +16,8 @@ import (
"io/ioutil"
"os"
- "github.com/phpdave11/gofpdf"
+ //"github.com/phpdave11/gofpdf"
+ "github.com/nickjwhite/gofpdf" // adds SetCellStretchToFit function
"golang.org/x/image/draw"
"rescribe.xyz/utils/pkg/hocr"
)
@@ -112,7 +113,7 @@ func (p *Fpdf) AddPage(imgpath, hocrpath string, smaller bool) error {
if err != nil {
continue
}
- lineheight := linecoords[3] - linecoords[1]
+ lineheight := pxToPt(linecoords[3] - linecoords[1])
for _, w := range l.Words {
coords, err := hocr.BoxCoords(w.Title)
if err != nil {
@@ -120,8 +121,14 @@ func (p *Fpdf) AddPage(imgpath, hocrpath string, smaller bool) error {
}
p.fpdf.SetXY(pxToPt(coords[0]), pxToPt(linecoords[1]))
p.fpdf.SetCellMargin(0)
- p.fpdf.SetFontSize(pxToPt(lineheight))
- p.fpdf.CellFormat(pxToPt(coords[2] - coords[0]), pxToPt(lineheight), html.UnescapeString(w.Text)+" ", "", 0, "T", false, 0, "")
+ p.fpdf.SetFontSize(lineheight)
+ cellW := pxToPt(coords[2] - coords[0])
+ cellText := html.UnescapeString(w.Text)
+ p.fpdf.SetCellStretchToFit(cellW, cellText)
+ // Adding a space after each word causes fewer line breaks to
+ // be erroneously inserted when copy pasting from the PDF, for
+ // some reason.
+ p.fpdf.CellFormat(cellW, lineheight, cellText + " ", "", 0, "T", false, 0, "")
}
}
return p.fpdf.Error()