diff options
author | Nick White <git@njw.name> | 2021-08-17 13:39:09 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2021-08-17 13:39:09 +0100 |
commit | 767b60db23311adaf1035e821bc189877d63b7f0 (patch) | |
tree | 16f20e400c2c259488622a98b71d199cad4f62e7 | |
parent | 48c68cfa7f18992b26765c7b67c52c11687ad74a (diff) |
pdf: Stretch words to fit in their boxes, for more perfect embedding
- Words are stretched to fit their boxes, which means the accuracy
of word and character locations is now very high indeed. This was
done by modifying gofpdf to add the SetCellStretchToFit function,
which will hopefully be upstreamed in due course.
- Copy-pasting from a PDF works well, with lines rarely if ever being
erroneously broken by the PDF reader. It took quite a bit of
trial and error to improve this; stretching the text and adding a
space after each word in CellFormat worked best (and it preserves the
accuracy of word and character locations).
-rw-r--r-- | go.mod | 2 | ||||
-rw-r--r-- | go.sum | 4 | ||||
-rw-r--r-- | pdf.go | 15 |
3 files changed, 14 insertions, 7 deletions
@@ -4,7 +4,7 @@ go 1.16 require ( github.com/aws/aws-sdk-go v1.40.6 - github.com/phpdave11/gofpdf v1.4.2 + github.com/nickjwhite/gofpdf v1.12.7-0.20210817123627-3cbaeb9797ef github.com/wcharczuk/go-chart/v2 v2.1.0 golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d rescribe.xyz/preproc v0.4.2 @@ -10,8 +10,8 @@ github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHW github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= -github.com/phpdave11/gofpdf v1.4.2 h1:KPKiIbfwbvC/wOncwhrpRdXVj2CZTCFlw4wnoyjtHfQ= -github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= +github.com/nickjwhite/gofpdf v1.12.7-0.20210817123627-3cbaeb9797ef h1:Pq7OiIQ0gWrNQr12kRIvAH7qEr3i6hfSfd0smfwGgC0= +github.com/nickjwhite/gofpdf v1.12.7-0.20210817123627-3cbaeb9797ef/go.mod h1:HHhz41M4FT6ogJi7KMknlu/sEJd6halot657/rJwzVM= github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -16,7 +16,8 @@ import ( "io/ioutil" "os" - "github.com/phpdave11/gofpdf" + //"github.com/phpdave11/gofpdf" + "github.com/nickjwhite/gofpdf" // adds SetCellStretchToFit function "golang.org/x/image/draw" "rescribe.xyz/utils/pkg/hocr" ) @@ -112,7 +113,7 @@ func (p *Fpdf) AddPage(imgpath, hocrpath string, smaller bool) error { if err != nil { continue } - lineheight := linecoords[3] - linecoords[1] + lineheight := pxToPt(linecoords[3] - linecoords[1]) for _, w := range l.Words { coords, err := hocr.BoxCoords(w.Title) if err != nil { @@ -120,8 +121,14 @@ func (p *Fpdf) AddPage(imgpath, hocrpath string, smaller bool) 
error { } p.fpdf.SetXY(pxToPt(coords[0]), pxToPt(linecoords[1])) p.fpdf.SetCellMargin(0) - p.fpdf.SetFontSize(pxToPt(lineheight)) - p.fpdf.CellFormat(pxToPt(coords[2] - coords[0]), pxToPt(lineheight), html.UnescapeString(w.Text)+" ", "", 0, "T", false, 0, "") + p.fpdf.SetFontSize(lineheight) + cellW := pxToPt(coords[2] - coords[0]) + cellText := html.UnescapeString(w.Text) + p.fpdf.SetCellStretchToFit(cellW, cellText) + // Adding a space after each word causes fewer line breaks to + // be erroneously inserted when copy pasting from the PDF, for + // some reason. + p.fpdf.CellFormat(cellW, lineheight, cellText + " ", "", 0, "T", false, 0, "") } } return p.fpdf.Error() |