From 459e7939910213035a432ee3ce7986b6d48c604b Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 31 Oct 2019 10:41:23 +0000 Subject: Add work in progress PDF producer --- cmd/pdfbook/main.go | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 cmd/pdfbook/main.go (limited to 'cmd') diff --git a/cmd/pdfbook/main.go b/cmd/pdfbook/main.go new file mode 100644 index 0000000..09202e0 --- /dev/null +++ b/cmd/pdfbook/main.go @@ -0,0 +1,102 @@ +package main + +import ( + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "path/filepath" + "strings" + + "rescribe.xyz/gofpdf" + "rescribe.xyz/utils/pkg/hocr" +) + +// see notebook for rationale; experimental +func pxToPt(i int) float64 { + return float64(i) / 5 +} + +func lineText(l hocr.OcrLine) string { + // TODO: handle cases of OcrLine being where the text is, and OcrChar being where the text is + var t string + for _, w := range l.Words { + if len(t) > 0 { + t += " " + } + t += w.Text + } + return t +} + +func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc { + return func(path string, info os.FileInfo, err error) error { + if info.IsDir() { + return nil + } + if !strings.HasSuffix(path, ".hocr") { + return nil + } + // TODO: have errors returned include the file path of the error + file, err := ioutil.ReadFile(path) + if err != nil { + return err + } + h, err := hocr.Parse(file) + if err != nil { + return err + } + // TODO: get page dimensions from image dimensions + pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(1414), Ht: pxToPt(2500)}) + //pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible) + // TODO: add page image + for _, l := range h.Lines { + coords, err := hocr.BoxCoords(l.Title) + if err != nil { + return err + } + pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1])) + // TODO: html escape text + pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), hocr.LineText(l), "", 0, "T", false, 0, "") + } + return nil + } +} + +func main() { + // TODO: handle best + // TODO: take flags to do colour or binarised + // TODO: probably also take flags to resize / change quality in due course + flag.Usage = func() { + fmt.Fprintln(flag.CommandLine.Output(), "Usage: pdfbook hocrdir out.pdf") + flag.PrintDefaults() + } + flag.Parse() + + if flag.NArg() != 2 { + flag.Usage() + return + } + + // TODO: this will go in pdf.go in due course, potentially with a + // type which covers gofpdf.Fpdf, and an interface, so that + // the backend can be switched out like aws.go + pdf := gofpdf.New("P", "pt", "A4", "") + // Even though it's invisible, we need to add a font which can do UTF-8 so text is correctly rendered + // TODO: find a font that's closer to the average dimensions of the + // text we're dealing with, and put it somewhere sensible + pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf") + pdf.SetFont("dejavu", "", 10) + pdf.SetAutoPageBreak(false, float64(0)) + + err := filepath.Walk(flag.Arg(0), walker(pdf)) + if err != nil { + log.Fatalln("Failed to walk", flag.Arg(0), err) + } + + err = pdf.OutputFileAndClose(flag.Arg(1)) + if err != nil { + log.Fatalln("Failed to save", flag.Arg(1), err) + } +} -- cgit v1.2.1-24-ge1ad