diff options
| author | Nick White <git@njw.name> | 2019-10-31 10:41:23 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-10-31 10:41:23 +0000 | 
| commit | 459e7939910213035a432ee3ce7986b6d48c604b (patch) | |
| tree | 1c30db40a93d765539f57a65ae10382df8e8b56b /cmd/pdfbook | |
| parent | 9789666a4f6d30b07ce0c1ec3b06987c5a920d7b (diff) | |
Add work in progress PDF producer
Diffstat (limited to 'cmd/pdfbook')
| -rw-r--r-- | cmd/pdfbook/main.go | 102 | 
1 files changed, 102 insertions, 0 deletions
| diff --git a/cmd/pdfbook/main.go b/cmd/pdfbook/main.go new file mode 100644 index 0000000..09202e0 --- /dev/null +++ b/cmd/pdfbook/main.go @@ -0,0 +1,102 @@ +package main + +import ( +	"flag" +	"fmt" +	"io/ioutil" +	"log" +	"os" +	"path/filepath" +	"strings" + +	"rescribe.xyz/gofpdf" +	"rescribe.xyz/utils/pkg/hocr" +) + +// see notebook for rationale; experimental +func pxToPt(i int) float64 { +	return float64(i) / 5 +} + +func lineText(l hocr.OcrLine) string { +	// TODO: handle cases of OcrLine being where the text is, and OcrChar being where the text is +	var t string +	for _, w := range l.Words { +		if len(t) > 0 { +			t += " " +		} +		t += w.Text +	} +	return t +} + +func walker(pdf *gofpdf.Fpdf) filepath.WalkFunc { +	return func(path string, info os.FileInfo, err error) error { +		if info.IsDir() { +			return nil +		} +		if !strings.HasSuffix(path, ".hocr") { +			return nil +		} +		// TODO: have errors returned include the file path of the error +		file, err := ioutil.ReadFile(path) +		if err != nil { +			return err +		} +		h, err := hocr.Parse(file) +		if err != nil { +			return err +		} +		// TODO: get page dimensions from image dimensions +		pdf.AddPageFormat("P", gofpdf.SizeType{Wd: pxToPt(1414), Ht: pxToPt(2500)}) +		//pdf.SetTextRenderingMode(gofpdf.TextRenderingModeInvisible) +		// TODO: add page image +		for _, l := range h.Lines { +			coords, err := hocr.BoxCoords(l.Title) +			if err != nil { +				return err +			} +			pdf.SetXY(pxToPt(coords[0]), pxToPt(coords[1])) +			// TODO: html escape text +			pdf.CellFormat(pxToPt(coords[2]), pxToPt(coords[3]), hocr.LineText(l), "", 0, "T", false, 0, "") +		} +		return nil +	} +} + +func main() { +	// TODO: handle best +	// TODO: take flags to do colour or binarised +	// TODO: probably also take flags to resize / change quality in due course +	flag.Usage = func() { +		fmt.Fprintln(flag.CommandLine.Output(), "Usage: pdfbook hocrdir out.pdf") +		flag.PrintDefaults() +	} +	flag.Parse() + +	if flag.NArg() != 2 { +		flag.Usage() +		return +	} + +	// TODO: this will go in pdf.go in due course, potentially with a +	//       type which covers gofpdf.Fpdf, and an interface, so that +	//       the backend can be switched out like aws.go +	pdf := gofpdf.New("P", "pt", "A4", "") +	// Even though it's invisible, we need to add a font which can do UTF-8 so text is correctly rendered +	// TODO: find a font that's closer to the average dimensions of the +	//       text we're dealing with, and put it somewhere sensible +	pdf.AddUTF8Font("dejavu", "", "DejaVuSansCondensed.ttf") +	pdf.SetFont("dejavu", "", 10) +	pdf.SetAutoPageBreak(false, float64(0)) + +	err := filepath.Walk(flag.Arg(0), walker(pdf)) +	if err != nil { +		log.Fatalln("Failed to walk", flag.Arg(0), err) +        } + +	err = pdf.OutputFileAndClose(flag.Arg(1)) +	if err != nil { +		log.Fatalln("Failed to save", flag.Arg(1), err) +        } +} | 
