1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
// Copyright 2019 Nick White.
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
// pdfbook creates a searchable PDF from a directory of hOCR and
// image files.
package main
import (
"bufio"
"errors"
"flag"
"fmt"
"log"
"os"
"path"
"path/filepath"
"sort"
"strings"
"rescribe.xyz/bookpipeline"
)
// usage is the help text that main's flag.Usage handler prints before
// the flag defaults.
const usage = `Usage: pdfbook [-c] [-s] dir out.pdf
Creates a searchable PDF from a directory of hOCR and image files.
If a 'best' file exists in the directory, each hOCR listed in it is
used to provide the searchable text for each page. Otherwise pdfbook
just looks for a .hocr with the same file base as the image for the
searchable text.
`
// Pdfer describes the PDF builder operations this command relies on.
// Keeping it as an interface lets addBest and walker work with any
// builder that offers these three methods.
type Pdfer interface {
	// Setup is called once by main before any page is added.
	Setup() error

	// AddPage is called once per hOCR file, with the path of the page
	// image, the path of the hOCR file, and the value of the -s
	// ("smaller") command line flag.
	AddPage(imgpath string, hocrpath string, smaller bool) error

	// Save is called last by main, with the output path given on the
	// command line.
	Save(path string) error
}
// pageWidth is the page width in inches.
const pageWidth = 5

// pxToPt scales a pixel measurement by dividing it by pageWidth and
// returns the result as a float64.
// NOTE(review): the previous comment described this as a conversion to
// points at 72 pts per inch, but no factor of 72 appears in the
// calculation - confirm the intended unit with the callers.
func pxToPt(i int) float64 {
	px := float64(i)
	return px / pageWidth
}
// imgPath returns the path of the image that corresponds to hocrpath.
// The image is looked for in the same directory as the hOCR file.
//
// When colour is false the image name is the hOCR file name with its
// ".hocr" suffix replaced by ".png".
//
// When colour is true the image name ends in ".jpg". If the hOCR file
// name contains "_bin", everything from the first "_bin" onwards is
// dropped before ".jpg" is appended; otherwise only the ".hocr" suffix
// is replaced.
//
// The path is handled with path/filepath rather than path because the
// paths handed over by filepath.Walk use the operating system's
// separator, which the slash-only path package does not understand.
func imgPath(hocrpath string, colour bool) string {
	name := filepath.Base(hocrpath)
	stem := strings.TrimSuffix(name, ".hocr")
	ext := ".png"
	if colour {
		ext = ".jpg"
		// The colour image is named after the part before "_bin".
		if i := strings.Index(name, "_bin"); i >= 0 {
			stem = name[:i]
		}
	}
	return filepath.Join(filepath.Dir(hocrpath), stem+ext)
}
// addBest adds the pages listed in dir/best to a PDF.
//
// The best file is read line by line. Lines whose extension is not
// ".hocr" are ignored. The remaining names are sorted, joined onto dir,
// and passed to pdf.AddPage together with the matching image path from
// imgPath and the smaller flag.
//
// It returns an error if the best file cannot be opened or fully read,
// or if adding any page fails; pages are added up to the first failure.
func addBest(dir string, pdf Pdfer, colour, smaller bool) error {
	f, err := os.Open(path.Join(dir, "best"))
	if err != nil {
		// Report the failure to the caller instead of exiting here, so
		// the caller decides how to handle it.
		return fmt.Errorf("Failed to open best file: %v", err)
	}
	defer f.Close()

	var files []string
	s := bufio.NewScanner(f)
	for s.Scan() {
		fn := s.Text()
		if path.Ext(fn) != ".hocr" {
			continue
		}
		files = append(files, fn)
	}
	// Scan also returns false on a read error or an over-long line;
	// without this check the page list would be silently truncated.
	if err := s.Err(); err != nil {
		return fmt.Errorf("Failed to read best file: %v", err)
	}
	sort.Strings(files)

	for _, name := range files {
		hocrpath := path.Join(dir, name)
		if err := pdf.AddPage(imgPath(hocrpath, colour), hocrpath, smaller); err != nil {
			// errors.New is kept so the file's "errors" import stays in use.
			return errors.New(fmt.Sprintf("Failed to add page %s: %v", name, err))
		}
	}
	return nil
}
// walker builds the filepath.WalkFunc used to fill the PDF when no
// best file exists. The returned function passes any walk error
// straight back, skips directories and files without a ".hocr"
// extension, and for every remaining file adds a page to pdf using the
// image path from imgPath.
func walker(pdf Pdfer, colour, smaller bool) filepath.WalkFunc {
	visit := func(fpath string, info os.FileInfo, err error) error {
		switch {
		case err != nil:
			// Checked first: info is not inspected when the walk failed.
			return err
		case info.IsDir(), path.Ext(fpath) != ".hocr":
			return nil
		}
		return pdf.AddPage(imgPath(fpath, colour), fpath, smaller)
	}
	return visit
}
// main parses the command line, builds the PDF and saves it.
//
// Exactly two positional arguments are expected: the input directory
// and the output path; otherwise the usage text is printed and main
// returns. If the directory contains a file named "best", pages are
// added from it with addBest; if it does not, the directory is walked
// and every .hocr file found is added. Any failure is logged and ends
// the program via log.Fatalln.
func main() {
	var colour, smaller bool
	flag.BoolVar(&colour, "c", false, "colour")
	flag.BoolVar(&smaller, "s", false, "smaller")
	flag.Usage = func() {
		fmt.Fprintln(flag.CommandLine.Output(), usage)
		flag.PrintDefaults()
	}
	flag.Parse()

	if flag.NArg() != 2 {
		flag.Usage()
		return
	}
	dir, out := flag.Arg(0), flag.Arg(1)

	pdf := new(bookpipeline.Fpdf)
	if err := pdf.Setup(); err != nil {
		log.Fatalln("Failed to set up PDF", err)
	}

	// The presence of a best file decides how the pages are collected.
	_, statErr := os.Stat(path.Join(dir, "best"))
	switch {
	case statErr == nil:
		if err := addBest(dir, pdf, colour, smaller); err != nil {
			log.Fatalln("Failed to add best pages", err)
		}
	case os.IsNotExist(statErr):
		if err := filepath.Walk(dir, walker(pdf, colour, smaller)); err != nil {
			log.Fatalln("Failed to walk", dir, err)
		}
	default:
		log.Fatalln("Failed to stat best", statErr)
	}

	if err := pdf.Save(out); err != nil {
		log.Fatalln("Failed to save", out, err)
	}
}
|