summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2019-10-08 15:49:52 +0100
committerNick White <git@njw.name>2019-10-08 15:49:52 +0100
commit69aae6b93dcadd9e4895f86fe661ee80e79dcf9e (patch)
treebfc42b832a6d6e3051631eb6cf530e1400b2c080
parentd43c11bf653bfe3c1ad1ed277f1ec08bf155cf98 (diff)
Remove parts that have been moved elsewhere, and rename to rescribe.xyz/utils
bookpipeline is now at rescribe.xyz/bookpipeline; preproc is now at rescribe.xyz/preproc; integralimg is now at rescribe.xyz/preproc/integralimg.
-rw-r--r--avg-lines/html.go2
-rw-r--r--avg-lines/main.go6
-rw-r--r--bookpipeline/aws.go322
-rw-r--r--bookpipeline/cmd/bookpipeline/main.go488
-rw-r--r--bookpipeline/cmd/booktopipeline/main.go140
-rw-r--r--bookpipeline/cmd/confgraph/main.go71
-rw-r--r--bookpipeline/cmd/getpipelinebook/main.go122
-rw-r--r--bookpipeline/cmd/lspipeline/main.go250
-rw-r--r--bookpipeline/cmd/mkpipeline/main.go79
-rw-r--r--bookpipeline/graph.go155
-rw-r--r--bucket-lines/bucket.go2
-rw-r--r--bucket-lines/main.go6
-rw-r--r--dehyphenate/main.go2
-rw-r--r--hocrtotxt/main.go2
-rw-r--r--integralimg/integralimg.go169
-rw-r--r--pgconf/main.go2
-rw-r--r--pkg/hocr/hocr.go (renamed from lib/hocr/hocr.go)0
-rw-r--r--pkg/hocr/lines.go (renamed from lib/hocr/lines.go)2
-rw-r--r--pkg/line/line.go (renamed from lib/line/line.go)0
-rw-r--r--pkg/prob/prob.go (renamed from lib/prob/prob.go)2
-rw-r--r--preproc/cmd/binarize/main.go78
-rw-r--r--preproc/cmd/preproc/main.go90
-rw-r--r--preproc/cmd/preprocmulti/main.go101
-rw-r--r--preproc/cmd/wipe/main.go55
-rw-r--r--preproc/preprocmulti.go94
-rw-r--r--preproc/sauvola.go76
-rw-r--r--preproc/sauvola_test.go70
-rw-r--r--preproc/test_helpers.go53
-rw-r--r--preproc/testdata/pg1.pngbin651071 -> 0 bytes
-rw-r--r--preproc/testdata/pg1_integralsauvola_k0.3_w19.pngbin19456 -> 0 bytes
-rw-r--r--preproc/testdata/pg1_integralsauvola_k0.5_w19.pngbin18241 -> 0 bytes
-rw-r--r--preproc/testdata/pg1_integralsauvola_k0.5_w41.pngbin18260 -> 0 bytes
-rw-r--r--preproc/testdata/pg1_sauvola_k0.3_w19.pngbin19447 -> 0 bytes
-rw-r--r--preproc/testdata/pg1_sauvola_k0.5_w19.pngbin18231 -> 0 bytes
-rw-r--r--preproc/testdata/pg1_sauvola_k0.5_w41.pngbin18275 -> 0 bytes
-rw-r--r--preproc/testdata/pg2.pngbin30803 -> 0 bytes
-rw-r--r--preproc/testdata/pg2_integralwipesides_t0.02_w5.pngbin33595 -> 0 bytes
-rw-r--r--preproc/testdata/pg2_integralwipesides_t0.05_w25.pngbin33432 -> 0 bytes
-rw-r--r--preproc/testdata/pg2_integralwipesides_t0.05_w5.pngbin14546 -> 0 bytes
-rw-r--r--preproc/util.go95
-rw-r--r--preproc/wipesides.go160
-rw-r--r--preproc/wipesides_test.go57
42 files changed, 13 insertions, 2738 deletions
diff --git a/avg-lines/html.go b/avg-lines/html.go
index 443cc4a..97d8ec9 100644
--- a/avg-lines/html.go
+++ b/avg-lines/html.go
@@ -5,7 +5,7 @@ import (
"os"
"path/filepath"
- "rescribe.xyz/go.git/lib/line"
+ "rescribe.xyz/utils/pkg/line"
)
func copylineimg(fn string, l line.Detail) error {
diff --git a/avg-lines/main.go b/avg-lines/main.go
index 14b21bd..f7cedab 100644
--- a/avg-lines/main.go
+++ b/avg-lines/main.go
@@ -8,9 +8,9 @@ import (
"path/filepath"
"sort"
- "rescribe.xyz/go.git/lib/hocr"
- "rescribe.xyz/go.git/lib/line"
- "rescribe.xyz/go.git/lib/prob"
+ "rescribe.xyz/utils/pkg/hocr"
+ "rescribe.xyz/utils/pkg/line"
+ "rescribe.xyz/utils/pkg/prob"
)
func main() {
diff --git a/bookpipeline/aws.go b/bookpipeline/aws.go
deleted file mode 100644
index 0127d6e..0000000
--- a/bookpipeline/aws.go
+++ /dev/null
@@ -1,322 +0,0 @@
-package bookpipeline
-
-import (
- "errors"
- "fmt"
- "log"
- "os"
- "time"
-
- "github.com/aws/aws-sdk-go/aws"
- "github.com/aws/aws-sdk-go/aws/awserr"
- "github.com/aws/aws-sdk-go/aws/session"
- "github.com/aws/aws-sdk-go/service/ec2"
- "github.com/aws/aws-sdk-go/service/s3"
- "github.com/aws/aws-sdk-go/service/s3/s3manager"
- "github.com/aws/aws-sdk-go/service/sqs"
-)
-
-const PreprocPattern = `_bin[0-9].[0-9].png`
-const heartbeatRetry = 10
-
-type Qmsg struct {
- Id, Handle, Body string
-}
-
-type InstanceDetails struct {
- Id, Name, Ip, Spot, Type, State, LaunchTime string
-}
-
-type AwsConn struct {
- // these need to be set before running Init()
- Region string
- Logger *log.Logger
-
- // these are used internally
- sess *session.Session
- ec2svc *ec2.EC2
- s3svc *s3.S3
- sqssvc *sqs.SQS
- downloader *s3manager.Downloader
- uploader *s3manager.Uploader
- wipequrl, prequrl, ocrqurl, analysequrl string
- wipstorageid string
-}
-
-// TODO: split this up, as not everything is needed for different uses
-func (a *AwsConn) Init() error {
- if a.Region == "" {
- return errors.New("No Region set")
- }
- if a.Logger == nil {
- return errors.New("No logger set")
- }
-
- var err error
- a.sess, err = session.NewSession(&aws.Config{
- Region: aws.String(a.Region),
- })
- if err != nil {
- return errors.New(fmt.Sprintf("Failed to set up aws session: %s", err))
- }
- a.ec2svc = ec2.New(a.sess)
- a.s3svc = s3.New(a.sess)
- a.sqssvc = sqs.New(a.sess)
- a.downloader = s3manager.NewDownloader(a.sess)
- a.uploader = s3manager.NewUploader(a.sess)
-
- a.Logger.Println("Getting preprocess queue URL")
- result, err := a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{
- QueueName: aws.String("rescribepreprocess"),
- })
- if err != nil {
- return errors.New(fmt.Sprintf("Error getting preprocess queue URL: %s", err))
- }
- a.prequrl = *result.QueueUrl
-
- a.Logger.Println("Getting wipeonly queue URL")
- result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{
- QueueName: aws.String("rescribewipeonly"),
- })
- if err != nil {
- return errors.New(fmt.Sprintf("Error getting wipeonly queue URL: %s", err))
- }
- a.wipequrl = *result.QueueUrl
-
- a.Logger.Println("Getting OCR queue URL")
- result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{
- QueueName: aws.String("rescribeocr"),
- })
- if err != nil {
- return errors.New(fmt.Sprintf("Error getting OCR queue URL: %s", err))
- }
- a.ocrqurl = *result.QueueUrl
-
- a.Logger.Println("Getting analyse queue URL")
- result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{
- QueueName: aws.String("rescribeanalyse"),
- })
- if err != nil {
- return errors.New(fmt.Sprintf("Error getting analyse queue URL: %s", err))
- }
- a.analysequrl = *result.QueueUrl
-
- a.wipstorageid = "rescribeinprogress"
-
- return nil
-}
-
-func (a *AwsConn) CheckQueue(url string, timeout int64) (Qmsg, error) {
- msgResult, err := a.sqssvc.ReceiveMessage(&sqs.ReceiveMessageInput{
- MaxNumberOfMessages: aws.Int64(1),
- VisibilityTimeout: &timeout,
- WaitTimeSeconds: aws.Int64(20),
- QueueUrl: &url,
- })
- if err != nil {
- return Qmsg{}, err
- }
-
- if len(msgResult.Messages) > 0 {
- msg := Qmsg{Id: *msgResult.Messages[0].MessageId,
- Handle: *msgResult.Messages[0].ReceiptHandle,
- Body: *msgResult.Messages[0].Body}
- a.Logger.Println("Message received:", msg.Body)
- return msg, nil
- } else {
- return Qmsg{}, nil
- }
-}
-
-// QueueHeartbeat updates the visibility timeout of a message. This
-// ensures that the message remains "in flight", meaning that it
-// cannot be seen by other processes, but if this process fails the
-// timeout will expire and it will go back to being available for
-// any other process to retrieve and process.
-//
-// SQS only allows messages to be "in flight" for up to 12 hours, so
-// this will detect if the request for an update to visibility timeout
-// fails, and if so will attempt to find the message on the queue, and
-// return it, as the handle will have changed.
-func (a *AwsConn) QueueHeartbeat(msg Qmsg, qurl string, duration int64) (Qmsg, error) {
- _, err := a.sqssvc.ChangeMessageVisibility(&sqs.ChangeMessageVisibilityInput{
- ReceiptHandle: &msg.Handle,
- QueueUrl: &qurl,
- VisibilityTimeout: &duration,
- })
- if err != nil {
- aerr, ok := err.(awserr.Error)
-
- // Check if the visibility timeout has exceeded the maximum allowed,
- // and if so try to find the message again to get a new handle.
- if ok && aerr.Code() == "InvalidParameterValue" {
- // Try heartbeatRetry times to find the message
- for range [heartbeatRetry]bool{} {
- // Wait a little in case existing visibilitytimeout needs to expire
- time.Sleep((time.Duration(duration) / heartbeatRetry) * time.Second)
-
- msgResult, err := a.sqssvc.ReceiveMessage(&sqs.ReceiveMessageInput{
- MaxNumberOfMessages: aws.Int64(10),
- VisibilityTimeout: &duration,
- WaitTimeSeconds: aws.Int64(20),
- QueueUrl: &qurl,
- })
- if err != nil {
- return Qmsg{}, errors.New(fmt.Sprintf("Heartbeat error looking for message to update heartbeat: %s", err))
- }
- for _, m := range msgResult.Messages {
- if *m.MessageId == msg.Id {
- return Qmsg{
- Id: *m.MessageId,
- Handle: *m.ReceiptHandle,
- Body: *m.Body,
- }, nil
- }
- }
- }
- return Qmsg{}, errors.New("Heartbeat error failed to find message to update heartbeat")
- } else {
- return Qmsg{}, errors.New(fmt.Sprintf("Heartbeat error updating queue duration: %s", err))
- }
- }
- return Qmsg{}, nil
-}
-
-// GetQueueDetails gets the number of in progress and available
-// messages for a queue. These are returned as strings.
-func (a *AwsConn) GetQueueDetails(url string) (string, string, error) {
- numAvailable := "ApproximateNumberOfMessages"
- numInProgress := "ApproximateNumberOfMessagesNotVisible"
- attrs, err := a.sqssvc.GetQueueAttributes(&sqs.GetQueueAttributesInput{
- AttributeNames: []*string{&numAvailable, &numInProgress},
- QueueUrl: &url,
- })
- if err != nil {
- return "", "", errors.New(fmt.Sprintf("Failed to get queue attributes: %s", err))
- }
- return *attrs.Attributes[numAvailable], *attrs.Attributes[numInProgress], nil
-}
-
-func (a *AwsConn) PreQueueId() string {
- return a.prequrl
-}
-
-func (a *AwsConn) WipeQueueId() string {
- return a.wipequrl
-}
-
-func (a *AwsConn) OCRQueueId() string {
- return a.ocrqurl
-}
-
-func (a *AwsConn) AnalyseQueueId() string {
- return a.analysequrl
-}
-
-func (a *AwsConn) WIPStorageId() string {
- return a.wipstorageid
-}
-
-func (a *AwsConn) ListObjects(bucket string, prefix string) ([]string, error) {
- var names []string
- err := a.s3svc.ListObjectsV2Pages(&s3.ListObjectsV2Input{
- Bucket: aws.String(bucket),
- Prefix: aws.String(prefix),
- }, func(page *s3.ListObjectsV2Output, last bool) bool {
- for _, r := range page.Contents {
- names = append(names, *r.Key)
- }
- return true
- })
- return names, err
-}
-
-func (a *AwsConn) AddToQueue(url string, msg string) error {
- _, err := a.sqssvc.SendMessage(&sqs.SendMessageInput{
- MessageBody: &msg,
- QueueUrl: &url,
- })
- return err
-}
-
-func (a *AwsConn) DelFromQueue(url string, handle string) error {
- _, err := a.sqssvc.DeleteMessage(&sqs.DeleteMessageInput{
- QueueUrl: &url,
- ReceiptHandle: &handle,
- })
- return err
-}
-
-func (a *AwsConn) Download(bucket string, key string, path string) error {
- f, err := os.Create(path)
- if err != nil {
- return err
- }
- defer f.Close()
-
- _, err = a.downloader.Download(f,
- &s3.GetObjectInput{
- Bucket: aws.String(bucket),
- Key: &key,
- })
- return err
-}
-
-func (a *AwsConn) Upload(bucket string, key string, path string) error {
- file, err := os.Open(path)
- if err != nil {
- log.Fatalln("Failed to open file", path, err)
- }
- defer file.Close()
-
- _, err = a.uploader.Upload(&s3manager.UploadInput{
- Bucket: aws.String(bucket),
- Key: aws.String(key),
- Body: file,
- })
- return err
-}
-
-func (a *AwsConn) GetLogger() *log.Logger {
- return a.Logger
-}
-
-func instanceDetailsFromPage(page *ec2.DescribeInstancesOutput) []InstanceDetails {
- var details []InstanceDetails
- for _, r := range page.Reservations {
- for _, i := range r.Instances {
- var d InstanceDetails
-
- for _, t := range i.Tags {
- if *t.Key == "Name" {
- d.Name = *t.Value
- }
- }
- if i.PublicIpAddress != nil {
- d.Ip = *i.PublicIpAddress
- }
- if i.SpotInstanceRequestId != nil {
- d.Spot = *i.SpotInstanceRequestId
- }
- d.Type = *i.InstanceType
- d.Id = *i.InstanceId
- d.LaunchTime = i.LaunchTime.String()
- d.State = *i.State.Name
-
- details = append(details, d)
- }
- }
-
- return details
-}
-
-func (a *AwsConn) GetInstanceDetails() ([]InstanceDetails, error) {
- var details []InstanceDetails
- err := a.ec2svc.DescribeInstancesPages(&ec2.DescribeInstancesInput{}, func(page *ec2.DescribeInstancesOutput, lastPage bool) bool {
- for _, d := range instanceDetailsFromPage(page) {
- details = append(details, d)
- }
- return !lastPage
- })
- return details, err
-}
diff --git a/bookpipeline/cmd/bookpipeline/main.go b/bookpipeline/cmd/bookpipeline/main.go
deleted file mode 100644
index 59ece72..0000000
--- a/bookpipeline/cmd/bookpipeline/main.go
+++ /dev/null
@@ -1,488 +0,0 @@
-package main
-
-// TODO: check if images are prebinarised and if so skip multiple binarisation
-
-import (
- "errors"
- "flag"
- "fmt"
- "log"
- "os"
- "os/exec"
- "path/filepath"
- "regexp"
- "strings"
- "time"
-
- "rescribe.xyz/go.git/bookpipeline"
- "rescribe.xyz/go.git/lib/hocr"
- "rescribe.xyz/go.git/preproc"
-)
-
-const usage = `Usage: bookpipeline [-v] [-np] [-nw] [-no] [-na] [-t training]
-
-Watches the preprocess, ocr and analyse queues for book names. When
-one is found this general process is followed:
-
-- The book name is hidden from the queue, and a 'heartbeat' is
- started which keeps it hidden (this will time out after 2 minutes
- if the program is terminated)
-- The necessary files from bookname/ are downloaded
-- The files are processed
-- The resulting files are uploaded to bookname/
-- The heartbeat is stopped
-- The book name is removed from the queue it was taken from, and
- added to the next queue for future processing
-
-`
-
-const PauseBetweenChecks = 3 * time.Minute
-const HeartbeatTime = 60
-
-// null writer to enable non-verbose logging to be discarded
-type NullWriter bool
-
-func (w NullWriter) Write(p []byte) (n int, err error) {
- return len(p), nil
-}
-
-type Clouder interface {
- Init() error
- ListObjects(bucket string, prefix string) ([]string, error)
- Download(bucket string, key string, fn string) error
- Upload(bucket string, key string, path string) error
- CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error)
- AddToQueue(url string, msg string) error
- DelFromQueue(url string, handle string) error
- QueueHeartbeat(msg bookpipeline.Qmsg, qurl string, duration int64) (bookpipeline.Qmsg, error)
-}
-
-type Pipeliner interface {
- Clouder
- PreQueueId() string
- WipeQueueId() string
- OCRQueueId() string
- AnalyseQueueId() string
- WIPStorageId() string
- GetLogger() *log.Logger
-}
-
-func download(dl chan string, process chan string, conn Pipeliner, dir string, errc chan error, logger *log.Logger) {
- for key := range dl {
- fn := filepath.Join(dir, filepath.Base(key))
- logger.Println("Downloading", key)
- err := conn.Download(conn.WIPStorageId(), key, fn)
- if err != nil {
- for range dl {
- } // consume the rest of the receiving channel so it isn't blocked
- close(process)
- errc <- err
- return
- }
- process <- fn
- }
- close(process)
-}
-
-func up(c chan string, done chan bool, conn Pipeliner, bookname string, errc chan error, logger *log.Logger) {
- for path := range c {
- name := filepath.Base(path)
- key := filepath.Join(bookname, name)
- logger.Println("Uploading", key)
- err := conn.Upload(conn.WIPStorageId(), key, path)
- if err != nil {
- for range c {
- } // consume the rest of the receiving channel so it isn't blocked
- errc <- err
- return
- }
- }
-
- done <- true
-}
-
-func preprocess(pre chan string, up chan string, errc chan error, logger *log.Logger) {
- for path := range pre {
- logger.Println("Preprocessing", path)
- done, err := preproc.PreProcMulti(path, []float64{0.1, 0.2, 0.4, 0.5}, "binary", 0, true, 5, 30)
- if err != nil {
- for range pre {
- } // consume the rest of the receiving channel so it isn't blocked
- close(up)
- errc <- err
- return
- }
- for _, p := range done {
- up <- p
- }
- }
- close(up)
-}
-
-func wipe(towipe chan string, up chan string, errc chan error, logger *log.Logger) {
- for path := range towipe {
- logger.Println("Wiping", path)
- s := strings.Split(path, ".")
- base := strings.Join(s[:len(s)-1], "")
- outpath := base + "_bin0.0.png"
- err := preproc.WipeFile(path, outpath, 5, 0.03, 30)
- if err != nil {
- for range towipe {
- } // consume the rest of the receiving channel so it isn't blocked
- close(up)
- errc <- err
- return
- }
- up <- outpath
- }
- close(up)
-}
-
-func ocr(training string) func(chan string, chan string, chan error, *log.Logger) {
- return func(toocr chan string, up chan string, errc chan error, logger *log.Logger) {
- for path := range toocr {
- logger.Println("OCRing", path)
- name := strings.Replace(path, ".png", "", 1)
- cmd := exec.Command("tesseract", "-l", training, path, name, "hocr")
- err := cmd.Run()
- if err != nil {
- for range toocr {
- } // consume the rest of the receiving channel so it isn't blocked
- close(up)
- errc <- errors.New(fmt.Sprintf("Error ocring %s: %s", path, err))
- return
- }
- up <- name + ".hocr"
- }
- close(up)
- }
-}
-
-func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) {
- confs := make(map[string][]*bookpipeline.Conf)
- bestconfs := make(map[string]*bookpipeline.Conf)
- savedir := ""
-
- for path := range toanalyse {
- if savedir == "" {
- savedir = filepath.Dir(path)
- }
- logger.Println("Calculating confidence for", path)
- avg, err := hocr.GetAvgConf(path)
- if err != nil && err.Error() == "No words found" {
- continue
- }
- if err != nil {
- for range toanalyse {
- } // consume the rest of the receiving channel so it isn't blocked
- close(up)
- errc <- errors.New(fmt.Sprintf("Error retreiving confidence for %s: %s", path, err))
- return
- }
- base := filepath.Base(path)
- codestart := strings.Index(base, "_bin")
- name := base[0:codestart]
- var c bookpipeline.Conf
- c.Path = path
- c.Code = base[codestart:]
- c.Conf = avg
- confs[name] = append(confs[name], &c)
-
- }
-
- fn := filepath.Join(savedir, "conf")
- logger.Println("Saving confidences in file", fn)
- f, err := os.Create(fn)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
- return
- }
- defer f.Close()
-
- logger.Println("Finding best confidence for each page, and saving all confidences")
- for base, conf := range confs {
- var best float64
- for _, c := range conf {
- if c.Conf > best {
- best = c.Conf
- bestconfs[base] = c
- }
- _, err = fmt.Fprintf(f, "%s\t%02.f\n", c.Path, c.Conf)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error writing confidences file: %s", err))
- return
- }
- }
- }
- up <- fn
-
- logger.Println("Creating best file listing the best file for each page")
- fn = filepath.Join(savedir, "best")
- f, err = os.Create(fn)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
- return
- }
- defer f.Close()
- for _, conf := range bestconfs {
- _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.Path))
- }
- up <- fn
-
- logger.Println("Creating graph")
- fn = filepath.Join(savedir, "graph.png")
- f, err = os.Create(fn)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err))
- return
- }
- defer f.Close()
- err = bookpipeline.Graph(bestconfs, filepath.Base(savedir), f)
- if err != nil {
- close(up)
- errc <- errors.New(fmt.Sprintf("Error rendering graph: %s", err))
- return
- }
- up <- fn
-
- close(up)
-}
-
-func heartbeat(conn Pipeliner, t *time.Ticker, msg bookpipeline.Qmsg, queue string, msgc chan bookpipeline.Qmsg, errc chan error) {
- currentmsg := msg
- for range t.C {
- m, err := conn.QueueHeartbeat(currentmsg, queue, HeartbeatTime*2)
- if err != nil {
- errc <- err
- t.Stop()
- return
- }
- if m.Id != "" {
- conn.GetLogger().Println("Replaced message handle as visibilitytimeout limit was reached")
- currentmsg = m
- // TODO: maybe handle communicating new msg more gracefully than this
- for range msgc {
- } // throw away any old msgc
- msgc <- m
- }
- }
-}
-
-func processBook(msg bookpipeline.Qmsg, conn Pipeliner, process func(chan string, chan string, chan error, *log.Logger), match *regexp.Regexp, fromQueue string, toQueue string) error {
- dl := make(chan string)
- msgc := make(chan bookpipeline.Qmsg)
- processc := make(chan string)
- upc := make(chan string)
- done := make(chan bool)
- errc := make(chan error)
-
- bookname := msg.Body
-
- d := filepath.Join(os.TempDir(), bookname)
- err := os.MkdirAll(d, 0755)
- if err != nil {
- return errors.New(fmt.Sprintf("Failed to create directory %s: %s", d, err))
- }
-
- t := time.NewTicker(HeartbeatTime * time.Second)
- go heartbeat(conn, t, msg, fromQueue, msgc, errc)
-
- // these functions will do their jobs when their channels have data
- go download(dl, processc, conn, d, errc, conn.GetLogger())
- go process(processc, upc, errc, conn.GetLogger())
- go up(upc, done, conn, bookname, errc, conn.GetLogger())
-
- conn.GetLogger().Println("Getting list of objects to download")
- objs, err := conn.ListObjects(conn.WIPStorageId(), bookname)
- if err != nil {
- t.Stop()
- _ = os.RemoveAll(d)
- return errors.New(fmt.Sprintf("Failed to get list of files for book %s: %s", bookname, err))
- }
- var todl []string
- for _, n := range objs {
- if !match.MatchString(n) {
- conn.GetLogger().Println("Skipping item that doesn't match target", n)
- continue
- }
- todl = append(todl, n)
- }
- for _, a := range todl {
- dl <- a
- }
- close(dl)
-
- // wait for either the done or errc channel to be sent to
- select {
- case err = <-errc:
- t.Stop()
- _ = os.RemoveAll(d)
- return err
- case <-done:
- }
-
- if toQueue != "" {
- conn.GetLogger().Println("Sending", bookname, "to queue", toQueue)
- err = conn.AddToQueue(toQueue, bookname)
- if err != nil {
- t.Stop()
- _ = os.RemoveAll(d)
- return errors.New(fmt.Sprintf("Error adding to queue %s: %s", bookname, err))
- }
- }
-
- t.Stop()
-
- // check whether we're using a newer msg handle
- select {
- case m, ok := <-msgc:
- if ok {
- msg = m
- conn.GetLogger().Println("Using new message handle to delete message from old queue")
- }
- default:
- conn.GetLogger().Println("Using original message handle to delete message from old queue")
- }
-
- conn.GetLogger().Println("Deleting original message from queue", fromQueue)
- err = conn.DelFromQueue(fromQueue, msg.Handle)
- if err != nil {
- _ = os.RemoveAll(d)
- return errors.New(fmt.Sprintf("Error deleting message from queue: %s", err))
- }
-
- err = os.RemoveAll(d)
- if err != nil {
- return errors.New(fmt.Sprintf("Failed to remove directory %s: %s", d, err))
- }
-
- return nil
-}
-
-func main() {
- verbose := flag.Bool("v", false, "verbose")
- training := flag.String("t", "rescribealphav5", "tesseract training file to use")
- nopreproc := flag.Bool("np", false, "disable preprocessing")
- nowipe := flag.Bool("nw", false, "disable wipeonly")
- noocr := flag.Bool("no", false, "disable ocr")
- noanalyse := flag.Bool("na", false, "disable analysis")
-
- flag.Usage = func() {
- fmt.Fprintf(flag.CommandLine.Output(), usage)
- flag.PrintDefaults()
- }
- flag.Parse()
-
- var verboselog *log.Logger
- if *verbose {
- verboselog = log.New(os.Stdout, "", 0)
- } else {
- var n NullWriter
- verboselog = log.New(n, "", 0)
- }
-
- origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match alternative file naming
- wipePattern := regexp.MustCompile(`[0-9]{4}.png$`)
- preprocessedPattern := regexp.MustCompile(`_bin[0-9].[0-9].png$`)
- ocredPattern := regexp.MustCompile(`.hocr$`)
-
- var conn Pipeliner
- conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog}
-
- verboselog.Println("Setting up AWS session")
- err := conn.Init()
- if err != nil {
- log.Fatalln("Error setting up cloud connection:", err)
- }
- verboselog.Println("Finished setting up AWS session")
-
- var checkPreQueue <-chan time.Time
- var checkWipeQueue <-chan time.Time
- var checkOCRQueue <-chan time.Time
- var checkAnalyseQueue <-chan time.Time
- if !*nopreproc {
- checkPreQueue = time.After(0)
- }
- if !*nowipe {
- checkWipeQueue = time.After(0)
- }
- if !*noocr {
- checkOCRQueue = time.After(0)
- }
- if !*noanalyse {
- checkAnalyseQueue = time.After(0)
- }
-
- for {
- select {
- case <-checkPreQueue:
- msg, err := conn.CheckQueue(conn.PreQueueId(), HeartbeatTime*2)
- checkPreQueue = time.After(PauseBetweenChecks)
- if err != nil {
- log.Println("Error checking preprocess queue", err)
- continue
- }
- if msg.Handle == "" {
- verboselog.Println("No message received on preprocess queue, sleeping")
- continue
- }
- verboselog.Println("Message received on preprocess queue, processing", msg.Body)
- err = processBook(msg, conn, preprocess, origPattern, conn.PreQueueId(), conn.OCRQueueId())
- if err != nil {
- log.Println("Error during preprocess", err)
- }
- case <-checkWipeQueue:
- msg, err := conn.CheckQueue(conn.WipeQueueId(), HeartbeatTime*2)
- checkWipeQueue = time.After(PauseBetweenChecks)
- if err != nil {
- log.Println("Error checking wipeonly queue", err)
- continue
- }
- if msg.Handle == "" {
- verboselog.Println("No message received on wipeonly queue, sleeping")
- continue
- }
- verboselog.Println("Message received on wipeonly queue, processing", msg.Body)
- err = processBook(msg, conn, wipe, wipePattern, conn.WipeQueueId(), conn.OCRQueueId())
- if err != nil {
- log.Println("Error during wipe", err)
- }
- case <-checkOCRQueue:
- msg, err := conn.CheckQueue(conn.OCRQueueId(), HeartbeatTime*2)
- checkOCRQueue = time.After(PauseBetweenChecks)
- if err != nil {
- log.Println("Error checking OCR queue", err)
- continue
- }
- if msg.Handle == "" {
- verboselog.Println("No message received on OCR queue, sleeping")
- continue
- }
- verboselog.Println("Message received on OCR queue, processing", msg.Body)
- err = processBook(msg, conn, ocr(*training), preprocessedPattern, conn.OCRQueueId(), conn.AnalyseQueueId())
- if err != nil {
- log.Println("Error during OCR process", err)
- }
- case <-checkAnalyseQueue:
- msg, err := conn.CheckQueue(conn.AnalyseQueueId(), HeartbeatTime*2)
- checkAnalyseQueue = time.After(PauseBetweenChecks)
- if err != nil {
- log.Println("Error checking analyse queue", err)
- continue
- }
- if msg.Handle == "" {
- verboselog.Println("No message received on analyse queue, sleeping")
- continue
- }
- verboselog.Println("Message received on analyse queue, processing", msg.Body)
- err = processBook(msg, conn, analyse, ocredPattern, conn.AnalyseQueueId(), "")
- if err != nil {
- log.Println("Error during analysis", err)
- }
- }
- }
-}
diff --git a/bookpipeline/cmd/booktopipeline/main.go b/bookpipeline/cmd/booktopipeline/main.go
deleted file mode 100644
index 6d9f146..0000000
--- a/bookpipeline/cmd/booktopipeline/main.go
+++ /dev/null
@@ -1,140 +0,0 @@
-package main
-
-// TODO: use bookpipeline package to do aws stuff
-
-import (
- "flag"
- "fmt"
- "log"
- "os"
- "path/filepath"
-
- "github.com/aws/aws-sdk-go/aws"
- "github.com/aws/aws-sdk-go/aws/session"
- "github.com/aws/aws-sdk-go/service/s3/s3manager"
- "github.com/aws/aws-sdk-go/service/sqs"
-)
-
-const usage = `Usage: booktopipeline [-prebinarised] [-v] bookdir [bookname]
-
-Uploads the book in bookdir to the S3 'inprogress' bucket and adds it
-to the 'preprocess' SQS queue, or the 'wipeonly' queue if the
-prebinarised flag is set.
-
-If bookname is omitted the last part of the bookdir is used.
-`
-
-// null writer to enable non-verbose logging to be discarded
-type NullWriter bool
-
-func (w NullWriter) Write(p []byte) (n int, err error) {
- return len(p), nil
-}
-
-var verboselog *log.Logger
-
-type fileWalk chan string
-
-func (f fileWalk) Walk(path string, info os.FileInfo, err error) error {
- if err != nil {
- return err
- }
- if !info.IsDir() {
- f <- path
- }
- return nil
-}
-
-func main() {
- verbose := flag.Bool("v", false, "Verbose")
- wipeonly := flag.Bool("prebinarised", false, "Prebinarised: only preprocessing will be to wipe")
-
- flag.Usage = func() {
- fmt.Fprintf(flag.CommandLine.Output(), usage)
- flag.PrintDefaults()
- }
- flag.Parse()
- if flag.NArg() < 1 {
- flag.Usage()
- return
- }
-
- bookdir := flag.Arg(0)
- var bookname string
- if flag.NArg() > 2 {
- bookname = flag.Arg(1)
- } else {
- bookname = filepath.Base(bookdir)
- }
-
- if *verbose {
- verboselog = log.New(os.Stdout, "", log.LstdFlags)
- } else {
- var n NullWriter
- verboselog = log.New(n, "", log.LstdFlags)
- }
-
- verboselog.Println("Setting up AWS session")
- sess, err := session.NewSession(&aws.Config{
- Region: aws.String("eu-west-2"),
- })
- if err != nil {
- log.Fatalln("Error: failed to set up aws session:", err)
- }
- sqssvc := sqs.New(sess)
- uploader := s3manager.NewUploader(sess)
-
- var qname string
- if *wipeonly {
- qname = "rescribewipeonly"
- } else {
- qname = "rescribepreprocess"
- }
- verboselog.Println("Getting Queue URL for", qname)
- result, err := sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{
- QueueName: aws.String(qname),
- })
- if err != nil {
- log.Fatalln("Error getting queue URL for", qname, ":", err)
- }
- qurl := *result.QueueUrl
-
- // concurrent walking upload based on example at
- // https://docs.aws.amazon.com/sdk-for-go/v1/developer-guide/sdk-utilities.html
- verboselog.Println("Walking", bookdir)
- walker := make(fileWalk)
- go func() {
- err = filepath.Walk(bookdir, walker.Walk)
- if err != nil {
- log.Fatalln("Filesystem walk failed:", err)
- }
- close(walker)
- }()
-
- for path := range walker {
- verboselog.Println("Uploading", path)
- name := filepath.Base(path)
- file, err := os.Open(path)
- if err != nil {
- log.Fatalln("Open file", path, "failed:", err)
- }
- defer file.Close()
- _, err = uploader.Upload(&s3manager.UploadInput{
- Bucket: aws.String("rescribeinprogress"),
- Key: aws.String(filepath.Join(bookname, name)),
- Body: file,
- })
- if err != nil {
- log.Fatalln("Failed to upload", path, err)
- }
- }
-
- verboselog.Println("Sending message", bookname, "to queue", qurl)
- _, err = sqssvc.SendMessage(&sqs.SendMessageInput{
- MessageBody: aws.String(bookname),
- QueueUrl: &qurl,
- })
- if err != nil {
- log.Fatalln("Error adding book to queue:", err)
- }
-}
diff --git a/bookpipeline/cmd/confgraph/main.go b/bookpipeline/cmd/confgraph/main.go
deleted file mode 100644
index b60821e..0000000
--- a/bookpipeline/cmd/confgraph/main.go
+++ /dev/null
@@ -1,71 +0,0 @@
-package main
-
-import (
- "flag"
- "fmt"
- "log"
- "os"
- "path/filepath"
- "strings"
-
- "rescribe.xyz/go.git/bookpipeline"
- "rescribe.xyz/go.git/lib/hocr"
-)
-
-func walker(confs *[]*bookpipeline.Conf) filepath.WalkFunc {
- return func(path string, info os.FileInfo, err error) error {
- if info.IsDir() {
- return nil
- }
- if !strings.HasSuffix(path, ".hocr") {
- return nil
- }
- avg, err := hocr.GetAvgConf(path)
- if err != nil {
- return err
- }
- c := bookpipeline.Conf{
- Conf: avg,
- Path: path,
- }
- *confs = append(*confs, &c)
- return nil
- }
-}
-
-func main() {
- flag.Usage = func() {
- fmt.Fprintln(flag.CommandLine.Output(), "Usage: bookpipeline hocrdir graph.png")
- flag.PrintDefaults()
- }
- flag.Parse()
-
- if flag.NArg() != 2 {
- flag.Usage()
- return
- }
-
- var confs []*bookpipeline.Conf
- err := filepath.Walk(flag.Arg(0), walker(&confs))
- if err != nil {
- log.Fatalln("Failed to walk", flag.Arg(0), err)
- }
-
- // Structure to fit what bookpipeline.Graph needs
- // TODO: probably reorganise bookpipeline to just need []*Conf
- cconfs := make(map[string]*bookpipeline.Conf)
- for _, c := range confs {
- cconfs[c.Path] = c
- }
-
- fn := flag.Arg(1)
- f, err := os.Create(fn)
- if err != nil {
- log.Fatalln("Error creating file", fn, err)
- }
- defer f.Close()
- err = bookpipeline.Graph(cconfs, filepath.Base(flag.Arg(0)), f)
- if err != nil {
- log.Fatalln("Error creating graph", err)
- }
-}
diff --git a/bookpipeline/cmd/getpipelinebook/main.go b/bookpipeline/cmd/getpipelinebook/main.go
deleted file mode 100644
index 66e3f70..0000000
--- a/bookpipeline/cmd/getpipelinebook/main.go
+++ /dev/null
@@ -1,122 +0,0 @@
-package main
-
-import (
- "bufio"
- "flag"
- "fmt"
- "log"
- "os"
- "path/filepath"
-
- "rescribe.xyz/go.git/bookpipeline"
-)
-
-const usage = "Usage: getpipelinebook [-a] [-v] bookname\n\nDownloads the pipeline results for a book.\n"
-
-// null writer to enable non-verbose logging to be discarded
-type NullWriter bool
-
-func (w NullWriter) Write(p []byte) (n int, err error) {
- return len(p), nil
-}
-
-type Pipeliner interface {
- Init() error
- ListObjects(bucket string, prefix string) ([]string, error)
- Download(bucket string, key string, fn string) error
- Upload(bucket string, key string, path string) error
- CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error)
- AddToQueue(url string, msg string) error
- DelFromQueue(url string, handle string) error
- WIPStorageId() string
-}
-
-func main() {
- all := flag.Bool("a", false, "Get all files for book, not just hOCR and analysis files")
- verbose := flag.Bool("v", false, "Verbose")
- flag.Usage = func() {
- fmt.Fprintf(flag.CommandLine.Output(), usage)
- flag.PrintDefaults()
- }
- flag.Parse()
-
- if flag.NArg() < 1 {
- flag.Usage()
- return
- }
-
- var verboselog *log.Logger
- if *verbose {
- verboselog = log.New(os.Stdout, "", log.LstdFlags)
- } else {
- var n NullWriter
- verboselog = log.New(n, "", log.LstdFlags)
- }
-
- var conn Pipeliner
- conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog}
-
- verboselog.Println("Setting up AWS session")
- err := conn.Init()
- if err != nil {
- log.Fatalln("Error setting up cloud connection:", err)
- }
- verboselog.Println("Finished setting up AWS session")
-
- bookname := flag.Arg(0)
-
- err = os.MkdirAll(bookname, 0755)
- if err != nil {
- log.Fatalln("Failed to create directory", bookname, err)
- }
-
- if *all {
- verboselog.Println("Downloading all files for", bookname)
- objs, err := conn.ListObjects(conn.WIPStorageId(), bookname)
- if err != nil {
- log.Fatalln("Failed to get list of files for book", bookname, err)
- }
- for _, i := range objs {
- verboselog.Println("Downloading", i)
- err = conn.Download(conn.WIPStorageId(), i, i)
- if err != nil {
- log.Fatalln("Failed to download file", i, err)
- }
- }
- return
- }
-
- verboselog.Println("Downloading best file")
- fn := filepath.Join(bookname, "best")
- err = conn.Download(conn.WIPStorageId(), fn, fn)
- if err != nil {
- log.Fatalln("Failed to download 'best' file", err)
- }
- f, err := os.Open(fn)
- if err != nil {
- log.Fatalln("Failed to open best file", err)
- }
- defer f.Close()
-
- verboselog.Println("Downloading HOCR files")
- s := bufio.NewScanner(f)
- for s.Scan() {
- fn = filepath.Join(bookname, s.Text())
- verboselog.Println("Downloading file", fn)
- err = conn.Download(conn.WIPStorageId(), fn, fn)
- if err != nil {
- log.Fatalln("Failed to download file", fn, err)
- }
- }
-
- analyses := []string{"conf", "graph.png"}
- verboselog.Println("Downloading analysis files")
- for _, a := range analyses {
- fn = filepath.Join(bookname, a)
- verboselog.Println("Downloading file", fn)
- err = conn.Download(conn.WIPStorageId(), fn, fn)
- if err != nil {
- log.Fatalln("Failed to download file", fn, err)
- }
- }
-}
diff --git a/bookpipeline/cmd/lspipeline/main.go b/bookpipeline/cmd/lspipeline/main.go
deleted file mode 100644
index 46a1d63..0000000
--- a/bookpipeline/cmd/lspipeline/main.go
+++ /dev/null
@@ -1,250 +0,0 @@
-package main
-
-import (
- "flag"
- "fmt"
- "log"
- "os/exec"
- "strings"
-
- "rescribe.xyz/go.git/bookpipeline"
-)
-
-const usage = `Usage: lspipeline [-i key] [-n num]
-
-Lists useful things related to the pipeline.
-
-- Instances running
-- Messages in each queue
-- Books not completed
-- Books done
-- Last n lines of bookpipeline logs from each running instance
-`
-
-type LsPipeliner interface {
- Init() error
- PreQueueId() string
- WipeQueueId() string
- OCRQueueId() string
- AnalyseQueueId() string
- GetQueueDetails(url string) (string, string, error)
- GetInstanceDetails() ([]bookpipeline.InstanceDetails, error)
- ListObjects(bucket string, prefix string) ([]string, error)
- WIPStorageId() string
-}
-
-// NullWriter is used so non-verbose logging may be discarded
-type NullWriter bool
-
-func (w NullWriter) Write(p []byte) (n int, err error) {
- return len(p), nil
-}
-
-type queueDetails struct {
- name, numAvailable, numInProgress string
-}
-
-func getInstances(conn LsPipeliner, detailsc chan bookpipeline.InstanceDetails) {
- details, err := conn.GetInstanceDetails()
- if err != nil {
- log.Println("Error getting instance details:", err)
- }
- for _, d := range details {
- detailsc <- d
- }
- close(detailsc)
-}
-
-func getQueueDetails(conn LsPipeliner, qdetails chan queueDetails) {
- queues := []struct{ name, id string }{
- {"preprocess", conn.PreQueueId()},
- {"wipeonly", conn.WipeQueueId()},
- {"ocr", conn.OCRQueueId()},
- {"analyse", conn.AnalyseQueueId()},
- }
- for _, q := range queues {
- avail, inprog, err := conn.GetQueueDetails(q.id)
- if err != nil {
- log.Println("Error getting queue details:", err)
- }
- var qd queueDetails
- qd.name = q.name
- qd.numAvailable = avail
- qd.numInProgress = inprog
- qdetails <- qd
- }
- close(qdetails)
-}
-
-// getBookStatus returns a list of in progress and done books.
-// It determines this by listing all objects, and splitting the
-// prefixes into two lists, those which have a 'graph.png' file,
-// which are classed as done, and those which are not.
-func getBookStatus(conn LsPipeliner) (inprogress []string, done []string, err error) {
- allfiles, err := conn.ListObjects(conn.WIPStorageId(), "")
- if err != nil {
- log.Println("Error getting list of objects:", err)
- return inprogress, done, err
- }
- for _, f := range allfiles {
- parts := strings.Split(f, "/")
- if parts[1] != "graph.png" {
- continue
- }
- prefix := parts[0]
- found := false
- for _, i := range done {
- if i == prefix {
- found = true
- continue
- }
- }
- if !found {
- done = append(done, prefix)
- }
- }
-
- for _, f := range allfiles {
- parts := strings.Split(f, "/")
- prefix := parts[0]
- found := false
- for _, i := range done {
- if i == prefix {
- found = true
- continue
- }
- }
- for _, i := range inprogress {
- if i == prefix {
- found = true
- continue
- }
- }
- if !found {
- inprogress = append(inprogress, prefix)
- }
- }
-
- return inprogress, done, err
-}
-
-func getBookStatusChan(conn LsPipeliner, inprogressc chan string, donec chan string) {
- inprogress, done, err := getBookStatus(conn)
- if err != nil {
- log.Println("Error getting book status:", err)
- close(inprogressc)
- close(donec)
- return
- }
- for _, i := range inprogress {
- inprogressc <- i
- }
- close(inprogressc)
- for _, i := range done {
- donec <- i
- }
- close(donec)
-}
-
-func getRecentSSHLogs(ip string, id string, n int) (string, error) {
- addr := fmt.Sprintf("%s@%s", "admin", ip)
- logcmd := fmt.Sprintf("journalctl -n %d -u bookpipeline", n)
- var cmd *exec.Cmd
- if id == "" {
- cmd = exec.Command("ssh", "-o", "StrictHostKeyChecking no", addr, logcmd)
- } else {
- cmd = exec.Command("ssh", "-o", "StrictHostKeyChecking no", "-i", id, addr, logcmd)
- }
- out, err := cmd.Output()
- if err != nil {
- return "", err
- }
- return string(out), nil
-}
-
-func getRecentSSHLogsChan(ips []string, id string, lognum int, logs chan string) {
- for _, ip := range ips {
- sshlog, err := getRecentSSHLogs(ip, id, lognum)
- if err != nil {
- log.Printf("Error getting SSH logs for %s: %s\n", ip, err)
- continue
- }
- logs <- fmt.Sprintf("%s\n%s", ip, sshlog)
- }
- close(logs)
-}
-
-func main() {
- keyfile := flag.String("i", "", "private key file for SSH")
- lognum := flag.Int("n", 5, "number of lines to include in SSH logs")
- flag.Usage = func() {
- fmt.Fprintf(flag.CommandLine.Output(), usage)
- flag.PrintDefaults()
- }
- flag.Parse()
-
- var verboselog *log.Logger
- var n NullWriter
- verboselog = log.New(n, "", 0)
-
- var conn LsPipeliner
- conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog}
- err := conn.Init()
- if err != nil {
- log.Fatalln("Failed to set up cloud connection:", err)
- }
-
- instances := make(chan bookpipeline.InstanceDetails, 100)
- queues := make(chan queueDetails)
- inprogress := make(chan string, 100)
- done := make(chan string, 100)
- logs := make(chan string, 10)
-
- go getInstances(conn, instances)
- go getQueueDetails(conn, queues)
- go getBookStatusChan(conn, inprogress, done)
-
- var ips []string
-
- fmt.Println("# Instances")
- for i := range instances {
- fmt.Printf("ID: %s, Type: %s, LaunchTime: %s, State: %s", i.Id, i.Type, i.LaunchTime, i.State)
- if i.Name != "" {
- fmt.Printf(", Name: %s", i.Name)
- }
- if i.Ip != "" {
- fmt.Printf(", IP: %s", i.Ip)
- if i.State == "running" && i.Name != "workhorse" {
- ips = append(ips, i.Ip)
- }
- }
- if i.Spot != "" {
- fmt.Printf(", SpotRequest: %s", i.Spot)
- }
- fmt.Printf("\n")
- }
-
- go getRecentSSHLogsChan(ips, *keyfile, *lognum, logs)
-
- fmt.Println("\n# Queues")
- for i := range queues {
- fmt.Printf("%s: %s available, %s in progress\n", i.name, i.numAvailable, i.numInProgress)
- }
-
- fmt.Println("\n# Books not completed")
- for i := range inprogress {
- fmt.Println(i)
- }
-
- fmt.Println("\n# Books done")
- for i := range done {
- fmt.Println(i)
- }
-
- if len(ips) > 0 {
- fmt.Println("\n# Recent logs")
- for i := range logs {
- fmt.Printf("\n%s", i)
- }
- }
-}
diff --git a/bookpipeline/cmd/mkpipeline/main.go b/bookpipeline/cmd/mkpipeline/main.go
deleted file mode 100644
index e37a56d..0000000
--- a/bookpipeline/cmd/mkpipeline/main.go
+++ /dev/null
@@ -1,79 +0,0 @@
-package main
-
-// TODO: use the bookpipeline package for aws stuff
-// TODO: set up iam role and policy needed for ec2 instances to access this stuff;
-// see arn:aws:iam::557852942063:policy/pipelinestorageandqueue
-// and arn:aws:iam::557852942063:role/pipeliner
-// TODO: set up launch template for ec2 instances
-// NOTE: potentially use json templates to define things, ala aws cli
-
-import (
- "log"
- "os"
-
- "github.com/aws/aws-sdk-go/aws"
- "github.com/aws/aws-sdk-go/aws/awserr"
- "github.com/aws/aws-sdk-go/aws/session"
- "github.com/aws/aws-sdk-go/service/s3"
- "github.com/aws/aws-sdk-go/service/sqs"
-)
-
-func main() {
- if len(os.Args) != 1 {
- log.Fatal("Usage: mkpipeline\n\nSets up necessary S3 buckets and SQS queues for our AWS pipeline\n")
- }
-
- sess, err := session.NewSession(&aws.Config{
- Region: aws.String("eu-west-2"),
- })
- if err != nil {
- log.Fatalf("Error: failed to set up aws session: %v\n", err)
- }
- s3svc := s3.New(sess)
- sqssvc := sqs.New(sess)
-
- prefix := "rescribe"
- buckets := []string{"inprogress", "done"}
- queues := []string{"preprocess", "wipeonly", "ocr", "analyse"}
-
- for _, bucket := range buckets {
- bname := prefix + bucket
- log.Printf("Creating bucket %s\n", bname)
- _, err = s3svc.CreateBucket(&s3.CreateBucketInput{
- Bucket: aws.String(bname),
- })
- if err != nil {
- aerr, ok := err.(awserr.Error)
- if ok && (aerr.Code() == s3.ErrCodeBucketAlreadyExists || aerr.Code() == s3.ErrCodeBucketAlreadyOwnedByYou) {
- log.Printf("Bucket %s already exists\n", bname)
- } else {
- log.Fatalf("Error creating bucket %s: %v\n", bname, err)
- }
- }
- }
-
- for _, queue := range queues {
- qname := prefix + queue
- log.Printf("Creating queue %s\n", qname)
- _, err = sqssvc.CreateQueue(&sqs.CreateQueueInput{
- QueueName: aws.String(qname),
- Attributes: map[string]*string{
- "VisibilityTimeout": aws.String("120"), // 2 minutes
- "MessageRetentionPeriod": aws.String("1209600"), // 14 days; max allowed by sqs
- "ReceiveMessageWaitTimeSeconds": aws.String("20"),
- },
- })
- if err != nil {
- aerr, ok := err.(awserr.Error)
- // Note the QueueAlreadyExists code is only emitted if an existing queue
- // has different attributes than the one that was being created. SQS just
- // quietly ignores the CreateQueue request if it is identical to an
- // existing queue.
- if ok && aerr.Code() == sqs.ErrCodeQueueNameExists {
- log.Fatalf("Error: Queue %s already exists but has different attributes\n", qname)
- } else {
- log.Fatalf("Error creating queue %s: %v\n", qname, err)
- }
- }
- }
-}
diff --git a/bookpipeline/graph.go b/bookpipeline/graph.go
deleted file mode 100644
index 955abbd..0000000
--- a/bookpipeline/graph.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package bookpipeline
-
-import (
- "fmt"
- "io"
- "path/filepath"
- "sort"
- "strconv"
- "strings"
-
- "github.com/wcharczuk/go-chart"
- "github.com/wcharczuk/go-chart/drawing"
-)
-
-const maxticks = 40
-const goodCutoff = 70
-const mediumCutoff = 65
-const badCutoff = 60
-
-type Conf struct {
- Path, Code string
- Conf float64
-}
-
-type GraphConf struct {
- Pgnum, Conf float64
-}
-
-func createLine(xvalues []float64, y float64, c drawing.Color) chart.ContinuousSeries {
- var yvalues []float64
- for range xvalues {
- yvalues = append(yvalues, y)
- }
- return chart.ContinuousSeries{
- XValues: xvalues,
- YValues: yvalues,
- Style: chart.Style{
- StrokeColor: c,
- },
- }
-}
-
-func Graph(confs map[string]*Conf, bookname string, w io.Writer) error {
- // Organise confs to sort them by page
- var graphconf []GraphConf
- for _, conf := range confs {
- name := filepath.Base(conf.Path)
- var numend int
- numend = strings.Index(name, "_")
- if numend == -1 {
- numend = strings.Index(name, ".")
- }
- pgnum, err := strconv.ParseFloat(name[0:numend], 64)
- if err != nil {
- continue
- }
- var c GraphConf
- c.Pgnum = pgnum
- c.Conf = conf.Conf
- graphconf = append(graphconf, c)
- }
- sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].Pgnum < graphconf[j].Pgnum })
-
- // Create main xvalues and yvalues, annotations and ticks
- var xvalues, yvalues []float64
- var annotations []chart.Value2
- var ticks []chart.Tick
- tickevery := len(graphconf) / maxticks
- if tickevery < 1 {
- tickevery = 1
- }
- for i, c := range graphconf {
- xvalues = append(xvalues, c.Pgnum)
- yvalues = append(yvalues, c.Conf)
- if c.Conf < goodCutoff {
- annotations = append(annotations, chart.Value2{Label: fmt.Sprintf("%.0f", c.Pgnum), XValue: c.Pgnum, YValue: c.Conf})
- }
- if i%tickevery == 0 {
- ticks = append(ticks, chart.Tick{c.Pgnum, fmt.Sprintf("%.0f", c.Pgnum)})
- }
- }
- // make last tick the final page
- final := graphconf[len(graphconf)-1]
- ticks[len(ticks)-1] = chart.Tick{final.Pgnum, fmt.Sprintf("%.0f", final.Pgnum)}
- mainSeries := chart.ContinuousSeries{
- XValues: xvalues,
- YValues: yvalues,
- }
-
- // Create lines
- goodCutoffSeries := createLine(xvalues, goodCutoff, chart.ColorAlternateGreen)
- mediumCutoffSeries := createLine(xvalues, mediumCutoff, chart.ColorOrange)
- badCutoffSeries := createLine(xvalues, badCutoff, chart.ColorRed)
-
- // Create lines marking top and bottom 10% confidence
- sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].Conf < graphconf[j].Conf })
- lowconf := graphconf[int(len(graphconf)/10)].Conf
- highconf := graphconf[int((len(graphconf)/10)*9)].Conf
- yvalues = []float64{}
- for range graphconf {
- yvalues = append(yvalues, lowconf)
- }
- minSeries := &chart.ContinuousSeries{
- Style: chart.Style{
- StrokeColor: chart.ColorAlternateGray,
- StrokeDashArray: []float64{5.0, 5.0},
- },
- XValues: xvalues,
- YValues: yvalues,
- }
- yvalues = []float64{}
- for _ = range graphconf {
- yvalues = append(yvalues, highconf)
- }
- maxSeries := &chart.ContinuousSeries{
- Style: chart.Style{
- StrokeColor: chart.ColorAlternateGray,
- StrokeDashArray: []float64{5.0, 5.0},
- },
- XValues: xvalues,
- YValues: yvalues,
- }
-
- graph := chart.Chart{
- Title: bookname,
- Width: 3840,
- Height: 2160,
- XAxis: chart.XAxis{
- Name: "Page number",
- Range: &chart.ContinuousRange{
- Min: 0.0,
- },
- Ticks: ticks,
- },
- YAxis: chart.YAxis{
- Name: "Confidence",
- Range: &chart.ContinuousRange{
- Min: 0.0,
- Max: 100.0,
- },
- },
- Series: []chart.Series{
- mainSeries,
- minSeries,
- maxSeries,
- goodCutoffSeries,
- mediumCutoffSeries,
- badCutoffSeries,
- chart.AnnotationSeries{
- Annotations: annotations,
- },
- },
- }
- return graph.Render(chart.PNG, w)
-}
diff --git a/bucket-lines/bucket.go b/bucket-lines/bucket.go
index 9f98887..7b6fc4f 100644
--- a/bucket-lines/bucket.go
+++ b/bucket-lines/bucket.go
@@ -8,7 +8,7 @@ import (
"sort"
"strconv"
- "rescribe.xyz/go.git/lib/line"
+ "rescribe.xyz/utils/pkg/line"
)
type BucketSpec struct {
diff --git a/bucket-lines/main.go b/bucket-lines/main.go
index 990e84c..6ae506a 100644
--- a/bucket-lines/main.go
+++ b/bucket-lines/main.go
@@ -9,9 +9,9 @@ import (
"os"
"path/filepath"
- "rescribe.xyz/go.git/lib/hocr"
- "rescribe.xyz/go.git/lib/line"
- "rescribe.xyz/go.git/lib/prob"
+ "rescribe.xyz/utils/pkg/hocr"
+ "rescribe.xyz/utils/pkg/line"
+ "rescribe.xyz/utils/pkg/prob"
)
func main() {
diff --git a/dehyphenate/main.go b/dehyphenate/main.go
index 4393c8f..b2bd6f9 100644
--- a/dehyphenate/main.go
+++ b/dehyphenate/main.go
@@ -8,7 +8,7 @@ import (
"log"
"os"
- "rescribe.xyz/go.git/lib/hocr"
+ "rescribe.xyz/utils/pkg/hocr"
)
// BUGS:
diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go
index 6821a9e..6716a9e 100644
--- a/hocrtotxt/main.go
+++ b/hocrtotxt/main.go
@@ -6,7 +6,7 @@ import (
"log"
"os"
- "rescribe.xyz/go.git/lib/hocr"
+ "rescribe.xyz/utils/pkg/hocr"
)
func main() {
diff --git a/integralimg/integralimg.go b/integralimg/integralimg.go
deleted file mode 100644
index 406ed61..0000000
--- a/integralimg/integralimg.go
+++ /dev/null
@@ -1,169 +0,0 @@
-package integralimg
-
-import (
- "image"
- "math"
-)
-
-// I is the Integral Image
-type I [][]uint64
-
-// Sq contains an Integral Image and its Square
-type WithSq struct {
- Img I
- Sq I
-}
-
-// Window is a part of an Integral Image
-type Window struct {
- topleft uint64
- topright uint64
- bottomleft uint64
- bottomright uint64
- width int
- height int
-}
-
-// ToIntegralImg creates an integral image
-func ToIntegralImg(img *image.Gray) I {
- var integral I
- var oldy, oldx, oldxy uint64
- b := img.Bounds()
- for y := b.Min.Y; y < b.Max.Y; y++ {
- newrow := []uint64{}
- for x := b.Min.X; x < b.Max.X; x++ {
- oldx, oldy, oldxy = 0, 0, 0
- if x > 0 {
- oldx = newrow[x-1]
- }
- if y > 0 {
- oldy = integral[y-1][x]
- }
- if x > 0 && y > 0 {
- oldxy = integral[y-1][x-1]
- }
- pixel := uint64(img.GrayAt(x, y).Y)
- i := pixel + oldx + oldy - oldxy
- newrow = append(newrow, i)
- }
- integral = append(integral, newrow)
- }
- return integral
-}
-
-// ToSqIntegralImg creates an integral image of the square of all
-// pixel values
-func ToSqIntegralImg(img *image.Gray) I {
- var integral I
- var oldy, oldx, oldxy uint64
- b := img.Bounds()
- for y := b.Min.Y; y < b.Max.Y; y++ {
- newrow := []uint64{}
- for x := b.Min.X; x < b.Max.X; x++ {
- oldx, oldy, oldxy = 0, 0, 0
- if x > 0 {
- oldx = newrow[x-1]
- }
- if y > 0 {
- oldy = integral[y-1][x]
- }
- if x > 0 && y > 0 {
- oldxy = integral[y-1][x-1]
- }
- pixel := uint64(img.GrayAt(x, y).Y)
- i := pixel * pixel + oldx + oldy - oldxy
- newrow = append(newrow, i)
- }
- integral = append(integral, newrow)
- }
- return integral
-}
-
-// ToAllIntegralImg creates a WithSq containing a regular and
-// squared Integral Image
-func ToAllIntegralImg(img *image.Gray) WithSq {
- var s WithSq
- s.Img = ToIntegralImg(img)
- s.Sq = ToSqIntegralImg(img)
- return s
-}
-
-
-// GetWindow gets the values of the corners of a square part of an
-// Integral Image, plus the dimensions of the part, which can
-// be used to quickly calculate the mean of the area
-func (i I) GetWindow(x, y, size int) Window {
- step := size / 2
-
- minx, miny := 0, 0
- maxy := len(i)-1
- maxx := len(i[0])-1
-
- if y > (step+1) {
- miny = y - step - 1
- }
- if x > (step+1) {
- minx = x - step - 1
- }
-
- if maxy > (y + step) {
- maxy = y + step
- }
- if maxx > (x + step) {
- maxx = x + step
- }
-
- return Window { i[miny][minx], i[miny][maxx], i[maxy][minx], i[maxy][maxx], maxx-minx, maxy-miny}
-}
-
-// GetVerticalWindow gets the values of the corners of a vertical
-// slice of an Integral Image, starting at x
-func (i I) GetVerticalWindow(x, width int) Window {
- maxy := len(i) - 1
- maxx := x + width
- if maxx > len(i[0])-1 {
- maxx = len(i[0]) - 1
- }
-
- return Window { i[0][x], i[0][maxx], i[maxy][x], i[maxy][maxx], width, maxy }
-}
-
-// Sum returns the sum of all pixels in a Window
-func (w Window) Sum() uint64 {
- return w.bottomright + w.topleft - w.topright - w.bottomleft
-}
-
-// Size returns the total size of a Window
-func (w Window) Size() int {
- return w.width * w.height
-}
-
-// Mean returns the average value of pixels in a Window
-func (w Window) Mean() float64 {
- return float64(w.Sum()) / float64(w.Size())
-}
-
-// Proportion returns the proportion of pixels which are on
-func (w Window) Proportion() float64 {
- area := w.width * w.height
- // divide by 255 as each on pixel has the value of 255
- sum := float64(w.Sum()) / 255
- return float64(area) / sum - 1
-}
-
-// MeanWindow calculates the mean value of a section of an Integral
-// Image
-func (i I) MeanWindow(x, y, size int) float64 {
- return i.GetWindow(x, y, size).Mean()
-}
-
-// MeanStdDevWindow calculates the mean and standard deviation of
-// a section on an Integral Image
-func (i WithSq) MeanStdDevWindow(x, y, size int) (float64, float64) {
- imean := i.Img.GetWindow(x, y, size).Mean()
- smean := i.Sq.GetWindow(x, y, size).Mean()
-
- variance := smean - (imean * imean)
-
- return imean, math.Sqrt(variance)
-}
diff --git a/pgconf/main.go b/pgconf/main.go
index bc09c23..dbc6af8 100644
--- a/pgconf/main.go
+++ b/pgconf/main.go
@@ -6,7 +6,7 @@ import (
"log"
"os"
- "rescribe.xyz/go.git/lib/hocr"
+ "rescribe.xyz/utils/pkg/hocr"
)
func main() {
diff --git a/lib/hocr/hocr.go b/pkg/hocr/hocr.go
index dcd0494..dcd0494 100644
--- a/lib/hocr/hocr.go
+++ b/pkg/hocr/hocr.go
diff --git a/lib/hocr/lines.go b/pkg/hocr/lines.go
index 74e8f9a..e90b0a8 100644
--- a/lib/hocr/lines.go
+++ b/pkg/hocr/lines.go
@@ -12,7 +12,7 @@ import (
"path/filepath"
"strings"
- "rescribe.xyz/go.git/lib/line"
+ "rescribe.xyz/utils/pkg/line"
)
func getLineText(l OcrLine) (string) {
diff --git a/lib/line/line.go b/pkg/line/line.go
index d4e3e44..d4e3e44 100644
--- a/lib/line/line.go
+++ b/pkg/line/line.go
diff --git a/lib/prob/prob.go b/pkg/prob/prob.go
index 31a484d..8bdb3d5 100644
--- a/lib/prob/prob.go
+++ b/pkg/prob/prob.go
@@ -6,7 +6,7 @@ import (
"strconv"
"strings"
- "rescribe.xyz/go.git/lib/line"
+ "rescribe.xyz/utils/pkg/line"
)
func getLineAvg(f string) (float64, error) {
diff --git a/preproc/cmd/binarize/main.go b/preproc/cmd/binarize/main.go
deleted file mode 100644
index e7f677e..0000000
--- a/preproc/cmd/binarize/main.go
+++ /dev/null
@@ -1,78 +0,0 @@
-package main
-
-import (
- "flag"
- "fmt"
- "image"
- "image/draw"
- _ "image/jpeg"
- "image/png"
- "log"
- "os"
-
- "rescribe.xyz/go.git/preproc"
-)
-
-// TODO: do more testing to see how good this assumption is
-func autowsize(bounds image.Rectangle) int {
- return bounds.Dx() / 60
-}
-
-func main() {
- flag.Usage = func() {
- fmt.Fprintf(os.Stderr, "Usage: binarize [-k num] [-t type] [-w num] inimg outimg\n")
- flag.PrintDefaults()
- }
- wsize := flag.Int("w", 0, "Window size for sauvola algorithm. Set automatically based on resolution if not set.")
- ksize := flag.Float64("k", 0.5, "K for sauvola algorithm. This controls the overall threshold level. Set it lower for very light text (try 0.1 or 0.2).")
- btype := flag.String("t", "binary", "Type of threshold. binary or zeroinv are currently implemented.")
- flag.Parse()
- if flag.NArg() < 2 {
- flag.Usage()
- os.Exit(1)
- }
-
- f, err := os.Open(flag.Arg(0))
- defer f.Close()
- if err != nil {
- log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
- }
- img, _, err := image.Decode(f)
- if err != nil {
- log.Fatalf("Could not decode image: %v\n", err)
- }
- b := img.Bounds()
- gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
- draw.Draw(gray, b, img, b.Min, draw.Src)
-
- if *wsize == 0 {
- *wsize = autowsize(b)
- log.Printf("Set window size to %d\n", *wsize)
- }
-
- if *wsize%2 == 0 {
- *wsize++
- }
-
- // TODO: come up with a way to set a good ksize automatically
-
- var thresh image.Image
- thresh = preproc.IntegralSauvola(gray, *ksize, *wsize)
-
- if *btype == "zeroinv" {
- thresh, err = preproc.BinToZeroInv(thresh.(*image.Gray), img.(*image.RGBA))
- if err != nil {
- log.Fatal(err)
- }
- }
-
- f, err = os.Create(flag.Arg(1))
- if err != nil {
- log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err)
- }
- defer f.Close()
- err = png.Encode(f, thresh)
- if err != nil {
- log.Fatalf("Could not encode image: %v\n", err)
- }
-}
diff --git a/preproc/cmd/preproc/main.go b/preproc/cmd/preproc/main.go
deleted file mode 100644
index 1c248e0..0000000
--- a/preproc/cmd/preproc/main.go
+++ /dev/null
@@ -1,90 +0,0 @@
-package main
-
-// TODO: come up with a way to set a good ksize automatically
-
-import (
- "flag"
- "fmt"
- "image"
- "image/draw"
- _ "image/jpeg"
- "image/png"
- "log"
- "os"
-
- "rescribe.xyz/go.git/preproc"
-)
-
-// TODO: do more testing to see how good this assumption is
-func autowsize(bounds image.Rectangle) int {
- return bounds.Dx() / 60
-}
-
-func main() {
- flag.Usage = func() {
- fmt.Fprintf(os.Stderr, "Usage: preproc [-bt bintype] [-bw winsize] [-k num] [-m minperc] [-nowipe] [-wt wipethresh] [-ws wipesize] inimg outimg\n")
- fmt.Fprintf(os.Stderr, "Binarize and preprocess an image\n")
- flag.PrintDefaults()
- }
- binwsize := flag.Int("bw", 0, "Window size for sauvola binarization algorithm. Set automatically based on resolution if not set.")
- ksize := flag.Float64("k", 0.5, "K for sauvola binarization algorithm. This controls the overall threshold level. Set it lower for very light text (try 0.1 or 0.2).")
- btype := flag.String("bt", "binary", "Type of binarization threshold. binary or zeroinv are currently implemented.")
- min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.")
- nowipe := flag.Bool("nowipe", false, "Disable wiping completely.")
- wipewsize := flag.Int("ws", 5, "Window size for wiping algorithm.")
- thresh := flag.Float64("wt", 0.05, "Threshold for the wiping algorithm to determine the proportion of black pixels below which a window is determined to be the edge.")
- flag.Parse()
- if flag.NArg() < 2 {
- flag.Usage()
- os.Exit(1)
- }
-
- f, err := os.Open(flag.Arg(0))
- defer f.Close()
- if err != nil {
- log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
- }
- img, _, err := image.Decode(f)
- if err != nil {
- log.Fatalf("Could not decode image: %v\n", err)
- }
- b := img.Bounds()
- gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
- draw.Draw(gray, b, img, b.Min, draw.Src)
-
- if *binwsize == 0 {
- *binwsize = autowsize(b)
- }
-
- if *binwsize%2 == 0 {
- *binwsize++
- }
-
- log.Print("Binarising")
- var clean, threshimg image.Image
- threshimg = preproc.IntegralSauvola(gray, *ksize, *binwsize)
-
- if *btype == "zeroinv" {
- threshimg, err = preproc.BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA))
- if err != nil {
- log.Fatal(err)
- }
- }
-
- if !*nowipe {
- log.Print("Wiping sides")
- clean = preproc.Wipe(threshimg.(*image.Gray), *wipewsize, *thresh, *min)
- } else {
- clean = threshimg
- }
-
- f, err = os.Create(flag.Arg(1))
- if err != nil {
- log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err)
- }
- defer f.Close()
- err = png.Encode(f, clean)
- if err != nil {
- log.Fatalf("Could not encode image: %v\n", err)
- }
-}
diff --git a/preproc/cmd/preprocmulti/main.go b/preproc/cmd/preprocmulti/main.go
deleted file mode 100644
index c6c9fe4..0000000
--- a/preproc/cmd/preprocmulti/main.go
+++ /dev/null
@@ -1,101 +0,0 @@
-package main
-
-// TODO: come up with a way to set a good ksize automatically
-
-import (
- "flag"
- "fmt"
- "image"
- "image/draw"
- _ "image/jpeg"
- "image/png"
- "log"
- "os"
-
- "rescribe.xyz/go.git/integralimg"
- "rescribe.xyz/go.git/preproc"
-)
-
-// TODO: do more testing to see how good this assumption is
-func autowsize(bounds image.Rectangle) int {
- return bounds.Dx() / 60
-}
-
-func main() {
- ksizes := []float64{0.1, 0.2, 0.4, 0.5}
-
- flag.Usage = func() {
- fmt.Fprintf(os.Stderr, "Usage: preprocmulti [-bt bintype] [-bw winsize] [-m minperc] [-nowipe] [-ws wipesize] inimg outbase\n")
- fmt.Fprintf(os.Stderr, "Binarize and preprocess an image, with multiple binarisation levels,\n")
- fmt.Fprintf(os.Stderr, "saving images to outbase_bin{k}.png.\n")
- fmt.Fprintf(os.Stderr, "Binarises with these levels for k: %v.\n", ksizes)
- flag.PrintDefaults()
- }
- binwsize := flag.Int("bw", 0, "Window size for sauvola binarization algorithm. Set automatically based on resolution if not set.")
- btype := flag.String("bt", "binary", "Type of binarization threshold. binary or zeroinv are currently implemented.")
- min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.")
- nowipe := flag.Bool("nowipe", false, "Disable wiping completely.")
- wipewsize := flag.Int("ws", 5, "Window size for wiping algorithm.")
- flag.Parse()
- if flag.NArg() < 2 {
- flag.Usage()
- os.Exit(1)
- }
-
- log.Printf("Opening %s\n", flag.Arg(0))
- f, err := os.Open(flag.Arg(0))
- defer f.Close()
- if err != nil {
- log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
- }
- img, _, err := image.Decode(f)
- if err != nil {
- log.Fatalf("Could not decode image: %v\n", err)
- }
- b := img.Bounds()
- gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
- draw.Draw(gray, b, img, b.Min, draw.Src)
-
- if *binwsize == 0 {
- *binwsize = autowsize(b)
- }
-
- if *binwsize%2 == 0 {
- *binwsize++
- }
-
- var clean, threshimg image.Image
- log.Print("Precalculating integral images")
- integrals := integralimg.ToAllIntegralImg(gray)
-
- for _, k := range ksizes {
- log.Print("Binarising")
- threshimg = preproc.PreCalcedSauvola(integrals, gray, k, *binwsize)
-
- if *btype == "zeroinv" {
- threshimg, err = preproc.BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA))
- if err != nil {
- log.Fatal(err)
- }
- }
-
- if !*nowipe {
- log.Print("Wiping sides")
- clean = preproc.Wipe(threshimg.(*image.Gray), *wipewsize, k*0.02, *min)
- } else {
- clean = threshimg
- }
-
- savefn := fmt.Sprintf("%s_bin%0.1f.png", flag.Arg(1), k)
- log.Printf("Saving %s\n", savefn)
- f, err = os.Create(savefn)
- if err != nil {
- log.Fatalf("Could not create file %s: %v\n", savefn, err)
- }
- defer f.Close()
- err = png.Encode(f, clean)
- if err != nil {
- log.Fatalf("Could not encode image: %v\n", err)
- }
- }
-}
diff --git a/preproc/cmd/wipe/main.go b/preproc/cmd/wipe/main.go
deleted file mode 100644
index e5c039d..0000000
--- a/preproc/cmd/wipe/main.go
+++ /dev/null
@@ -1,55 +0,0 @@
-package main
-
-import (
- "flag"
- "fmt"
- "image"
- "image/draw"
- _ "image/jpeg"
- "image/png"
- "log"
- "os"
-
- "rescribe.xyz/go.git/preproc"
-)
-
-func main() {
- flag.Usage = func() {
- fmt.Fprintf(os.Stderr, "Usage: wipe [-m minperc] [-t thresh] [-w winsize] inimg outimg\n")
- fmt.Fprintf(os.Stderr, "Wipes the sections of an image which are outside the content area.\n")
- flag.PrintDefaults()
- }
- min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.")
- thresh := flag.Float64("t", 0.05, "Threshold for the proportion of black pixels below which a window is determined to be the edge. Higher means more aggressive wiping.")
- wsize := flag.Int("w", 5, "Window size for mask finding algorithm.")
- flag.Parse()
- if flag.NArg() < 2 {
- flag.Usage()
- os.Exit(1)
- }
-
- f, err := os.Open(flag.Arg(0))
- defer f.Close()
- if err != nil {
- log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err)
- }
- img, _, err := image.Decode(f)
- if err != nil {
- log.Fatalf("Could not decode image: %v\n", err)
- }
- b := img.Bounds()
- gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
- draw.Draw(gray, b, img, b.Min, draw.Src)
-
- clean := preproc.Wipe(gray, *wsize, *thresh, *min)
-
- f, err = os.Create(flag.Arg(1))
- if err != nil {
- log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err)
- }
- defer f.Close()
- err = png.Encode(f, clean)
- if err != nil {
- log.Fatalf("Could not encode image: %v\n", err)
- }
-}
diff --git a/preproc/preprocmulti.go b/preproc/preprocmulti.go
deleted file mode 100644
index 2e7cb06..0000000
--- a/preproc/preprocmulti.go
+++ /dev/null
@@ -1,94 +0,0 @@
-package preproc
-
-// TODO: come up with a way to set a good ksize automatically
-
-import (
- "fmt"
- "image"
- "image/draw"
- _ "image/jpeg"
- "image/png"
- "os"
- "strings"
-
- "rescribe.xyz/go.git/integralimg"
-)
-
-// autowsize picks a binarization window size from the image dimensions:
-// one sixtieth of the width of bounds, using integer division.
-// TODO: do more testing to see how good this assumption is
-func autowsize(bounds image.Rectangle) int {
-	width := bounds.Dx()
-	return width / 60
-}
-
-// PreProcMulti binarizes and preprocesses an image with multiple binarisation levels.
-// One PNG is written per entry of ksizes, named "<outBase>_bin<k>.png", where
-// outBase is inPath with its final extension removed (dots in directory names
-// are preserved). The paths written so far are returned, also on error.
-// inPath: Path of input image.
-// ksizes: Slice of k values to pass to Sauvola algorithm
-// binType: Type of binarization threshold. binary or zeroinv are currently implemented.
-// binWsize: Window size for sauvola binarization algorithm. Set automatically based on resolution if 0.
-// wipe: Whether to wipe (clear sides) the image
-// wipeWsize: Window size for wiping algorithm
-// wipeMinWidthPerc: Minimum percentage of the image width for the content width calculation to be considered valid
-// Note: copied from cmd/preprocmulti/main.go, should think about the best way
-// to organise this code later.
-// TODO: return errors that encapsulate the err describing where it was encountered
-// TODO: do the post-integral image stuff in separate goroutines for speed
-func PreProcMulti(inPath string, ksizes []float64, binType string, binWsize int, wipe bool, wipeWsize int, wipeMinWidthPerc int) ([]string, error) {
-	// Make outBase inPath up to the final "." of the last path element.
-	// A leading dot (hidden file) or a dot in a directory name is not
-	// treated as an extension separator.
-	outBase := inPath
-	if dot := strings.LastIndex(inPath, "."); dot > strings.LastIndexAny(inPath, `/\`)+1 {
-		outBase = inPath[:dot]
-	}
-
-	var donePaths []string
-
-	f, err := os.Open(inPath)
-	if err != nil {
-		return donePaths, err
-	}
-	defer f.Close()
-	img, _, err := image.Decode(f)
-	if err != nil {
-		return donePaths, err
-	}
-	b := img.Bounds()
-	gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
-	// gray has a zero origin, so draw into its own bounds rather than b.
-	draw.Draw(gray, gray.Bounds(), img, b.Min, draw.Src)
-
-	if binWsize == 0 {
-		binWsize = autowsize(b)
-	}
-
-	// The window size is forced to be odd.
-	if binWsize%2 == 0 {
-		binWsize++
-	}
-
-	// zeroinv needs the input as RGBA with the same bounds as gray. Decoded
-	// images are frequently another concrete type, so convert when needed
-	// rather than asserting the type and panicking.
-	var colour *image.RGBA
-	if binType == "zeroinv" {
-		if rgba, ok := img.(*image.RGBA); ok && rgba.Bounds().Eq(gray.Bounds()) {
-			colour = rgba
-		} else {
-			colour = image.NewRGBA(gray.Bounds())
-			draw.Draw(colour, colour.Bounds(), img, b.Min, draw.Src)
-		}
-	}
-
-	integrals := integralimg.ToAllIntegralImg(gray)
-
-	for _, k := range ksizes {
-		bin := PreCalcedSauvola(integrals, gray, k, binWsize)
-
-		// Wipe while the image is still grayscale: Wipe only accepts
-		// *image.Gray, and the result of BinToZeroInv is RGBA.
-		if wipe {
-			bin = Wipe(bin, wipeWsize, k*0.02, wipeMinWidthPerc)
-		}
-
-		var clean image.Image = bin
-		if binType == "zeroinv" {
-			clean, err = BinToZeroInv(bin, colour)
-			if err != nil {
-				return donePaths, err
-			}
-		}
-
-		savefn := fmt.Sprintf("%s_bin%0.1f.png", outBase, k)
-		if err := writePNGFile(savefn, clean); err != nil {
-			return donePaths, err
-		}
-		donePaths = append(donePaths, savefn)
-	}
-	return donePaths, nil
-}
-
-// writePNGFile encodes img as a PNG at path. The file is closed before
-// returning, and a failure to close is reported as an error.
-func writePNGFile(path string, img image.Image) error {
-	f, err := os.Create(path)
-	if err != nil {
-		return err
-	}
-	if err := png.Encode(f, img); err != nil {
-		f.Close()
-		return err
-	}
-	return f.Close()
-}
diff --git a/preproc/sauvola.go b/preproc/sauvola.go
deleted file mode 100644
index 046bb7d..0000000
--- a/preproc/sauvola.go
+++ /dev/null
@@ -1,76 +0,0 @@
-package preproc
-
-import (
- "image"
- "image/color"
-
- "rescribe.xyz/go.git/integralimg"
-)
-
-// Sauvola binarizes img using Sauvola's algorithm for text binarization, see
-// paper "Adaptive document image binarization" (2000).
-// For each pixel the mean m and standard deviation dev of the surrounding
-// window of windowsize are computed, giving the threshold
-// m * (1 + ksize*((dev/128)-1)). Pixels darker than the threshold (truncated
-// to uint8) become 0, all others 255.
-func Sauvola(img *image.Gray, ksize float64, windowsize int) *image.Gray {
-	bounds := img.Bounds()
-	out := image.NewGray(bounds)
-
-	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
-		for x := bounds.Min.X; x < bounds.Max.X; x++ {
-			avg, sd := meanstddev(surrounding(img, x, y, windowsize))
-			threshold := avg * (1 + ksize*((sd/128)-1))
-
-			px := color.Gray{255}
-			if img.GrayAt(x, y).Y < uint8(threshold) {
-				px = color.Gray{0}
-			}
-			out.SetGray(x, y, px)
-		}
-	}
-
-	return out
-}
-
-// IntegralSauvola implements Sauvola's algorithm using Integral Images, see paper
-// "Efficient Implementation of Local Adaptive Thresholding Techniques Using Integral Images"
-// and
-// https://stackoverflow.com/questions/13110733/computing-image-integral
-// It builds the integral images for img and hands them to PreCalcedSauvola,
-// which contains the single copy of the thresholding loop.
-func IntegralSauvola(img *image.Gray, ksize float64, windowsize int) *image.Gray {
-	return PreCalcedSauvola(integralimg.ToAllIntegralImg(img), img, ksize, windowsize)
-}
-
-// PreCalcedSauvola Implements Sauvola's algorithm using precalculated Integral Images.
-// integrals supplies the windowed mean m and standard deviation dev for each
-// pixel of img; the threshold is m * (1 + ksize*((dev/128)-1)). Pixels darker
-// than the threshold (truncated to uint8) become 0, all others 255.
-// TODO: have this be the root function that the other two reference
-func PreCalcedSauvola(integrals integralimg.WithSq, img *image.Gray, ksize float64, windowsize int) *image.Gray {
-	bounds := img.Bounds()
-	out := image.NewGray(bounds)
-
-	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
-		for x := bounds.Min.X; x < bounds.Max.X; x++ {
-			avg, sd := integrals.MeanStdDevWindow(x, y, windowsize)
-			threshold := avg * (1 + ksize*((sd/128)-1))
-
-			px := color.Gray{255}
-			if img.GrayAt(x, y).Y < uint8(threshold) {
-				px = color.Gray{0}
-			}
-			out.SetGray(x, y, px)
-		}
-	}
-
-	return out
-}
diff --git a/preproc/sauvola_test.go b/preproc/sauvola_test.go
deleted file mode 100644
index 2331e10..0000000
--- a/preproc/sauvola_test.go
+++ /dev/null
@@ -1,70 +0,0 @@
-package preproc
-
-import (
- "flag"
- "fmt"
- "image"
- "image/png"
- "os"
- "testing"
-)
-
-// The flags are registered at package level so that they exist before the
-// test binary parses its command line; flags registered inside a test
-// function are defined too late to be passed, and would be registered
-// again if the test ran more than once.
-var (
-	slowSauvola   = flag.Bool("slow", false, "include slow tests")
-	updateSauvola = flag.Bool("updatesauvola", false, "update golden files")
-)
-
-// TestBinarization binarizes testdata pages with IntegralSauvola and, when
-// -slow is given, Sauvola, and compares each result with its golden file.
-// With -updatesauvola the golden files are rewritten from the current output
-// before the comparison.
-func TestBinarization(t *testing.T) {
-	cases := []struct {
-		name   string
-		orig   string
-		golden string
-		ksize  float64
-		wsize  int
-	}{
-		{"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.5_w41.png", 0.5, 41},
-		{"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.5_w19.png", 0.5, 19},
-		{"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.3_w19.png", 0.3, 19},
-		{"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.5_w41.png", 0.5, 41},
-		{"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.5_w19.png", 0.5, 19},
-		{"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.3_w19.png", 0.3, 19},
-	}
-
-	for _, c := range cases {
-		t.Run(fmt.Sprintf("%s_%0.1f_%d", c.name, c.ksize, c.wsize), func(t *testing.T) {
-			var actual *image.Gray
-			orig, err := decode(c.orig)
-			if err != nil {
-				t.Fatalf("Could not open file %s: %v\n", c.orig, err)
-			}
-			switch c.name {
-			case "integralsauvola":
-				actual = IntegralSauvola(orig, c.ksize, c.wsize)
-			case "sauvola":
-				if !*slowSauvola {
-					t.Skip("Skipping slow test; use -slow to run it.\n")
-				}
-				actual = Sauvola(orig, c.ksize, c.wsize)
-			default:
-				t.Fatalf("No method %s\n", c.name)
-			}
-			if *updateSauvola {
-				f, err := os.Create(c.golden)
-				if err != nil {
-					t.Fatalf("Could not open file %s to update: %v\n", c.golden, err)
-				}
-				err = png.Encode(f, actual)
-				// Close before the golden file is read back below, and
-				// surface a close failure as an encode failure.
-				if cerr := f.Close(); err == nil {
-					err = cerr
-				}
-				if err != nil {
-					t.Fatalf("Could not encode update of %s: %v\n", c.golden, err)
-				}
-			}
-			golden, err := decode(c.golden)
-			if err != nil {
-				t.Fatalf("Could not open file %s: %v\n", c.golden, err)
-			}
-			if !imgsequal(golden, actual) {
-				t.Errorf("Binarized %s differs to %s\n", c.orig, c.golden)
-			}
-		})
-	}
-}
diff --git a/preproc/test_helpers.go b/preproc/test_helpers.go
deleted file mode 100644
index 20de5b1..0000000
--- a/preproc/test_helpers.go
+++ /dev/null
@@ -1,53 +0,0 @@
-package preproc
-
-// TODO: add different pages as test cases
-// TODO: test non integral img version
-
-import (
- "image"
- "image/draw"
- "image/png"
- "os"
-)
-
-// decode reads the PNG file at path s and returns its contents converted to
-// a zero-origin grayscale image. It returns a nil image and the error if the
-// file cannot be opened or decoded.
-func decode(s string) (*image.Gray, error) {
-	f, err := os.Open(s)
-	if err != nil {
-		return nil, err
-	}
-	// Register the close only once the open is known to have succeeded.
-	defer f.Close()
-	img, err := png.Decode(f)
-	if err != nil {
-		return nil, err
-	}
-	b := img.Bounds()
-	gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
-	// gray has a zero origin, so draw into its own bounds rather than b.
-	draw.Draw(gray, gray.Bounds(), img, b.Min, draw.Src)
-	return gray, nil
-}
-
-// imgsequal reports whether img1 and img2 have equal bounds and the same
-// RGBA value at every pixel within those bounds.
-func imgsequal(img1 *image.Gray, img2 *image.Gray) bool {
-	bounds := img1.Bounds()
-	if !bounds.Eq(img2.Bounds()) {
-		return false
-	}
-	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
-		for x := bounds.Min.X; x < bounds.Max.X; x++ {
-			r1, g1, b1, a1 := img1.At(x, y).RGBA()
-			r2, g2, b2, a2 := img2.At(x, y).RGBA()
-			if r1 != r2 || g1 != g2 || b1 != b2 || a1 != a2 {
-				return false
-			}
-		}
-	}
-	return true
-}
diff --git a/preproc/testdata/pg1.png b/preproc/testdata/pg1.png
deleted file mode 100644
index 2bcc4b1..0000000
--- a/preproc/testdata/pg1.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg1_integralsauvola_k0.3_w19.png b/preproc/testdata/pg1_integralsauvola_k0.3_w19.png
deleted file mode 100644
index bdf5712..0000000
--- a/preproc/testdata/pg1_integralsauvola_k0.3_w19.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg1_integralsauvola_k0.5_w19.png b/preproc/testdata/pg1_integralsauvola_k0.5_w19.png
deleted file mode 100644
index 5db2d9a..0000000
--- a/preproc/testdata/pg1_integralsauvola_k0.5_w19.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg1_integralsauvola_k0.5_w41.png b/preproc/testdata/pg1_integralsauvola_k0.5_w41.png
deleted file mode 100644
index 050d037..0000000
--- a/preproc/testdata/pg1_integralsauvola_k0.5_w41.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg1_sauvola_k0.3_w19.png b/preproc/testdata/pg1_sauvola_k0.3_w19.png
deleted file mode 100644
index bcd595f..0000000
--- a/preproc/testdata/pg1_sauvola_k0.3_w19.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg1_sauvola_k0.5_w19.png b/preproc/testdata/pg1_sauvola_k0.5_w19.png
deleted file mode 100644
index 8de596c..0000000
--- a/preproc/testdata/pg1_sauvola_k0.5_w19.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg1_sauvola_k0.5_w41.png b/preproc/testdata/pg1_sauvola_k0.5_w41.png
deleted file mode 100644
index b8f50e0..0000000
--- a/preproc/testdata/pg1_sauvola_k0.5_w41.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg2.png b/preproc/testdata/pg2.png
deleted file mode 100644
index c7c4249..0000000
--- a/preproc/testdata/pg2.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg2_integralwipesides_t0.02_w5.png b/preproc/testdata/pg2_integralwipesides_t0.02_w5.png
deleted file mode 100644
index 6b4ccb2..0000000
--- a/preproc/testdata/pg2_integralwipesides_t0.02_w5.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg2_integralwipesides_t0.05_w25.png b/preproc/testdata/pg2_integralwipesides_t0.05_w25.png
deleted file mode 100644
index 39dc88d..0000000
--- a/preproc/testdata/pg2_integralwipesides_t0.05_w25.png
+++ /dev/null
Binary files differ
diff --git a/preproc/testdata/pg2_integralwipesides_t0.05_w5.png b/preproc/testdata/pg2_integralwipesides_t0.05_w5.png
deleted file mode 100644
index 50df855..0000000
--- a/preproc/testdata/pg2_integralwipesides_t0.05_w5.png
+++ /dev/null
Binary files differ
diff --git a/preproc/util.go b/preproc/util.go
deleted file mode 100644
index e23829d..0000000
--- a/preproc/util.go
+++ /dev/null
@@ -1,95 +0,0 @@
-package preproc
-
-import (
- "errors"
- "image"
- "math"
-)
-
-// UsefulImg declares the methods MeanWindow and MeanStdDevWindow. As
-// declared here both take no arguments and return nothing, so the interface
-// is a placeholder for the generic image abstraction described in the TODOs.
-//
-// TODO: name better; maybe verb, x-er
-// TODO: implement these for regular image, and use them to make
-// image functions generic for integral and non- images
-type UsefulImg interface {
- MeanWindow()
- MeanStdDevWindow()
-}
-
-// mean returns the arithmetic mean of the values in i. The sum is
-// accumulated as an int and divided by len(i) in floating point, so an
-// empty slice yields NaN.
-func mean(i []int) float64 {
-	var total int
-	for idx := range i {
-		total += i[idx]
-	}
-	count := float64(len(i))
-	return float64(total) / count
-}
-
-// stddev returns the sample standard deviation of i: the square root of the
-// sum of squared deviations from the mean divided by len(i)-1. The
-// calculation is the one performed by meanstddev, whose mean is discarded.
-func stddev(i []int) float64 {
-	_, dev := meanstddev(i)
-	return dev
-}
-
-// meanstddev returns the mean of i together with its sample standard
-// deviation, i.e. the square root of the sum of squared deviations from the
-// mean divided by len(i)-1.
-func meanstddev(i []int) (float64, float64) {
-	avg := mean(i)
-
-	var sqsum float64
-	for _, n := range i {
-		diff := float64(n) - avg
-		sqsum += diff * diff
-	}
-	return avg, math.Sqrt(sqsum / float64(len(i)-1))
-}
-
-// surrounding gets the pixel values surrounding a point in the image.
-// The window spans size/2 pixels either side of (x, y) and is clipped to the
-// image bounds, so only pixels that exist in img are returned. Values are
-// returned row by row. A window that lies entirely outside the image yields
-// nil.
-//
-// Note that windows touching the right or bottom border previously also
-// sampled one column/row beyond the image (read as 0); results derived from
-// those windows, such as Sauvola output near the borders, change with this
-// fix.
-func surrounding(img *image.Gray, x int, y int, size int) []int {
-	b := img.Bounds()
-	step := size / 2
-
-	miny := y - step
-	if miny < b.Min.Y {
-		miny = b.Min.Y
-	}
-	minx := x - step
-	if minx < b.Min.X {
-		minx = b.Min.X
-	}
-	// Bounds().Max is exclusive, so the last valid coordinate is Max-1.
-	maxy := y + step
-	if maxy > b.Max.Y-1 {
-		maxy = b.Max.Y - 1
-	}
-	maxx := x + step
-	if maxx > b.Max.X-1 {
-		maxx = b.Max.X - 1
-	}
-	if maxx < minx || maxy < miny {
-		return nil
-	}
-
-	s := make([]int, 0, (maxx-minx+1)*(maxy-miny+1))
-	for yi := miny; yi <= maxy; yi++ {
-		for xi := minx; xi <= maxx; xi++ {
-			s = append(s, int(img.GrayAt(xi, yi).Y))
-		}
-	}
-	return s
-}
-
-// BinToZeroInv combines a binarized image with its original: wherever bin is
-// white (255) the output pixel is white, everywhere else the output pixel is
-// copied from orig. The returned image has the same bounds as bin.
-// If bin and orig do not have identical bounds, orig is returned unchanged
-// together with an error.
-func BinToZeroInv(bin *image.Gray, orig *image.RGBA) (*image.RGBA, error) {
-	b := bin.Bounds()
-	if !b.Eq(orig.Bounds()) {
-		return orig, errors.New("bin and orig images need to be the same dimensions")
-	}
-	// Allocate with bin's bounds: the loops below address pixels in b's
-	// coordinates, and writes outside an image's bounds are dropped.
-	newimg := image.NewRGBA(b)
-	for y := b.Min.Y; y < b.Max.Y; y++ {
-		for x := b.Min.X; x < b.Max.X; x++ {
-			if g := bin.GrayAt(x, y); g.Y == 255 {
-				newimg.Set(x, y, g)
-			} else {
-				newimg.SetRGBA(x, y, orig.RGBAAt(x, y))
-			}
-		}
-	}
-
-	return newimg, nil
-}
diff --git a/preproc/wipesides.go b/preproc/wipesides.go
deleted file mode 100644
index 3d08053..0000000
--- a/preproc/wipesides.go
+++ /dev/null
@@ -1,160 +0,0 @@
-package preproc
-
-// TODO: add minimum size variable (default ~30%?)
-// TODO: switch to an interface rather than integralimg.I
-
-import (
- "errors"
- "fmt"
- "image"
- "image/color"
- "image/draw"
- _ "image/jpeg"
- "image/png"
- "os"
-
- "rescribe.xyz/go.git/integralimg"
-)
-
-// proportion returns the proportion of the given window that is black
-// pixels: the Proportion of the vertical window of i selected by x and size.
-func proportion(i integralimg.I, x int, size int) float64 {
-	return i.GetVerticalWindow(x, size).Proportion()
-}
-
-// findbestedge goes through every vertical line from x to x+w and returns
-// the position of the line whose one pixel wide window has the largest
-// proportion() value; the first such line wins on ties. If no line scores
-// above zero the starting x is returned, so the result always lies within
-// the searched range.
-// NOTE(review): this was originally documented as finding the line with the
-// "lowest proportion of black pixels", while the comparison keeps the
-// largest value - confirm against the meaning of integralimg's Proportion.
-func findbestedge(img integralimg.I, x int, w int) int {
-	if w == 1 {
-		return x
-	}
-
-	// Start from x rather than the zero value, which would otherwise be
-	// returned as the edge when nothing in the window scores above zero.
-	bestx := x
-	var best float64
-
-	right := x + w
-	for ; x < right; x++ {
-		prop := proportion(img, x, 1)
-		if prop > best {
-			best = prop
-			bestx = x
-		}
-	}
-
-	return bestx
-}
-
-// findedges locates the left and right edges of the main content. A window
-// of wsize is moved outwards from near the middle of the image, once to the
-// right and once to the left, and each search stops at the first position
-// whose proportion of black pixels is at or below thresh; findbestedge then
-// picks the edge within that window. A side on which no such position is
-// found keeps its default, 0 on the left and the last column on the right.
-func findedges(img integralimg.I, wsize int, thresh float64) (int, int) {
-	lastx := len(img[0]) - 1
-	left, right := 0, lastx
-
-	// don't start at the middle, as this will fail for 2 column layouts,
-	// start 10% left or right of the middle
-	offset := lastx / 10
-
-	isedge := func(x int) bool {
-		return proportion(img, x, wsize) <= thresh
-	}
-
-	for x := lastx/2 + offset; x < lastx-wsize; x++ {
-		if isedge(x) {
-			right = findbestedge(img, x, wsize)
-			break
-		}
-	}
-
-	for x := lastx/2 - offset; x > 0; x-- {
-		if isedge(x) {
-			left = findbestedge(img, x, wsize)
-			break
-		}
-	}
-
-	return left, right
-}
-
-// wipesides returns a copy of img in which every column outside the range
-// from lowedge (inclusive) to highedge (exclusive) is filled with white;
-// columns inside the range are copied unchanged.
-func wipesides(img *image.Gray, lowedge int, highedge int) *image.Gray {
-	bounds := img.Bounds()
-	out := image.NewGray(bounds)
-	white := color.Gray{255}
-
-	for x := bounds.Min.X; x < bounds.Max.X; x++ {
-		keep := x >= lowedge && x < highedge
-		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
-			if keep {
-				out.SetGray(x, y, img.GrayAt(x, y))
-			} else {
-				out.SetGray(x, y, white)
-			}
-		}
-	}
-
-	return out
-}
-
-// toonarrow reports whether the area between lowedge and highedge is
-// less than min % of the total image width.
-func toonarrow(img *image.Gray, lowedge int, highedge int, min int) bool {
-	bounds := img.Bounds()
-	total := bounds.Max.X - bounds.Min.X
-	kept := highedge - lowedge
-	percent := float64(kept) / float64(total) * 100
-	return percent < float64(min)
-}
-
-// Wipe fills the sections of image which fall outside the content
-// area with white, providing the content area is above min %.
-// The content edges are found on the integral image of img using a window
-// of wsize and the threshold thresh. If the content area is too narrow,
-// img itself is returned unmodified.
-func Wipe(img *image.Gray, wsize int, thresh float64, min int) *image.Gray {
-	lowedge, highedge := findedges(integralimg.ToIntegralImg(img), wsize, thresh)
-	if !toonarrow(img, lowedge, highedge, min) {
-		return wipesides(img, lowedge, highedge)
-	}
-	return img
-}
-
-// WipeFile wipes an image file, filling the sections of the image
-// which fall outside the content area with white, providing the
-// content area is above min %. The result is written as a PNG.
-// inPath: path of the input image.
-// outPath: path to save the output image.
-// wsize: window size for wipe algorithm.
-// thresh: threshold for wipe algorithm.
-// min: minimum % of content area width to consider valid.
-// An error describing the failed step is returned if the input cannot be
-// opened or decoded, or the output cannot be created, encoded or closed.
-func WipeFile(inPath string, outPath string, wsize int, thresh float64, min int) error {
-	in, err := os.Open(inPath)
-	if err != nil {
-		return errors.New(fmt.Sprintf("Could not open file %s: %v", inPath, err))
-	}
-	// Register the close only once the open is known to have succeeded.
-	defer in.Close()
-	img, _, err := image.Decode(in)
-	if err != nil {
-		return errors.New(fmt.Sprintf("Could not decode image: %v", err))
-	}
-	b := img.Bounds()
-	gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy()))
-	// gray has a zero origin, so draw into its own bounds rather than b.
-	draw.Draw(gray, gray.Bounds(), img, b.Min, draw.Src)
-
-	clean := Wipe(gray, wsize, thresh, min)
-
-	out, err := os.Create(outPath)
-	if err != nil {
-		return errors.New(fmt.Sprintf("Could not create file %s: %v", outPath, err))
-	}
-	if err := png.Encode(out, clean); err != nil {
-		out.Close()
-		return errors.New(fmt.Sprintf("Could not encode image: %v", err))
-	}
-	// A failed close can mean the data never reached the disk, so report it.
-	if err := out.Close(); err != nil {
-		return errors.New(fmt.Sprintf("Could not close file %s: %v", outPath, err))
-	}
-	return nil
-}
diff --git a/preproc/wipesides_test.go b/preproc/wipesides_test.go
deleted file mode 100644
index d5464e0..0000000
--- a/preproc/wipesides_test.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package preproc
-
-// TODO: add different pages as test cases
-// TODO: test non integral img version
-
-import (
- "flag"
- "fmt"
- "image"
- "image/png"
- "os"
- "testing"
-)
-
-// updateWipe is registered at package level so that it exists before the
-// test binary parses its command line; a flag registered inside the test
-// function is defined too late to be passed.
-var updateWipe = flag.Bool("updatewipe", false, "update golden files")
-
-// TestWipeSides runs Wipe over a testdata page with several threshold and
-// window size combinations and compares each result with its golden file.
-// With -updatewipe the golden files are rewritten from the current output
-// before the comparison.
-func TestWipeSides(t *testing.T) {
-	cases := []struct {
-		name   string
-		orig   string
-		golden string
-		thresh float64
-		wsize  int
-		// min is the minimum content width percentage passed to Wipe.
-		// 30 is the default of the wipe command's -m flag.
-		// NOTE(review): confirm the golden files correspond to this value.
-		min int
-	}{
-		{"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.02_w5.png", 0.02, 5, 30},
-		{"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.05_w5.png", 0.05, 5, 30},
-		{"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.05_w25.png", 0.05, 25, 30},
-	}
-
-	for _, c := range cases {
-		t.Run(fmt.Sprintf("%s_%0.2f_%d", c.name, c.thresh, c.wsize), func(t *testing.T) {
-			var actual *image.Gray
-			orig, err := decode(c.orig)
-			if err != nil {
-				t.Fatalf("Could not open file %s: %v\n", c.orig, err)
-			}
-			actual = Wipe(orig, c.wsize, c.thresh, c.min)
-			if *updateWipe {
-				f, err := os.Create(c.golden)
-				if err != nil {
-					t.Fatalf("Could not open file %s to update: %v\n", c.golden, err)
-				}
-				err = png.Encode(f, actual)
-				// Close before the golden file is read back below, and
-				// surface a close failure as an encode failure.
-				if cerr := f.Close(); err == nil {
-					err = cerr
-				}
-				if err != nil {
-					t.Fatalf("Could not encode update of %s: %v\n", c.golden, err)
-				}
-			}
-			golden, err := decode(c.golden)
-			if err != nil {
-				t.Fatalf("Could not open file %s: %v\n", c.golden, err)
-			}
-			if !imgsequal(golden, actual) {
-				t.Errorf("Processed %s differs to %s\n", c.orig, c.golden)
-			}
-		})
-	}
-}