From 787d63fc5d13c6250bd33da5a8e1eadbe86188cd Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 8 Oct 2019 15:37:07 +0100 Subject: Continue separating the repository; remove all but preproc, and move integralimg package under it --- avg-lines/html.go | 61 --- avg-lines/main.go | 69 --- bookpipeline/aws.go | 322 -------------- bookpipeline/cmd/bookpipeline/main.go | 488 --------------------- bookpipeline/cmd/booktopipeline/main.go | 140 ------ bookpipeline/cmd/confgraph/main.go | 71 --- bookpipeline/cmd/getpipelinebook/main.go | 122 ------ bookpipeline/cmd/lspipeline/main.go | 250 ----------- bookpipeline/cmd/mkpipeline/main.go | 79 ---- bookpipeline/graph.go | 155 ------- bucket-lines/bucket.go | 131 ------ bucket-lines/main.go | 87 ---- cmd/binarize/main.go | 78 ++++ cmd/preproc/main.go | 90 ++++ cmd/preprocmulti/main.go | 101 +++++ cmd/wipe/main.go | 55 +++ dehyphenate/main.go | 63 --- eeboxmltohocr/main.go | 135 ------ hocrtotxt/main.go | 30 -- lib/hocr/hocr.go | 129 ------ lib/hocr/lines.go | 131 ------ lib/line/line.go | 57 --- lib/prob/prob.go | 69 --- pgconf/main.go | 30 -- preproc/cmd/binarize/main.go | 78 ---- preproc/cmd/preproc/main.go | 90 ---- preproc/cmd/preprocmulti/main.go | 101 ----- preproc/cmd/wipe/main.go | 55 --- preproc/preprocmulti.go | 94 ---- preproc/sauvola.go | 76 ---- preproc/sauvola_test.go | 70 --- preproc/test_helpers.go | 53 --- preproc/testdata/pg1.png | Bin 651071 -> 0 bytes preproc/testdata/pg1_integralsauvola_k0.3_w19.png | Bin 19456 -> 0 bytes preproc/testdata/pg1_integralsauvola_k0.5_w19.png | Bin 18241 -> 0 bytes preproc/testdata/pg1_integralsauvola_k0.5_w41.png | Bin 18260 -> 0 bytes preproc/testdata/pg1_sauvola_k0.3_w19.png | Bin 19447 -> 0 bytes preproc/testdata/pg1_sauvola_k0.5_w19.png | Bin 18231 -> 0 bytes preproc/testdata/pg1_sauvola_k0.5_w41.png | Bin 18275 -> 0 bytes preproc/testdata/pg2.png | Bin 30803 -> 0 bytes .../testdata/pg2_integralwipesides_t0.02_w5.png | Bin 33595 -> 0 bytes .../testdata/pg2_integralwipesides_t0.05_w25.png | Bin 33432 -> 0 bytes .../testdata/pg2_integralwipesides_t0.05_w5.png | Bin 14546 -> 0 bytes preproc/util.go | 95 ---- preproc/wipesides.go | 160 ------- preproc/wipesides_test.go | 57 --- preprocmulti.go | 94 ++++ sauvola.go | 76 ++++ sauvola_test.go | 70 +++ test_helpers.go | 53 +++ testdata/pg1.png | Bin 0 -> 651071 bytes testdata/pg1_integralsauvola_k0.3_w19.png | Bin 0 -> 19456 bytes testdata/pg1_integralsauvola_k0.5_w19.png | Bin 0 -> 18241 bytes testdata/pg1_integralsauvola_k0.5_w41.png | Bin 0 -> 18260 bytes testdata/pg1_sauvola_k0.3_w19.png | Bin 0 -> 19447 bytes testdata/pg1_sauvola_k0.5_w19.png | Bin 0 -> 18231 bytes testdata/pg1_sauvola_k0.5_w41.png | Bin 0 -> 18275 bytes testdata/pg2.png | Bin 0 -> 30803 bytes testdata/pg2_integralwipesides_t0.02_w5.png | Bin 0 -> 33595 bytes testdata/pg2_integralwipesides_t0.05_w25.png | Bin 0 -> 33432 bytes testdata/pg2_integralwipesides_t0.05_w5.png | Bin 0 -> 14546 bytes util.go | 95 ++++ wipesides.go | 160 +++++++ wipesides_test.go | 57 +++ 64 files changed, 929 insertions(+), 3548 deletions(-) delete mode 100644 avg-lines/html.go delete mode 100644 avg-lines/main.go delete mode 100644 bookpipeline/aws.go delete mode 100644 bookpipeline/cmd/bookpipeline/main.go delete mode 100644 bookpipeline/cmd/booktopipeline/main.go delete mode 100644 bookpipeline/cmd/confgraph/main.go delete mode 100644 bookpipeline/cmd/getpipelinebook/main.go delete mode 100644 bookpipeline/cmd/lspipeline/main.go delete mode 100644 bookpipeline/cmd/mkpipeline/main.go delete mode 100644 bookpipeline/graph.go delete mode 100644 bucket-lines/bucket.go delete mode 100644 bucket-lines/main.go create mode 100644 cmd/binarize/main.go create mode 100644 cmd/preproc/main.go create mode 100644 cmd/preprocmulti/main.go create mode 100644 cmd/wipe/main.go delete mode 100644 dehyphenate/main.go delete mode 100644 eeboxmltohocr/main.go delete mode 100644 hocrtotxt/main.go delete mode 100644 lib/hocr/hocr.go delete mode 100644 lib/hocr/lines.go delete mode 100644 lib/line/line.go delete mode 100644 lib/prob/prob.go delete mode 100644 pgconf/main.go delete mode 100644 preproc/cmd/binarize/main.go delete mode 100644 preproc/cmd/preproc/main.go delete mode 100644 preproc/cmd/preprocmulti/main.go delete mode 100644 preproc/cmd/wipe/main.go delete mode 100644 preproc/preprocmulti.go delete mode 100644 preproc/sauvola.go delete mode 100644 preproc/sauvola_test.go delete mode 100644 preproc/test_helpers.go delete mode 100644 preproc/testdata/pg1.png delete mode 100644 preproc/testdata/pg1_integralsauvola_k0.3_w19.png delete mode 100644 preproc/testdata/pg1_integralsauvola_k0.5_w19.png delete mode 100644 preproc/testdata/pg1_integralsauvola_k0.5_w41.png delete mode 100644 preproc/testdata/pg1_sauvola_k0.3_w19.png delete mode 100644 preproc/testdata/pg1_sauvola_k0.5_w19.png delete mode 100644 preproc/testdata/pg1_sauvola_k0.5_w41.png delete mode 100644 preproc/testdata/pg2.png delete mode 100644 preproc/testdata/pg2_integralwipesides_t0.02_w5.png delete mode 100644 preproc/testdata/pg2_integralwipesides_t0.05_w25.png delete mode 100644 preproc/testdata/pg2_integralwipesides_t0.05_w5.png delete mode 100644 preproc/util.go delete mode 100644 preproc/wipesides.go delete mode 100644 preproc/wipesides_test.go create mode 100644 preprocmulti.go create mode 100644 sauvola.go create mode 100644 sauvola_test.go create mode 100644 test_helpers.go create mode 100644 testdata/pg1.png create mode 100644 testdata/pg1_integralsauvola_k0.3_w19.png create mode 100644 testdata/pg1_integralsauvola_k0.5_w19.png create mode 100644 testdata/pg1_integralsauvola_k0.5_w41.png create mode 100644 testdata/pg1_sauvola_k0.3_w19.png create mode 100644 testdata/pg1_sauvola_k0.5_w19.png create mode 100644 testdata/pg1_sauvola_k0.5_w41.png create mode 100644 testdata/pg2.png create mode 100644 testdata/pg2_integralwipesides_t0.02_w5.png create mode 100644 testdata/pg2_integralwipesides_t0.05_w25.png create mode 100644 testdata/pg2_integralwipesides_t0.05_w5.png create mode 100644 util.go create mode 100644 wipesides.go create mode 100644 wipesides_test.go diff --git a/avg-lines/html.go b/avg-lines/html.go deleted file mode 100644 index 443cc4a..0000000 --- a/avg-lines/html.go +++ /dev/null @@ -1,61 +0,0 @@ -package main - -import ( - "fmt" - "os" - "path/filepath" - - "rescribe.xyz/go.git/lib/line" -) - -func copylineimg(fn string, l line.Detail) error { - f, err := os.Create(fn) - if err != nil { - return err - } - defer f.Close() - - return l.Img.CopyLineTo(f) -} - -func htmlout(dir string, lines line.Details) error { - err := os.MkdirAll(dir, 0700) - if err != nil { - return err - } - - fn := filepath.Join(dir, "index.html") - f, err := os.Create(fn) - if err != nil { - return err - } - defer f.Close() - - _, err = fmt.Fprintf(f, ""+ - "\n\n") - if err != nil { - return err - } - for _, l := range lines { - fn = filepath.Base(l.OcrName) + "_" + l.Name + ".png" - err = copylineimg(filepath.Join(dir, fn), l) - if err != nil { - return err - } - _, err = fmt.Fprintf(f, "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n", - l.Avgconf, l.OcrName, l.Name, fn, l.Text) - if err != nil { - return err - } - } - _, err = fmt.Fprintf(f, "

%.4f%%

%s %s
%s
\n\n") - if err != nil { - return err - } - - return nil -} diff --git a/avg-lines/main.go b/avg-lines/main.go deleted file mode 100644 index 14b21bd..0000000 --- a/avg-lines/main.go +++ /dev/null @@ -1,69 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os" - "path/filepath" - "sort" - - "rescribe.xyz/go.git/lib/hocr" - "rescribe.xyz/go.git/lib/line" - "rescribe.xyz/go.git/lib/prob" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: avg-lines [-html dir] [-nosort] [prob1] [hocr1] [prob2] [...]\n") - fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n") - fmt.Fprintf(os.Stderr, "from worst to best.\n") - fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") - fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") - fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") - fmt.Fprintf(os.Stderr, "option.\n\n") - flag.PrintDefaults() - } - var html = flag.String("html", "", "Output in html format to the specified directory") - var nosort = flag.Bool("nosort", false, "Don't sort lines by confidence") - flag.Parse() - if flag.NArg() < 1 { - flag.Usage() - os.Exit(1) - } - - var err error - lines := make(line.Details, 0) - - for _, f := range flag.Args() { - var newlines line.Details - switch ext := filepath.Ext(f); ext { - case ".prob": - newlines, err = prob.GetLineDetails(f) - case ".hocr": - newlines, err = hocr.GetLineDetails(f) - default: - log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) - continue - } - if err != nil { - log.Fatal(err) - } - - for _, l := range newlines { - lines = append(lines, l) - } - } - - if *nosort == false { - sort.Sort(lines) - } - - if *html == "" { - for _, l := range lines { - fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf) - } - } else { - htmlout(*html, lines) - } -} diff --git a/bookpipeline/aws.go b/bookpipeline/aws.go deleted file mode 100644 index 0127d6e..0000000 --- a/bookpipeline/aws.go +++ /dev/null @@ -1,322 +0,0 @@ -package bookpipeline - -import ( - "errors" - "fmt" - "log" - "os" - "time" - - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/awserr" - "github.com/aws/aws-sdk-go/aws/session" - "github.com/aws/aws-sdk-go/service/ec2" - "github.com/aws/aws-sdk-go/service/s3" - "github.com/aws/aws-sdk-go/service/s3/s3manager" - "github.com/aws/aws-sdk-go/service/sqs" -) - -const PreprocPattern = `_bin[0-9].[0-9].png` -const heartbeatRetry = 10 - -type Qmsg struct { - Id, Handle, Body string -} - -type InstanceDetails struct { - Id, Name, Ip, Spot, Type, State, LaunchTime string -} - -type AwsConn struct { - // these need to be set before running Init() - Region string - Logger *log.Logger - - // these are used internally - sess *session.Session - ec2svc *ec2.EC2 - s3svc *s3.S3 - sqssvc *sqs.SQS - downloader *s3manager.Downloader - uploader *s3manager.Uploader - wipequrl, prequrl, ocrqurl, analysequrl string - wipstorageid string -} - -// TODO: split this up, as not everything is needed for different uses -func (a *AwsConn) Init() error { - if a.Region == "" { - return errors.New("No Region set") - } - if a.Logger == nil { - return errors.New("No logger set") - } - - var err error - a.sess, err = session.NewSession(&aws.Config{ - Region: aws.String(a.Region), - }) - if err != nil { - return errors.New(fmt.Sprintf("Failed to set up aws session: %s", err)) - } - a.ec2svc = ec2.New(a.sess) - a.s3svc = s3.New(a.sess) - a.sqssvc = sqs.New(a.sess) - a.downloader = s3manager.NewDownloader(a.sess) - a.uploader = s3manager.NewUploader(a.sess) - - a.Logger.Println("Getting preprocess queue URL") - result, err := a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ - QueueName: aws.String("rescribepreprocess"), - }) - if err != nil { - return errors.New(fmt.Sprintf("Error getting preprocess queue URL: %s", err)) - } - a.prequrl = *result.QueueUrl - - a.Logger.Println("Getting wipeonly queue URL") - result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ - QueueName: aws.String("rescribewipeonly"), - }) - if err != nil { - return errors.New(fmt.Sprintf("Error getting wipeonly queue URL: %s", err)) - } - a.wipequrl = *result.QueueUrl - - a.Logger.Println("Getting OCR queue URL") - result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ - QueueName: aws.String("rescribeocr"), - }) - if err != nil { - return errors.New(fmt.Sprintf("Error getting OCR queue URL: %s", err)) - } - a.ocrqurl = *result.QueueUrl - - a.Logger.Println("Getting analyse queue URL") - result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ - QueueName: aws.String("rescribeanalyse"), - }) - if err != nil { - return errors.New(fmt.Sprintf("Error getting analyse queue URL: %s", err)) - } - a.analysequrl = *result.QueueUrl - - a.wipstorageid = "rescribeinprogress" - - return nil -} - -func (a *AwsConn) CheckQueue(url string, timeout int64) (Qmsg, error) { - msgResult, err := a.sqssvc.ReceiveMessage(&sqs.ReceiveMessageInput{ - MaxNumberOfMessages: aws.Int64(1), - VisibilityTimeout: &timeout, - WaitTimeSeconds: aws.Int64(20), - QueueUrl: &url, - }) - if err != nil { - return Qmsg{}, err - } - - if len(msgResult.Messages) > 0 { - msg := Qmsg{Id: *msgResult.Messages[0].MessageId, - Handle: *msgResult.Messages[0].ReceiptHandle, - Body: *msgResult.Messages[0].Body} - a.Logger.Println("Message received:", msg.Body) - return msg, nil - } else { - return Qmsg{}, nil - } -} - -// QueueHeartbeat updates the visibility timeout of a message. This -// ensures that the message remains "in flight", meaning that it -// cannot be seen by other processes, but if this process fails the -// timeout will expire and it will go back to being available for -// any other process to retrieve and process. -// -// SQS only allows messages to be "in flight" for up to 12 hours, so -// this will detect if the request for an update to visibility timeout -// fails, and if so will attempt to find the message on the queue, and -// return it, as the handle will have changed. -func (a *AwsConn) QueueHeartbeat(msg Qmsg, qurl string, duration int64) (Qmsg, error) { - _, err := a.sqssvc.ChangeMessageVisibility(&sqs.ChangeMessageVisibilityInput{ - ReceiptHandle: &msg.Handle, - QueueUrl: &qurl, - VisibilityTimeout: &duration, - }) - if err != nil { - aerr, ok := err.(awserr.Error) - - // Check if the visibility timeout has exceeded the maximum allowed, - // and if so try to find the message again to get a new handle. - if ok && aerr.Code() == "InvalidParameterValue" { - // Try heartbeatRetry times to find the message - for range [heartbeatRetry]bool{} { - // Wait a little in case existing visibilitytimeout needs to expire - time.Sleep((time.Duration(duration) / heartbeatRetry) * time.Second) - - msgResult, err := a.sqssvc.ReceiveMessage(&sqs.ReceiveMessageInput{ - MaxNumberOfMessages: aws.Int64(10), - VisibilityTimeout: &duration, - WaitTimeSeconds: aws.Int64(20), - QueueUrl: &qurl, - }) - if err != nil { - return Qmsg{}, errors.New(fmt.Sprintf("Heartbeat error looking for message to update heartbeat: %s", err)) - } - for _, m := range msgResult.Messages { - if *m.MessageId == msg.Id { - return Qmsg{ - Id: *m.MessageId, - Handle: *m.ReceiptHandle, - Body: *m.Body, - }, nil - } - } - } - return Qmsg{}, errors.New("Heartbeat error failed to find message to update heartbeat") - } else { - return Qmsg{}, errors.New(fmt.Sprintf("Heartbeat error updating queue duration: %s", err)) - } - } - return Qmsg{}, nil -} - -// GetQueueDetails gets the number of in progress and available -// messages for a queue. These are returned as strings. -func (a *AwsConn) GetQueueDetails(url string) (string, string, error) { - numAvailable := "ApproximateNumberOfMessages" - numInProgress := "ApproximateNumberOfMessagesNotVisible" - attrs, err := a.sqssvc.GetQueueAttributes(&sqs.GetQueueAttributesInput{ - AttributeNames: []*string{&numAvailable, &numInProgress}, - QueueUrl: &url, - }) - if err != nil { - return "", "", errors.New(fmt.Sprintf("Failed to get queue attributes: %s", err)) - } - return *attrs.Attributes[numAvailable], *attrs.Attributes[numInProgress], nil -} - -func (a *AwsConn) PreQueueId() string { - return a.prequrl -} - -func (a *AwsConn) WipeQueueId() string { - return a.wipequrl -} - -func (a *AwsConn) OCRQueueId() string { - return a.ocrqurl -} - -func (a *AwsConn) AnalyseQueueId() string { - return a.analysequrl -} - -func (a *AwsConn) WIPStorageId() string { - return a.wipstorageid -} - -func (a *AwsConn) ListObjects(bucket string, prefix string) ([]string, error) { - var names []string - err := a.s3svc.ListObjectsV2Pages(&s3.ListObjectsV2Input{ - Bucket: aws.String(bucket), - Prefix: aws.String(prefix), - }, func(page *s3.ListObjectsV2Output, last bool) bool { - for _, r := range page.Contents { - names = append(names, *r.Key) - } - return true - }) - return names, err -} - -func (a *AwsConn) AddToQueue(url string, msg string) error { - _, err := a.sqssvc.SendMessage(&sqs.SendMessageInput{ - MessageBody: &msg, - QueueUrl: &url, - }) - return err -} - -func (a *AwsConn) DelFromQueue(url string, handle string) error { - _, err := a.sqssvc.DeleteMessage(&sqs.DeleteMessageInput{ - QueueUrl: &url, - ReceiptHandle: &handle, - }) - return err -} - -func (a *AwsConn) Download(bucket string, key string, path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - _, err = a.downloader.Download(f, - &s3.GetObjectInput{ - Bucket: aws.String(bucket), - Key: &key, - }) - return err -} - -func (a *AwsConn) Upload(bucket string, key string, path string) error { - file, err := os.Open(path) - if err != nil { - log.Fatalln("Failed to open file", path, err) - } - defer file.Close() - - _, err = a.uploader.Upload(&s3manager.UploadInput{ - Bucket: aws.String(bucket), - Key: aws.String(key), - Body: file, - }) - return err -} - -func (a *AwsConn) GetLogger() *log.Logger { - return a.Logger -} - -func instanceDetailsFromPage(page *ec2.DescribeInstancesOutput) []InstanceDetails { - var details []InstanceDetails - for _, r := range page.Reservations { - for _, i := range r.Instances { - var d InstanceDetails - - for _, t := range i.Tags { - if *t.Key == "Name" { - d.Name = *t.Value - } - } - if i.PublicIpAddress != nil { - d.Ip = *i.PublicIpAddress - } - if i.SpotInstanceRequestId != nil { - d.Spot = *i.SpotInstanceRequestId - } - d.Type = *i.InstanceType - d.Id = *i.InstanceId - d.LaunchTime = i.LaunchTime.String() - d.State = *i.State.Name - - details = append(details, d) - } - } - - return details -} - -func (a *AwsConn) GetInstanceDetails() ([]InstanceDetails, error) { - var details []InstanceDetails - err := a.ec2svc.DescribeInstancesPages(&ec2.DescribeInstancesInput{}, func(page *ec2.DescribeInstancesOutput, lastPage bool) bool { - for _, d := range instanceDetailsFromPage(page) { - details = append(details, d) - } - return !lastPage - }) - return details, err -} diff --git a/bookpipeline/cmd/bookpipeline/main.go b/bookpipeline/cmd/bookpipeline/main.go deleted file mode 100644 index 59ece72..0000000 --- a/bookpipeline/cmd/bookpipeline/main.go +++ /dev/null @@ -1,488 +0,0 @@ -package main - -// TODO: check if images are prebinarised and if so skip multiple binarisation - -import ( - "errors" - "flag" - "fmt" - "log" - "os" - "os/exec" - "path/filepath" - "regexp" - "strings" - "time" - - "rescribe.xyz/go.git/bookpipeline" - "rescribe.xyz/go.git/lib/hocr" - "rescribe.xyz/go.git/preproc" -) - -const usage = `Usage: bookpipeline [-v] [-np] [-nw] [-no] [-na] [-t training] - -Watches the preprocess, ocr and analyse queues for book names. When -one is found this general process is followed: - -- The book name is hidden from the queue, and a 'heartbeat' is - started which keeps it hidden (this will time out after 2 minutes - if the program is terminated) -- The necessary files from bookname/ are downloaded -- The files are processed -- The resulting files are uploaded to bookname/ -- The heartbeat is stopped -- The book name is removed from the queue it was taken from, and - added to the next queue for future processing - -` - -const PauseBetweenChecks = 3 * time.Minute -const HeartbeatTime = 60 - -// null writer to enable non-verbose logging to be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { - return len(p), nil -} - -type Clouder interface { - Init() error - ListObjects(bucket string, prefix string) ([]string, error) - Download(bucket string, key string, fn string) error - Upload(bucket string, key string, path string) error - CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error) - AddToQueue(url string, msg string) error - DelFromQueue(url string, handle string) error - QueueHeartbeat(msg bookpipeline.Qmsg, qurl string, duration int64) (bookpipeline.Qmsg, error) -} - -type Pipeliner interface { - Clouder - PreQueueId() string - WipeQueueId() string - OCRQueueId() string - AnalyseQueueId() string - WIPStorageId() string - GetLogger() *log.Logger -} - -func download(dl chan string, process chan string, conn Pipeliner, dir string, errc chan error, logger *log.Logger) { - for key := range dl { - fn := filepath.Join(dir, filepath.Base(key)) - logger.Println("Downloading", key) - err := conn.Download(conn.WIPStorageId(), key, fn) - if err != nil { - for range dl { - } // consume the rest of the receiving channel so it isn't blocked - close(process) - errc <- err - return - } - process <- fn - } - close(process) -} - -func up(c chan string, done chan bool, conn Pipeliner, bookname string, errc chan error, logger *log.Logger) { - for path := range c { - name := filepath.Base(path) - key := filepath.Join(bookname, name) - logger.Println("Uploading", key) - err := conn.Upload(conn.WIPStorageId(), key, path) - if err != nil { - for range c { - } // consume the rest of the receiving channel so it isn't blocked - errc <- err - return - } - } - - done <- true -} - -func preprocess(pre chan string, up chan string, errc chan error, logger *log.Logger) { - for path := range pre { - logger.Println("Preprocessing", path) - done, err := preproc.PreProcMulti(path, []float64{0.1, 0.2, 0.4, 0.5}, "binary", 0, true, 5, 30) - if err != nil { - for range pre { - } // consume the rest of the receiving channel so it isn't blocked - close(up) - errc <- err - return - } - for _, p := range done { - up <- p - } - } - close(up) -} - -func wipe(towipe chan string, up chan string, errc chan error, logger *log.Logger) { - for path := range towipe { - logger.Println("Wiping", path) - s := strings.Split(path, ".") - base := strings.Join(s[:len(s)-1], "") - outpath := base + "_bin0.0.png" - err := preproc.WipeFile(path, outpath, 5, 0.03, 30) - if err != nil { - for range towipe { - } // consume the rest of the receiving channel so it isn't blocked - close(up) - errc <- err - return - } - up <- outpath - } - close(up) -} - -func ocr(training string) func(chan string, chan string, chan error, *log.Logger) { - return func(toocr chan string, up chan string, errc chan error, logger *log.Logger) { - for path := range toocr { - logger.Println("OCRing", path) - name := strings.Replace(path, ".png", "", 1) - cmd := exec.Command("tesseract", "-l", training, path, name, "hocr") - err := cmd.Run() - if err != nil { - for range toocr { - } // consume the rest of the receiving channel so it isn't blocked - close(up) - errc <- errors.New(fmt.Sprintf("Error ocring %s: %s", path, err)) - return - } - up <- name + ".hocr" - } - close(up) - } -} - -func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) { - confs := make(map[string][]*bookpipeline.Conf) - bestconfs := make(map[string]*bookpipeline.Conf) - savedir := "" - - for path := range toanalyse { - if savedir == "" { - savedir = filepath.Dir(path) - } - logger.Println("Calculating confidence for", path) - avg, err := hocr.GetAvgConf(path) - if err != nil && err.Error() == "No words found" { - continue - } - if err != nil { - for range toanalyse { - } // consume the rest of the receiving channel so it isn't blocked - close(up) - errc <- errors.New(fmt.Sprintf("Error retreiving confidence for %s: %s", path, err)) - return - } - base := filepath.Base(path) - codestart := strings.Index(base, "_bin") - name := base[0:codestart] - var c bookpipeline.Conf - c.Path = path - c.Code = base[codestart:] - c.Conf = avg - confs[name] = append(confs[name], &c) - - } - - fn := filepath.Join(savedir, "conf") - logger.Println("Saving confidences in file", fn) - f, err := os.Create(fn) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) - return - } - defer f.Close() - - logger.Println("Finding best confidence for each page, and saving all confidences") - for base, conf := range confs { - var best float64 - for _, c := range conf { - if c.Conf > best { - best = c.Conf - bestconfs[base] = c - } - _, err = fmt.Fprintf(f, "%s\t%02.f\n", c.Path, c.Conf) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error writing confidences file: %s", err)) - return - } - } - } - up <- fn - - logger.Println("Creating best file listing the best file for each page") - fn = filepath.Join(savedir, "best") - f, err = os.Create(fn) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) - return - } - defer f.Close() - for _, conf := range bestconfs { - _, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.Path)) - } - up <- fn - - logger.Println("Creating graph") - fn = filepath.Join(savedir, "graph.png") - f, err = os.Create(fn) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) - return - } - defer f.Close() - err = bookpipeline.Graph(bestconfs, filepath.Base(savedir), f) - if err != nil { - close(up) - errc <- errors.New(fmt.Sprintf("Error rendering graph: %s", err)) - return - } - up <- fn - - close(up) -} - -func heartbeat(conn Pipeliner, t *time.Ticker, msg bookpipeline.Qmsg, queue string, msgc chan bookpipeline.Qmsg, errc chan error) { - currentmsg := msg - for range t.C { - m, err := conn.QueueHeartbeat(currentmsg, queue, HeartbeatTime*2) - if err != nil { - errc <- err - t.Stop() - return - } - if m.Id != "" { - conn.GetLogger().Println("Replaced message handle as visibilitytimeout limit was reached") - currentmsg = m - // TODO: maybe handle communicating new msg more gracefully than this - for range msgc { - } // throw away any old msgc - msgc <- m - } - } -} - -func processBook(msg bookpipeline.Qmsg, conn Pipeliner, process func(chan string, chan string, chan error, *log.Logger), match *regexp.Regexp, fromQueue string, toQueue string) error { - dl := make(chan string) - msgc := make(chan bookpipeline.Qmsg) - processc := make(chan string) - upc := make(chan string) - done := make(chan bool) - errc := make(chan error) - - bookname := msg.Body - - d := filepath.Join(os.TempDir(), bookname) - err := os.MkdirAll(d, 0755) - if err != nil { - return errors.New(fmt.Sprintf("Failed to create directory %s: %s", d, err)) - } - - t := time.NewTicker(HeartbeatTime * time.Second) - go heartbeat(conn, t, msg, fromQueue, msgc, errc) - - // these functions will do their jobs when their channels have data - go download(dl, processc, conn, d, errc, conn.GetLogger()) - go process(processc, upc, errc, conn.GetLogger()) - go up(upc, done, conn, bookname, errc, conn.GetLogger()) - - conn.GetLogger().Println("Getting list of objects to download") - objs, err := conn.ListObjects(conn.WIPStorageId(), bookname) - if err != nil { - t.Stop() - _ = os.RemoveAll(d) - return errors.New(fmt.Sprintf("Failed to get list of files for book %s: %s", bookname, err)) - } - var todl []string - for _, n := range objs { - if !match.MatchString(n) { - conn.GetLogger().Println("Skipping item that doesn't match target", n) - continue - } - todl = append(todl, n) - } - for _, a := range todl { - dl <- a - } - close(dl) - - // wait for either the done or errc channel to be sent to - select { - case err = <-errc: - t.Stop() - _ = os.RemoveAll(d) - return err - case <-done: - } - - if toQueue != "" { - conn.GetLogger().Println("Sending", bookname, "to queue", toQueue) - err = conn.AddToQueue(toQueue, bookname) - if err != nil { - t.Stop() - _ = os.RemoveAll(d) - return errors.New(fmt.Sprintf("Error adding to queue %s: %s", bookname, err)) - } - } - - t.Stop() - - // check whether we're using a newer msg handle - select { - case m, ok := <-msgc: - if ok { - msg = m - conn.GetLogger().Println("Using new message handle to delete message from old queue") - } - default: - conn.GetLogger().Println("Using original message handle to delete message from old queue") - } - - conn.GetLogger().Println("Deleting original message from queue", fromQueue) - err = conn.DelFromQueue(fromQueue, msg.Handle) - if err != nil { - _ = os.RemoveAll(d) - return errors.New(fmt.Sprintf("Error deleting message from queue: %s", err)) - } - - err = os.RemoveAll(d) - if err != nil { - return errors.New(fmt.Sprintf("Failed to remove directory %s: %s", d, err)) - } - - return nil -} - -func main() { - verbose := flag.Bool("v", false, "verbose") - training := flag.String("t", "rescribealphav5", "tesseract training file to use") - nopreproc := flag.Bool("np", false, "disable preprocessing") - nowipe := flag.Bool("nw", false, "disable wipeonly") - noocr := flag.Bool("no", false, "disable ocr") - noanalyse := flag.Bool("na", false, "disable analysis") - - flag.Usage = func() { - fmt.Fprintf(flag.CommandLine.Output(), usage) - flag.PrintDefaults() - } - flag.Parse() - - var verboselog *log.Logger - if *verbose { - verboselog = log.New(os.Stdout, "", 0) - } else { - var n NullWriter - verboselog = log.New(n, "", 0) - } - - origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match alternative file naming - wipePattern := regexp.MustCompile(`[0-9]{4}.png$`) - preprocessedPattern := regexp.MustCompile(`_bin[0-9].[0-9].png$`) - ocredPattern := regexp.MustCompile(`.hocr$`) - - var conn Pipeliner - conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} - - verboselog.Println("Setting up AWS session") - err := conn.Init() - if err != nil { - log.Fatalln("Error setting up cloud connection:", err) - } - verboselog.Println("Finished setting up AWS session") - - var checkPreQueue <-chan time.Time - var checkWipeQueue <-chan time.Time - var checkOCRQueue <-chan time.Time - var checkAnalyseQueue <-chan time.Time - if !*nopreproc { - checkPreQueue = time.After(0) - } - if !*nowipe { - checkWipeQueue = time.After(0) - } - if !*noocr { - checkOCRQueue = time.After(0) - } - if !*noanalyse { - checkAnalyseQueue = time.After(0) - } - - for { - select { - case <-checkPreQueue: - msg, err := conn.CheckQueue(conn.PreQueueId(), HeartbeatTime*2) - checkPreQueue = time.After(PauseBetweenChecks) - if err != nil { - log.Println("Error checking preprocess queue", err) - continue - } - if msg.Handle == "" { - verboselog.Println("No message received on preprocess queue, sleeping") - continue - } - verboselog.Println("Message received on preprocess queue, processing", msg.Body) - err = processBook(msg, conn, preprocess, origPattern, conn.PreQueueId(), conn.OCRQueueId()) - if err != nil { - log.Println("Error during preprocess", err) - } - case <-checkWipeQueue: - msg, err := conn.CheckQueue(conn.WipeQueueId(), HeartbeatTime*2) - checkWipeQueue = time.After(PauseBetweenChecks) - if err != nil { - log.Println("Error checking wipeonly queue", err) - continue - } - if msg.Handle == "" { - verboselog.Println("No message received on wipeonly queue, sleeping") - continue - } - verboselog.Println("Message received on wipeonly queue, processing", msg.Body) - err = processBook(msg, conn, wipe, wipePattern, conn.WipeQueueId(), conn.OCRQueueId()) - if err != nil { - log.Println("Error during wipe", err) - } - case <-checkOCRQueue: - msg, err := conn.CheckQueue(conn.OCRQueueId(), HeartbeatTime*2) - checkOCRQueue = time.After(PauseBetweenChecks) - if err != nil { - log.Println("Error checking OCR queue", err) - continue - } - if msg.Handle == "" { - verboselog.Println("No message received on OCR queue, sleeping") - continue - } - verboselog.Println("Message received on OCR queue, processing", msg.Body) - err = processBook(msg, conn, ocr(*training), preprocessedPattern, conn.OCRQueueId(), conn.AnalyseQueueId()) - if err != nil { - log.Println("Error during OCR process", err) - } - case <-checkAnalyseQueue: - msg, err := conn.CheckQueue(conn.AnalyseQueueId(), HeartbeatTime*2) - checkAnalyseQueue = time.After(PauseBetweenChecks) - if err != nil { - log.Println("Error checking analyse queue", err) - continue - } - if msg.Handle == "" { - verboselog.Println("No message received on analyse queue, sleeping") - continue - } - verboselog.Println("Message received on analyse queue, processing", msg.Body) - err = processBook(msg, conn, analyse, ocredPattern, conn.AnalyseQueueId(), "") - if err != nil { - log.Println("Error during analysis", err) - } - } - } -} diff --git a/bookpipeline/cmd/booktopipeline/main.go b/bookpipeline/cmd/booktopipeline/main.go deleted file mode 100644 index 6d9f146..0000000 --- a/bookpipeline/cmd/booktopipeline/main.go +++ /dev/null @@ -1,140 +0,0 @@ -package main - -// TODO: use bookpipeline package to do aws stuff - -import ( - "flag" - "fmt" - "log" - "os" - "path/filepath" - - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/session" - "github.com/aws/aws-sdk-go/service/s3/s3manager" - "github.com/aws/aws-sdk-go/service/sqs" -) - -const usage = `Usage: booktopipeline [-prebinarised] [-v] bookdir [bookname] - -Uploads the book in bookdir to the S3 'inprogress' bucket and adds it -to the 'preprocess' SQS queue, or the 'wipeonly' queue if the -prebinarised flag is set. - -If bookname is omitted the last part of the bookdir is used. -` - -// null writer to enable non-verbose logging to be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { - return len(p), nil -} - -var verboselog *log.Logger - -type fileWalk chan string - -func (f fileWalk) Walk(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - f <- path - } - return nil -} - -func main() { - verbose := flag.Bool("v", false, "Verbose") - wipeonly := flag.Bool("prebinarised", false, "Prebinarised: only preprocessing will be to wipe") - - flag.Usage = func() { - fmt.Fprintf(flag.CommandLine.Output(), usage) - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() < 1 { - flag.Usage() - return - } - - bookdir := flag.Arg(0) - var bookname string - if flag.NArg() > 2 { - bookname = flag.Arg(1) - } else { - bookname = filepath.Base(bookdir) - } - - if *verbose { - verboselog = log.New(os.Stdout, "", log.LstdFlags) - } else { - var n NullWriter - verboselog = log.New(n, "", log.LstdFlags) - } - - verboselog.Println("Setting up AWS session") - sess, err := session.NewSession(&aws.Config{ - Region: aws.String("eu-west-2"), - }) - if err != nil { - log.Fatalln("Error: failed to set up aws session:", err) - } - sqssvc := sqs.New(sess) - uploader := s3manager.NewUploader(sess) - - var qname string - if *wipeonly { - qname = "rescribewipeonly" - } else { - qname = "rescribepreprocess" - } - verboselog.Println("Getting Queue URL for", qname) - result, err := sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ - QueueName: aws.String(qname), - }) - if err != nil { - log.Fatalln("Error getting queue URL for", qname, ":", err) - } - qurl := *result.QueueUrl - - // concurrent walking upload based on example at - // https://docs.aws.amazon.com/sdk-for-go/v1/developer-guide/sdk-utilities.html - verboselog.Println("Walking", bookdir) - walker := make(fileWalk) - go func() { - err = filepath.Walk(bookdir, walker.Walk) - if err != nil { - log.Fatalln("Filesystem walk failed:", err) - } - close(walker) - }() - - for path := range walker { - verboselog.Println("Uploading", path) - name := filepath.Base(path) - file, err := os.Open(path) - if err != nil { - log.Fatalln("Open file", path, "failed:", err) - } - defer file.Close() - _, err = uploader.Upload(&s3manager.UploadInput{ - Bucket: aws.String("rescribeinprogress"), - Key: aws.String(filepath.Join(bookname, name)), - Body: file, - }) - if err != nil { - log.Fatalln("Failed to upload", path, err) - } - } - - verboselog.Println("Sending message", bookname, "to queue", qurl) - _, err = sqssvc.SendMessage(&sqs.SendMessageInput{ - MessageBody: aws.String(bookname), - QueueUrl: &qurl, - }) - if err != nil { - log.Fatalln("Error adding book to queue:", err) - } -} diff --git a/bookpipeline/cmd/confgraph/main.go b/bookpipeline/cmd/confgraph/main.go deleted file mode 100644 index b60821e..0000000 --- a/bookpipeline/cmd/confgraph/main.go +++ /dev/null @@ -1,71 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os" - "path/filepath" - "strings" - - "rescribe.xyz/go.git/bookpipeline" - "rescribe.xyz/go.git/lib/hocr" -) - -func walker(confs *[]*bookpipeline.Conf) filepath.WalkFunc { - return func(path string, info os.FileInfo, err error) error { - if info.IsDir() { - return nil - } - if !strings.HasSuffix(path, ".hocr") { - return nil - } - avg, err := hocr.GetAvgConf(path) - if err != nil { - return err - } - c := bookpipeline.Conf{ - Conf: avg, - Path: path, - } - *confs = append(*confs, &c) - return nil - } -} - -func main() { - flag.Usage = func() { - fmt.Fprintln(flag.CommandLine.Output(), "Usage: bookpipeline hocrdir graph.png") - flag.PrintDefaults() - } - flag.Parse() - - if flag.NArg() != 2 { - flag.Usage() - return - } - - var confs []*bookpipeline.Conf - err := filepath.Walk(flag.Arg(0), walker(&confs)) - if err != nil { - log.Fatalln("Failed to walk", flag.Arg(0), err) - } - - // Structure to fit what bookpipeline.Graph needs - // TODO: probably reorganise bookpipeline to just need []*Conf - cconfs := make(map[string]*bookpipeline.Conf) - for _, c := range confs { - cconfs[c.Path] = c - } - - fn := flag.Arg(1) - f, err := os.Create(fn) - if err != nil { - log.Fatalln("Error creating file", fn, err) - } - defer f.Close() - err = bookpipeline.Graph(cconfs, filepath.Base(flag.Arg(0)), f) - if err != nil { - log.Fatalln("Error creating graph", err) - } -} diff --git a/bookpipeline/cmd/getpipelinebook/main.go b/bookpipeline/cmd/getpipelinebook/main.go deleted file mode 100644 index 66e3f70..0000000 --- a/bookpipeline/cmd/getpipelinebook/main.go +++ /dev/null @@ -1,122 +0,0 @@ -package main - -import ( - "bufio" - "flag" - "fmt" - "log" - "os" - "path/filepath" - - "rescribe.xyz/go.git/bookpipeline" -) - -const usage = "Usage: getpipelinebook [-a] [-v] bookname\n\nDownloads the pipeline results for a book.\n" - -// null writer to enable non-verbose logging to be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { - return len(p), nil -} - -type Pipeliner interface { - Init() error - ListObjects(bucket string, prefix string) ([]string, error) - Download(bucket string, key string, fn string) error - Upload(bucket string, key string, path string) error - CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error) - AddToQueue(url string, msg string) error - DelFromQueue(url string, handle string) error - WIPStorageId() string -} - -func main() { - all := flag.Bool("a", false, "Get all files for book, not just hOCR and analysis files") - verbose := flag.Bool("v", false, "Verbose") - flag.Usage = func() { - fmt.Fprintf(flag.CommandLine.Output(), usage) - flag.PrintDefaults() - } - flag.Parse() - - if flag.NArg() < 1 { - flag.Usage() - return - } - - var verboselog *log.Logger - if *verbose { - verboselog = log.New(os.Stdout, "", log.LstdFlags) - } else { - var n NullWriter - verboselog = log.New(n, "", log.LstdFlags) - } - - var conn Pipeliner - conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} - - verboselog.Println("Setting up AWS session") - err := conn.Init() - if err != nil { - log.Fatalln("Error setting up cloud connection:", err) - } - verboselog.Println("Finished setting up AWS session") - - bookname := flag.Arg(0) - - err = os.MkdirAll(bookname, 0755) - if err != nil { - log.Fatalln("Failed to create directory", bookname, err) - } - - if *all { - verboselog.Println("Downloading all files for", bookname) - objs, err := conn.ListObjects(conn.WIPStorageId(), bookname) - if err != nil { - log.Fatalln("Failed to get list of files for book", bookname, err) - } - for _, i := range objs { - verboselog.Println("Downloading", i) - err = conn.Download(conn.WIPStorageId(), i, i) - if err != nil { - log.Fatalln("Failed to download file", i, err) - } - } - return - } - - verboselog.Println("Downloading best file") - fn := filepath.Join(bookname, "best") - err = conn.Download(conn.WIPStorageId(), fn, fn) - if err != nil { - log.Fatalln("Failed to download 'best' file", err) - } - f, err := os.Open(fn) - if err != nil { - log.Fatalln("Failed to open best file", err) - } - defer f.Close() - - verboselog.Println("Downloading HOCR files") - s := bufio.NewScanner(f) - for s.Scan() { - fn = filepath.Join(bookname, s.Text()) - verboselog.Println("Downloading file", fn) - err = conn.Download(conn.WIPStorageId(), fn, fn) - if err != nil { - log.Fatalln("Failed to download file", fn, err) - } - } - - analyses := []string{"conf", "graph.png"} - verboselog.Println("Downloading analysis files") - for _, a := range analyses { - fn = filepath.Join(bookname, a) - verboselog.Println("Downloading file", fn) - err = conn.Download(conn.WIPStorageId(), fn, fn) - if err != nil { - log.Fatalln("Failed to download file", fn, err) - } - } -} diff --git a/bookpipeline/cmd/lspipeline/main.go b/bookpipeline/cmd/lspipeline/main.go deleted file mode 100644 index 46a1d63..0000000 --- a/bookpipeline/cmd/lspipeline/main.go +++ /dev/null @@ -1,250 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os/exec" - "strings" - - "rescribe.xyz/go.git/bookpipeline" -) - -const usage = `Usage: lspipeline [-i key] [-n num] - -Lists useful things related to the pipeline. - -- Instances running -- Messages in each queue -- Books not completed -- Books done -- Last n lines of bookpipeline logs from each running instance -` - -type LsPipeliner interface { - Init() error - PreQueueId() string - WipeQueueId() string - OCRQueueId() string - AnalyseQueueId() string - GetQueueDetails(url string) (string, string, error) - GetInstanceDetails() ([]bookpipeline.InstanceDetails, error) - ListObjects(bucket string, prefix string) ([]string, error) - WIPStorageId() string -} - -// NullWriter is used so non-verbose logging may be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { - return len(p), nil -} - -type queueDetails struct { - name, numAvailable, numInProgress string -} - -func getInstances(conn LsPipeliner, detailsc chan bookpipeline.InstanceDetails) { - details, err := conn.GetInstanceDetails() - if err != nil { - log.Println("Error getting instance details:", err) - } - for _, d := range details { - detailsc <- d - } - close(detailsc) -} - -func getQueueDetails(conn LsPipeliner, qdetails chan queueDetails) { - queues := []struct{ name, id string }{ - {"preprocess", conn.PreQueueId()}, - {"wipeonly", conn.WipeQueueId()}, - {"ocr", conn.OCRQueueId()}, - {"analyse", conn.AnalyseQueueId()}, - } - for _, q := range queues { - avail, inprog, err := conn.GetQueueDetails(q.id) - if err != nil { - log.Println("Error getting queue details:", err) - } - var qd queueDetails - qd.name = q.name - qd.numAvailable = avail - qd.numInProgress = inprog - qdetails <- qd - } - close(qdetails) -} - -// getBookStatus returns a list of in progress and done books. -// It determines this by listing all objects, and splitting the -// prefixes into two lists, those which have a 'graph.png' file, -// which are classed as done, and those which are not. -func getBookStatus(conn LsPipeliner) (inprogress []string, done []string, err error) { - allfiles, err := conn.ListObjects(conn.WIPStorageId(), "") - if err != nil { - log.Println("Error getting list of objects:", err) - return inprogress, done, err - } - for _, f := range allfiles { - parts := strings.Split(f, "/") - if parts[1] != "graph.png" { - continue - } - prefix := parts[0] - found := false - for _, i := range done { - if i == prefix { - found = true - continue - } - } - if !found { - done = append(done, prefix) - } - } - - for _, f := range allfiles { - parts := strings.Split(f, "/") - prefix := parts[0] - found := false - for _, i := range done { - if i == prefix { - found = true - continue - } - } - for _, i := range inprogress { - if i == prefix { - found = true - continue - } - } - if !found { - inprogress = append(inprogress, prefix) - } - } - - return inprogress, done, err -} - -func getBookStatusChan(conn LsPipeliner, inprogressc chan string, donec chan string) { - inprogress, done, err := getBookStatus(conn) - if err != nil { - log.Println("Error getting book status:", err) - close(inprogressc) - close(donec) - return - } - for _, i := range inprogress { - inprogressc <- i - } - close(inprogressc) - for _, i := range done { - donec <- i - } - close(donec) -} - -func getRecentSSHLogs(ip string, id string, n int) (string, error) { - addr := fmt.Sprintf("%s@%s", "admin", ip) - logcmd := fmt.Sprintf("journalctl -n %d -u bookpipeline", n) - var cmd *exec.Cmd - if id == "" { - cmd = exec.Command("ssh", "-o", "StrictHostKeyChecking no", addr, logcmd) - } else { - cmd = exec.Command("ssh", "-o", "StrictHostKeyChecking no", "-i", id, addr, logcmd) - } - out, err := cmd.Output() - if err != nil { - return "", err - } - return string(out), nil -} - -func getRecentSSHLogsChan(ips []string, id string, lognum int, logs chan string) { - for _, ip := range ips { - sshlog, err := getRecentSSHLogs(ip, id, lognum) - if err != nil { - log.Printf("Error getting SSH logs for %s: %s\n", ip, err) - continue - } - logs <- fmt.Sprintf("%s\n%s", ip, sshlog) - } - close(logs) -} - -func main() { - keyfile := flag.String("i", "", "private key file for SSH") - lognum := flag.Int("n", 5, "number of lines to include in SSH logs") - flag.Usage = func() { - fmt.Fprintf(flag.CommandLine.Output(), usage) - flag.PrintDefaults() - } - flag.Parse() - - var verboselog *log.Logger - var n NullWriter - verboselog = log.New(n, "", 0) - - var conn LsPipeliner - conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} - err := conn.Init() - if err != nil { - log.Fatalln("Failed to set up cloud connection:", err) - } - - instances := make(chan bookpipeline.InstanceDetails, 100) - queues := make(chan queueDetails) - inprogress := make(chan string, 100) - done := make(chan string, 100) - logs := make(chan string, 10) - - go getInstances(conn, instances) - go getQueueDetails(conn, queues) - go getBookStatusChan(conn, inprogress, done) - - var ips []string - - fmt.Println("# Instances") - for i := range instances { - fmt.Printf("ID: %s, Type: %s, LaunchTime: %s, State: %s", i.Id, i.Type, i.LaunchTime, i.State) - if i.Name != "" { - fmt.Printf(", Name: %s", i.Name) - } - if i.Ip != "" { - fmt.Printf(", IP: %s", i.Ip) - if i.State == "running" && i.Name != "workhorse" { - ips = append(ips, i.Ip) - } - } - if i.Spot != "" { - fmt.Printf(", SpotRequest: %s", i.Spot) - } - fmt.Printf("\n") - } - - go getRecentSSHLogsChan(ips, *keyfile, *lognum, logs) - - fmt.Println("\n# Queues") - for i := range queues { - fmt.Printf("%s: %s available, %s in progress\n", i.name, i.numAvailable, i.numInProgress) - } - - fmt.Println("\n# Books not completed") - for i := range inprogress { - fmt.Println(i) - } - - fmt.Println("\n# Books done") - for i := range done { - fmt.Println(i) - } - - if len(ips) > 0 { - fmt.Println("\n# Recent logs") - for i := range logs { - fmt.Printf("\n%s", i) - } - } -} diff --git a/bookpipeline/cmd/mkpipeline/main.go b/bookpipeline/cmd/mkpipeline/main.go deleted file mode 100644 index e37a56d..0000000 --- a/bookpipeline/cmd/mkpipeline/main.go +++ /dev/null @@ -1,79 +0,0 @@ -package main - -// TODO: use the bookpipeline package for aws stuff -// TODO: set up iam role and policy needed for ec2 instances to access this stuff; -// see arn:aws:iam::557852942063:policy/pipelinestorageandqueue -// and arn:aws:iam::557852942063:role/pipeliner -// TODO: set up launch template for ec2 instances -// NOTE: potentially use json templates to define things, ala aws cli - -import ( - "log" - "os" - - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/awserr" - "github.com/aws/aws-sdk-go/aws/session" - "github.com/aws/aws-sdk-go/service/s3" - "github.com/aws/aws-sdk-go/service/sqs" -) - -func main() { - if len(os.Args) != 1 { - log.Fatal("Usage: mkpipeline\n\nSets up necessary S3 buckets and SQS queues for our AWS pipeline\n") - } - - sess, err := session.NewSession(&aws.Config{ - Region: aws.String("eu-west-2"), - }) - if err != nil { - log.Fatalf("Error: failed to set up aws session: %v\n", err) - } - s3svc := s3.New(sess) - sqssvc := sqs.New(sess) - - prefix := "rescribe" - buckets := []string{"inprogress", "done"} - queues := []string{"preprocess", "wipeonly", "ocr", "analyse"} - - for _, bucket := range buckets { - bname := prefix + bucket - log.Printf("Creating bucket %s\n", bname) - _, err = s3svc.CreateBucket(&s3.CreateBucketInput{ - Bucket: aws.String(bname), - }) - if err != nil { - aerr, ok := err.(awserr.Error) - if ok && (aerr.Code() == s3.ErrCodeBucketAlreadyExists || aerr.Code() == s3.ErrCodeBucketAlreadyOwnedByYou) { - log.Printf("Bucket %s already exists\n", bname) - } else { - log.Fatalf("Error creating bucket %s: %v\n", bname, err) - } - } - } - - for _, queue := range queues { - qname := prefix + queue - log.Printf("Creating queue %s\n", qname) - _, err = sqssvc.CreateQueue(&sqs.CreateQueueInput{ - QueueName: aws.String(qname), - Attributes: map[string]*string{ - "VisibilityTimeout": aws.String("120"), // 2 minutes - "MessageRetentionPeriod": aws.String("1209600"), // 14 days; max allowed by sqs - "ReceiveMessageWaitTimeSeconds": aws.String("20"), - }, - }) - if err != nil { - aerr, ok := err.(awserr.Error) - // Note the QueueAlreadyExists code is only emitted if an existing queue - // has different attributes than the one that was being created. SQS just - // quietly ignores the CreateQueue request if it is identical to an - // existing queue. - if ok && aerr.Code() == sqs.ErrCodeQueueNameExists { - log.Fatalf("Error: Queue %s already exists but has different attributes\n", qname) - } else { - log.Fatalf("Error creating queue %s: %v\n", qname, err) - } - } - } -} diff --git a/bookpipeline/graph.go b/bookpipeline/graph.go deleted file mode 100644 index 955abbd..0000000 --- a/bookpipeline/graph.go +++ /dev/null @@ -1,155 +0,0 @@ -package bookpipeline - -import ( - "fmt" - "io" - "path/filepath" - "sort" - "strconv" - "strings" - - "github.com/wcharczuk/go-chart" - "github.com/wcharczuk/go-chart/drawing" -) - -const maxticks = 40 -const goodCutoff = 70 -const mediumCutoff = 65 -const badCutoff = 60 - -type Conf struct { - Path, Code string - Conf float64 -} - -type GraphConf struct { - Pgnum, Conf float64 -} - -func createLine(xvalues []float64, y float64, c drawing.Color) chart.ContinuousSeries { - var yvalues []float64 - for range xvalues { - yvalues = append(yvalues, y) - } - return chart.ContinuousSeries{ - XValues: xvalues, - YValues: yvalues, - Style: chart.Style{ - StrokeColor: c, - }, - } -} - -func Graph(confs map[string]*Conf, bookname string, w io.Writer) error { - // Organise confs to sort them by page - var graphconf []GraphConf - for _, conf := range confs { - name := filepath.Base(conf.Path) - var numend int - numend = strings.Index(name, "_") - if numend == -1 { - numend = strings.Index(name, ".") - } - pgnum, err := strconv.ParseFloat(name[0:numend], 64) - if err != nil { - continue - } - var c GraphConf - c.Pgnum = pgnum - c.Conf = conf.Conf - graphconf = append(graphconf, c) - } - sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].Pgnum < graphconf[j].Pgnum }) - - // Create main xvalues and yvalues, annotations and ticks - var xvalues, yvalues []float64 - var annotations []chart.Value2 - var ticks []chart.Tick - tickevery := len(graphconf) / maxticks - if tickevery < 1 { - tickevery = 1 - } - for i, c := range graphconf { - xvalues = append(xvalues, c.Pgnum) - yvalues = append(yvalues, c.Conf) - if c.Conf < goodCutoff { - annotations = append(annotations, chart.Value2{Label: fmt.Sprintf("%.0f", c.Pgnum), XValue: c.Pgnum, YValue: c.Conf}) - } - if i%tickevery == 0 { - ticks = append(ticks, chart.Tick{c.Pgnum, fmt.Sprintf("%.0f", c.Pgnum)}) - } - } - // make last tick the final page - final := graphconf[len(graphconf)-1] - ticks[len(ticks)-1] = chart.Tick{final.Pgnum, fmt.Sprintf("%.0f", final.Pgnum)} - mainSeries := chart.ContinuousSeries{ - XValues: xvalues, - YValues: yvalues, - } - - // Create lines - goodCutoffSeries := createLine(xvalues, goodCutoff, chart.ColorAlternateGreen) - mediumCutoffSeries := createLine(xvalues, mediumCutoff, chart.ColorOrange) - badCutoffSeries := createLine(xvalues, badCutoff, chart.ColorRed) - - // Create lines marking top and bottom 10% confidence - sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].Conf < graphconf[j].Conf }) - lowconf := graphconf[int(len(graphconf)/10)].Conf - highconf := graphconf[int((len(graphconf)/10)*9)].Conf - yvalues = []float64{} - for range graphconf { - yvalues = append(yvalues, lowconf) - } - minSeries := &chart.ContinuousSeries{ - Style: chart.Style{ - StrokeColor: chart.ColorAlternateGray, - StrokeDashArray: []float64{5.0, 5.0}, - }, - XValues: xvalues, - YValues: yvalues, - } - yvalues = []float64{} - for _ = range graphconf { - yvalues = append(yvalues, highconf) - } - maxSeries := &chart.ContinuousSeries{ - Style: chart.Style{ - StrokeColor: chart.ColorAlternateGray, - StrokeDashArray: []float64{5.0, 5.0}, - }, - XValues: xvalues, - YValues: yvalues, - } - - graph := chart.Chart{ - Title: bookname, - Width: 3840, - Height: 2160, - XAxis: chart.XAxis{ - Name: "Page number", - Range: &chart.ContinuousRange{ - Min: 0.0, - }, - Ticks: ticks, - }, - YAxis: chart.YAxis{ - Name: "Confidence", - Range: &chart.ContinuousRange{ - Min: 0.0, - Max: 100.0, - }, - }, - Series: []chart.Series{ - mainSeries, - minSeries, - maxSeries, - goodCutoffSeries, - mediumCutoffSeries, - badCutoffSeries, - chart.AnnotationSeries{ - Annotations: annotations, - }, - }, - } - return graph.Render(chart.PNG, w) -} diff --git a/bucket-lines/bucket.go b/bucket-lines/bucket.go deleted file mode 100644 index 9f98887..0000000 --- a/bucket-lines/bucket.go +++ /dev/null @@ -1,131 +0,0 @@ -package main - -import ( - "fmt" - "io" - "os" - "path/filepath" - "sort" - "strconv" - - "rescribe.xyz/go.git/lib/line" -) - -type BucketSpec struct { - Min float64 - Name string -} -type BucketSpecs []BucketSpec - -func (b BucketSpecs) Len() int { return len(b) } -func (b BucketSpecs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } - -type BucketStat struct { - name string - num int -} -type BucketStats []BucketStat - -func (b BucketStats) Len() int { return len(b) } -func (b BucketStats) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } - -// Copies the image and text for a line into a directory based on -// the line confidence, as defined by the buckets struct -func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) { - var bucket string - - todir := "" - for _, b := range buckets { - if l.Avgconf >= b.Min { - todir = b.Name - bucket = b.Name - } - } - - if todir == "" { - return bucket, nil - } - - avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) - if len(avgstr) > 2 { - avgstr = avgstr[2:] - } - - base := filepath.Join(dirname, todir, l.OcrName+"_"+l.Name+"_"+avgstr) - - err := os.MkdirAll(filepath.Join(dirname, todir), 0700) - if err != nil { - return bucket, err - } - - f, err := os.Create(base + ".png") - if err != nil { - return bucket, err - } - defer f.Close() - - err = l.Img.CopyLineTo(f) - if err != nil { - return bucket, err - } - - f, err = os.Create(base + ".txt") - if err != nil { - return bucket, err - } - defer f.Close() - - _, err = io.WriteString(f, l.Text) - if err != nil { - return bucket, err - } - - return bucket, err -} - -// Copies line images and text into directories based on their -// confidence, as defined by the buckets struct, and returns -// statistics of whire lines went in the process. -func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) { - var all []string - var stats BucketStats - - sort.Sort(lines) - sort.Sort(buckets) - for _, l := range lines { - bname, err := bucketLine(l, buckets, dirname) - if err != nil { - return stats, err - } - all = append(all, bname) - } - - for _, b := range all { - i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) - if i == len(stats) { - newstat := BucketStat{b, 0} - stats = append(stats, newstat) - i = len(stats) - 1 - } - stats[i].num++ - } - - return stats, nil -} - -// Prints statistics of where lines went when bucketing -func PrintBucketStats(w io.Writer, stats BucketStats) { - var total int - for _, s := range stats { - total += s.num - } - - fmt.Fprintf(w, "Copied %d lines\n", total) - fmt.Fprintf(w, "---------------------------------\n") - sort.Sort(stats) - for _, s := range stats { - fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100*s.num/total) - } -} diff --git a/bucket-lines/main.go b/bucket-lines/main.go deleted file mode 100644 index 990e84c..0000000 --- a/bucket-lines/main.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - "path/filepath" - - "rescribe.xyz/go.git/lib/hocr" - "rescribe.xyz/go.git/lib/line" - "rescribe.xyz/go.git/lib/prob" -) - -func main() { - b := BucketSpecs{ - // minimum confidence, name - {0, "bad"}, - {0.95, "95to98"}, - {0.98, "98plus"}, - } - - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") - fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") - fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") - fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") - fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") - fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") - fmt.Fprintf(os.Stderr, "option.\n") - fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") - fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") - flag.PrintDefaults() - fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") - fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") - } - dir := flag.String("d", "buckets", "Directory to store the buckets") - specs := flag.String("s", "", "JSON file describing specs to bucket into") - flag.Parse() - if flag.NArg() < 1 { - flag.Usage() - os.Exit(1) - } - - if *specs != "" { - js, err := ioutil.ReadFile(*specs) - if err != nil { - log.Fatal(err) - } - err = json.Unmarshal(js, &b) - if err != nil { - log.Fatal(err) - } - } - - var err error - lines := make(line.Details, 0) - - for _, f := range flag.Args() { - var newlines line.Details - switch ext := filepath.Ext(f); ext { - case ".prob": - newlines, err = prob.GetLineDetails(f) - case ".hocr": - newlines, err = hocr.GetLineDetails(f) - default: - log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) - continue - } - if err != nil { - log.Fatal(err) - } - - for _, l := range newlines { - lines = append(lines, l) - } - } - - stats, err := BucketUp(lines, b, *dir) - if err != nil { - log.Fatal(err) - } - - PrintBucketStats(os.Stdout, stats) -} diff --git a/cmd/binarize/main.go b/cmd/binarize/main.go new file mode 100644 index 0000000..301e42b --- /dev/null +++ b/cmd/binarize/main.go @@ -0,0 +1,78 @@ +package main + +import ( + "flag" + "fmt" + "image" + "image/draw" + _ "image/jpeg" + "image/png" + "log" + "os" + + "rescribe.xyz/preproc" +) + +// TODO: do more testing to see how good this assumption is +func autowsize(bounds image.Rectangle) int { + return bounds.Dx() / 60 +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: binarize [-k num] [-t type] [-w num] inimg outimg\n") + flag.PrintDefaults() + } + wsize := flag.Int("w", 0, "Window size for sauvola algorithm. Set automatically based on resolution if not set.") + ksize := flag.Float64("k", 0.5, "K for sauvola algorithm. This controls the overall threshold level. Set it lower for very light text (try 0.1 or 0.2).") + btype := flag.String("t", "binary", "Type of threshold. binary or zeroinv are currently implemented.") + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + img, _, err := image.Decode(f) + if err != nil { + log.Fatalf("Could not decode image: %v\n", err) + } + b := img.Bounds() + gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) + draw.Draw(gray, b, img, b.Min, draw.Src) + + if *wsize == 0 { + *wsize = autowsize(b) + log.Printf("Set window size to %d\n", *wsize) + } + + if *wsize%2 == 0 { + *wsize++ + } + + // TODO: come up with a way to set a good ksize automatically + + var thresh image.Image + thresh = preproc.IntegralSauvola(gray, *ksize, *wsize) + + if *btype == "zeroinv" { + thresh, err = preproc.BinToZeroInv(thresh.(*image.Gray), img.(*image.RGBA)) + if err != nil { + log.Fatal(err) + } + } + + f, err = os.Create(flag.Arg(1)) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err) + } + defer f.Close() + err = png.Encode(f, thresh) + if err != nil { + log.Fatalf("Could not encode image: %v\n", err) + } +} diff --git a/cmd/preproc/main.go b/cmd/preproc/main.go new file mode 100644 index 0000000..5d71a62 --- /dev/null +++ b/cmd/preproc/main.go @@ -0,0 +1,90 @@ +package main + +// TODO: come up with a way to set a good ksize automatically + +import ( + "flag" + "fmt" + "image" + "image/draw" + _ "image/jpeg" + "image/png" + "log" + "os" + + "rescribe.xyz/preproc" +) + +// TODO: do more testing to see how good this assumption is +func autowsize(bounds image.Rectangle) int { + return bounds.Dx() / 60 +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: preproc [-bt bintype] [-bw winsize] [-k num] [-m minperc] [-nowipe] [-wt wipethresh] [-ws wipesize] inimg outimg\n") + fmt.Fprintf(os.Stderr, "Binarize and preprocess an image\n") + flag.PrintDefaults() + } + binwsize := flag.Int("bw", 0, "Window size for sauvola binarization algorithm. Set automatically based on resolution if not set.") + ksize := flag.Float64("k", 0.5, "K for sauvola binarization algorithm. This controls the overall threshold level. Set it lower for very light text (try 0.1 or 0.2).") + btype := flag.String("bt", "binary", "Type of binarization threshold. binary or zeroinv are currently implemented.") + min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.") + nowipe := flag.Bool("nowipe", false, "Disable wiping completely.") + wipewsize := flag.Int("ws", 5, "Window size for wiping algorithm.") + thresh := flag.Float64("wt", 0.05, "Threshold for the wiping algorithm to determine the proportion of black pixels below which a window is determined to be the edge.") + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + img, _, err := image.Decode(f) + if err != nil { + log.Fatalf("Could not decode image: %v\n", err) + } + b := img.Bounds() + gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) + draw.Draw(gray, b, img, b.Min, draw.Src) + + if *binwsize == 0 { + *binwsize = autowsize(b) + } + + if *binwsize%2 == 0 { + *binwsize++ + } + + log.Print("Binarising") + var clean, threshimg image.Image + threshimg = preproc.IntegralSauvola(gray, *ksize, *binwsize) + + if *btype == "zeroinv" { + threshimg, err = preproc.BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA)) + if err != nil { + log.Fatal(err) + } + } + + if !*nowipe { + log.Print("Wiping sides") + clean = preproc.Wipe(threshimg.(*image.Gray), *wipewsize, *thresh, *min) + } else { + clean = threshimg + } + + f, err = os.Create(flag.Arg(1)) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err) + } + defer f.Close() + err = png.Encode(f, clean) + if err != nil { + log.Fatalf("Could not encode image: %v\n", err) + } +} diff --git a/cmd/preprocmulti/main.go b/cmd/preprocmulti/main.go new file mode 100644 index 0000000..eb9c018 --- /dev/null +++ b/cmd/preprocmulti/main.go @@ -0,0 +1,101 @@ +package main + +// TODO: come up with a way to set a good ksize automatically + +import ( + "flag" + "fmt" + "image" + "image/draw" + _ "image/jpeg" + "image/png" + "log" + "os" + + "rescribe.xyz/preproc" + "rescribe.xyz/preproc/integralimg" +) + +// TODO: do more testing to see how good this assumption is +func autowsize(bounds image.Rectangle) int { + return bounds.Dx() / 60 +} + +func main() { + ksizes := []float64{0.1, 0.2, 0.4, 0.5} + + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: preprocmulti [-bt bintype] [-bw winsize] [-m minperc] [-nowipe] [-ws wipesize] inimg outbase\n") + fmt.Fprintf(os.Stderr, "Binarize and preprocess an image, with multiple binarisation levels,\n") + fmt.Fprintf(os.Stderr, "saving images to outbase_bin{k}.png.\n") + fmt.Fprintf(os.Stderr, "Binarises with these levels for k: %v.\n", ksizes) + flag.PrintDefaults() + } + binwsize := flag.Int("bw", 0, "Window size for sauvola binarization algorithm. Set automatically based on resolution if not set.") + btype := flag.String("bt", "binary", "Type of binarization threshold. binary or zeroinv are currently implemented.") + min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.") + nowipe := flag.Bool("nowipe", false, "Disable wiping completely.") + wipewsize := flag.Int("ws", 5, "Window size for wiping algorithm.") + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + log.Printf("Opening %s\n", flag.Arg(0)) + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + img, _, err := image.Decode(f) + if err != nil { + log.Fatalf("Could not decode image: %v\n", err) + } + b := img.Bounds() + gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) + draw.Draw(gray, b, img, b.Min, draw.Src) + + if *binwsize == 0 { + *binwsize = autowsize(b) + } + + if *binwsize%2 == 0 { + *binwsize++ + } + + var clean, threshimg image.Image + log.Print("Precalculating integral images") + integrals := integralimg.ToAllIntegralImg(gray) + + for _, k := range ksizes { + log.Print("Binarising") + threshimg = preproc.PreCalcedSauvola(integrals, gray, k, *binwsize) + + if *btype == "zeroinv" { + threshimg, err = preproc.BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA)) + if err != nil { + log.Fatal(err) + } + } + + if !*nowipe { + log.Print("Wiping sides") + clean = preproc.Wipe(threshimg.(*image.Gray), *wipewsize, k*0.02, *min) + } else { + clean = threshimg + } + + savefn := fmt.Sprintf("%s_bin%0.1f.png", flag.Arg(1), k) + log.Printf("Saving %s\n", savefn) + f, err = os.Create(savefn) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", savefn, err) + } + defer f.Close() + err = png.Encode(f, clean) + if err != nil { + log.Fatalf("Could not encode image: %v\n", err) + } + } +} diff --git a/cmd/wipe/main.go b/cmd/wipe/main.go new file mode 100644 index 0000000..6254946 --- /dev/null +++ b/cmd/wipe/main.go @@ -0,0 +1,55 @@ +package main + +import ( + "flag" + "fmt" + "image" + "image/draw" + _ "image/jpeg" + "image/png" + "log" + "os" + + "rescribe.xyz/preproc" +) + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: wipe [-m minperc] [-t thresh] [-w winsize] inimg outimg\n") + fmt.Fprintf(os.Stderr, "Wipes the sections of an image which are outside the content area.\n") + flag.PrintDefaults() + } + min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.") + thresh := flag.Float64("t", 0.05, "Threshold for the proportion of black pixels below which a window is determined to be the edge. Higher means more aggressive wiping.") + wsize := flag.Int("w", 5, "Window size for mask finding algorithm.") + flag.Parse() + if flag.NArg() < 2 { + flag.Usage() + os.Exit(1) + } + + f, err := os.Open(flag.Arg(0)) + defer f.Close() + if err != nil { + log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) + } + img, _, err := image.Decode(f) + if err != nil { + log.Fatalf("Could not decode image: %v\n", err) + } + b := img.Bounds() + gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) + draw.Draw(gray, b, img, b.Min, draw.Src) + + clean := preproc.Wipe(gray, *wsize, *thresh, *min) + + f, err = os.Create(flag.Arg(1)) + if err != nil { + log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err) + } + defer f.Close() + err = png.Encode(f, clean) + if err != nil { + log.Fatalf("Could not encode image: %v\n", err) + } +} diff --git a/dehyphenate/main.go b/dehyphenate/main.go deleted file mode 100644 index 4393c8f..0000000 --- a/dehyphenate/main.go +++ /dev/null @@ -1,63 +0,0 @@ -package main - -import ( - "encoding/xml" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - - "rescribe.xyz/go.git/lib/hocr" -) - -// BUGS: -// - loses all elements not captured in hocr structure such as html headings -// might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured -// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy -// - need to handle OcrChar - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") - fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 2 { - flag.Usage() - os.Exit(1) - } - - in, err := ioutil.ReadFile(flag.Arg(0)) - if err != nil { - log.Fatalf("Error reading %s: %v", flag.Arg(1), err) - } - h, err := hocr.Parse(in) - if err != nil { - log.Fatal(err) - } - - for i, l := range h.Lines { - w := l.Words[len(l.Words)-1] - if len(w.Chars) == 0 { - if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { - h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text - h.Lines[i+1].Words[0].Text = "" - } - } else { - log.Printf("TODO: handle OcrChar") - } - } - - f, err := os.Create(flag.Arg(1)) - if err != nil { - log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) - } - defer f.Close() - e := xml.NewEncoder(f) - err = e.Encode(h) - if err != nil { - log.Fatalf("Error encoding XML: %v", err) - } -} diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go deleted file mode 100644 index 2761cd9..0000000 --- a/eeboxmltohocr/main.go +++ /dev/null @@ -1,135 +0,0 @@ -package main - -import ( - "bufio" - "flag" - "fmt" - "io" - "log" - "os" - "regexp" - "strconv" - "strings" -) - -// splitByPb is a split function for the scanner that splits by the -// '= 0 { - return i + 1, data[0:i], nil - } - // If we're at EOF, we have a final section, so just return the lot. - if atEOF { - return len(data), data, nil - } - // Request more data. - return 0, nil, nil -} - -type Page struct { - number int - text string -} - -func addPage(pgs *[]Page, number int, text string) { - added := 0 - for i, pg := range *pgs { - if pg.number == number { - (*pgs)[i].text = pg.text + text - added = 1 - } - } - if added == 0 { - newpg := Page{number, text} - *pgs = append(*pgs, newpg) - } -} - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() < 2 { - flag.Usage() - os.Exit(1) - } - - f, err := os.Open(flag.Arg(0)) - defer f.Close() - if err != nil { - log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) - } - scanner := bufio.NewScanner(f) - - scanner.Split(splitByPb) - - var pgs []Page - - for scanner.Scan() { - t := scanner.Text() - r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t) - if len(r) <= 1 { - continue - } - pgnum, err := strconv.Atoi(r[1]) - if err != nil { - continue - } - - content := t[strings.Index(t, ">")+1:] - ungap := regexp.MustCompile(`(?s)].+?`).ReplaceAllString(content, "") - unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") - - finaltxt := strings.TrimLeft(unxml, " \n") - if len(finaltxt) == 0 { - continue - } - - addPage(&pgs, pgnum, finaltxt) - } - - for _, pg := range pgs { - fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1) - f, err := os.Create(fn) - if err != nil { - log.Fatalf("Could not create file %s: %v\n", fn, err) - } - defer f.Close() - - _, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter) - if err != nil { - log.Fatalf("Could not write file %s: %v\n", fn, err) - } - } -} - -const hocrHeader = ` - - - - - - - - - -
-
-

- - ` - -const hocrFooter = ` - -

-
-
- -` diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go deleted file mode 100644 index 6821a9e..0000000 --- a/hocrtotxt/main.go +++ /dev/null @@ -1,30 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os" - - "rescribe.xyz/go.git/lib/hocr" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n") - fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 1 { - flag.Usage() - os.Exit(1) - } - - text, err := hocr.GetText(flag.Arg(0)) - if err != nil { - log.Fatal(err) - } - - fmt.Printf("%s\n", text) -} diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go deleted file mode 100644 index dcd0494..0000000 --- a/lib/hocr/hocr.go +++ /dev/null @@ -1,129 +0,0 @@ -package hocr - -import ( - "encoding/xml" - "errors" - "io/ioutil" - "regexp" - "strconv" - "strings" -) - -type Hocr struct { - Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Words []OcrWord `xml:"span"` - Text string `xml:",chardata"` -} - -type OcrWord struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Chars []OcrChar `xml:"span"` - Text string `xml:",chardata"` -} - -type OcrChar struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Chars []OcrChar `xml:"span"` - Text string `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { - re, err := regexp.Compile(`x_wconf ([0-9.]+)`) - if err != nil { - return 0.0, err - } - conf := re.FindStringSubmatch(s) - return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { - var coords [4]int - re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) - if err != nil { - return coords, err - } - coordstr := re.FindStringSubmatch(s) - for i := range coords { - c, err := strconv.Atoi(coordstr[i+1]) - if err != nil { - return coords, err - } - coords[i] = c - } - return coords, nil -} - -func noText(s string) bool { - t := strings.Trim(s, " \n") - return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { - var hocr Hocr - - err := xml.Unmarshal(b, &hocr) - if err != nil { - return hocr, err - } - - return hocr, nil -} - -func GetText(hocrfn string) (string, error) { - var s string - - file, err := ioutil.ReadFile(hocrfn) - if err != nil { - return s, err - } - - h, err := Parse(file) - if err != nil { - return s, err - } - - - for _, l := range h.Lines { - s += getLineText(l) - } - return s, nil -} - -func GetAvgConf(hocrfn string) (float64, error) { - file, err := ioutil.ReadFile(hocrfn) - if err != nil { - return 0, err - } - - h, err := Parse(file) - if err != nil { - return 0, err - } - - var total, num float64 - for _, l := range h.Lines { - for _, w := range l.Words { - c, err := wordConf(w.Title) - if err != nil { - return 0, err - } - total += c - num++ - } - } - if num == 0 { - return 0, errors.New("No words found") - } - return total / num, nil -} diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go deleted file mode 100644 index 74e8f9a..0000000 --- a/lib/hocr/lines.go +++ /dev/null @@ -1,131 +0,0 @@ -package hocr - -// TODO: Parse line name to zero pad line numbers, so they can -// be sorted easily - -import ( - "image" - "image/png" - "io/ioutil" - "log" - "os" - "path/filepath" - "strings" - - "rescribe.xyz/go.git/lib/line" -) - -func getLineText(l OcrLine) (string) { - linetext := "" - - linetext = l.Text - if noText(linetext) { - linetext = "" - for _, w := range l.Words { - if w.Class != "ocrx_word" { - continue - } - linetext += w.Text + " " - } - } - if noText(linetext) { - linetext = "" - for _, w := range l.Words { - if w.Class != "ocrx_word" { - continue - } - for _, c := range w.Chars { - if c.Class != "ocrx_cinfo" { - continue - } - linetext += c.Text - } - linetext += " " - } - } - linetext = strings.TrimRight(linetext, " ") - linetext += "\n" - return linetext -} - -func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) { - lines := make(line.Details, 0) - - for _, l := range h.Lines { - totalconf := float64(0) - num := 0 - for _, w := range l.Words { - c, err := wordConf(w.Title) - if err != nil { - return lines, err - } - num++ - totalconf += c - } - - coords, err := boxCoords(l.Title) - if err != nil { - return lines, err - } - - var ln line.Detail - ln.Name = l.Id - ln.Avgconf = (totalconf / float64(num)) / 100 - ln.Text = getLineText(l) - ln.OcrName = name - if i != nil { - var imgd line.ImgDirect - imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) - ln.Img = imgd - } - lines = append(lines, ln) - } - return lines, nil -} - -func GetLineDetails(hocrfn string) (line.Details, error) { - var newlines line.Details - - file, err := ioutil.ReadFile(hocrfn) - if err != nil { - return newlines, err - } - - h, err := Parse(file) - if err != nil { - return newlines, err - } - - var img image.Image - pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) - pngf, err := os.Open(pngfn) - if err != nil { - log.Println("Warning: can't open image %s\n", pngfn) - } else { - defer pngf.Close() - img, err = png.Decode(pngf) - if err != nil { - log.Println("Warning: can't load image %s\n", pngfn) - } - } - - n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) - return parseLineDetails(h, img, n) -} - -func GetLineBasics(hocrfn string) (line.Details, error) { - var newlines line.Details - - file, err := ioutil.ReadFile(hocrfn) - if err != nil { - return newlines, err - } - - h, err := Parse(file) - if err != nil { - return newlines, err - } - - n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) - return parseLineDetails(h, nil, n) -} diff --git a/lib/line/line.go b/lib/line/line.go deleted file mode 100644 index d4e3e44..0000000 --- a/lib/line/line.go +++ /dev/null @@ -1,57 +0,0 @@ -package line - -import ( - "image" - "image/png" - "io" - "os" -) - -type Detail struct { - Name string - Avgconf float64 - Img CopyableImg - Text string - OcrName string -} - -type CopyableImg interface { - CopyLineTo(io.Writer) error -} - -type Details []Detail - -func (l Details) Len() int { return len(l) } -func (l Details) Less(i, j int) bool { return l[i].Avgconf < l[j].Avgconf } -func (l Details) Swap(i, j int) { l[i], l[j] = l[j], l[i] } - -// This is an implementation of the CopyableImg interface that -// stores the image directly as an image.Image -type ImgDirect struct { - Img image.Image -} - -func (i ImgDirect) CopyLineTo(w io.Writer) error { - err := png.Encode(w, i.Img) - if err != nil { - return err - } - return nil -} - -// This is an implementation of the CopyableImg interface that -// stores the path of an image -type ImgPath struct { - Path string -} - -func (i ImgPath) CopyLineTo(w io.Writer) error { - f, err := os.Open(i.Path) - if err != nil { - return err - } - defer f.Close() - - _, err = io.Copy(w, f) - return err -} diff --git a/lib/prob/prob.go b/lib/prob/prob.go deleted file mode 100644 index 31a484d..0000000 --- a/lib/prob/prob.go +++ /dev/null @@ -1,69 +0,0 @@ -package prob - -import ( - "io/ioutil" - "path/filepath" - "strconv" - "strings" - - "rescribe.xyz/go.git/lib/line" -) - -func getLineAvg(f string) (float64, error) { - totalconf := float64(0) - num := 0 - - prob, err := ioutil.ReadFile(f) - if err != nil { - return 0, err - } - - for _, l := range strings.Split(string(prob), "\n") { - fields := strings.Fields(l) - - if len(fields) == 2 { - conf, err := strconv.ParseFloat(fields[1], 64) - if err != nil { - continue - } - totalconf += conf - num += 1 - } - } - if num <= 0 { - return 0, nil - } - avg := totalconf / float64(num) - return avg, nil -} - -// Note this only processes one line at a time -func GetLineDetails(probfn string) (line.Details, error) { - var l line.Detail - lines := make(line.Details, 0) - - avg, err := getLineAvg(probfn) - if err != nil { - return lines, err - } - - filebase := strings.Replace(probfn, ".prob", "", 1) - - txt, err := ioutil.ReadFile(filebase + ".txt") - if err != nil { - return lines, err - } - - l.Name = filepath.Base(filebase) - l.Avgconf = avg - l.Text = string(txt) - l.OcrName = filepath.Base(filepath.Dir(filebase)) - - var imgfn line.ImgPath - imgfn.Path = filebase + ".bin.png" - l.Img = imgfn - - lines = append(lines, l) - - return lines, nil -} diff --git a/pgconf/main.go b/pgconf/main.go deleted file mode 100644 index bc09c23..0000000 --- a/pgconf/main.go +++ /dev/null @@ -1,30 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "os" - - "rescribe.xyz/go.git/lib/hocr" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: pgconf hocr\n") - fmt.Fprintf(os.Stderr, "Prints the total confidence for a page, as an average of the confidence of each word.\n") - flag.PrintDefaults() - } - flag.Parse() - if flag.NArg() != 1 { - flag.Usage() - os.Exit(1) - } - - avg, err := hocr.GetAvgConf(flag.Arg(0)) - if err != nil { - log.Fatalf("Error retreiving confidence for %s: %v\n", flag.Arg(0), err) - } - - fmt.Printf("%0.0f\n", avg) -} diff --git a/preproc/cmd/binarize/main.go b/preproc/cmd/binarize/main.go deleted file mode 100644 index e7f677e..0000000 --- a/preproc/cmd/binarize/main.go +++ /dev/null @@ -1,78 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "image" - "image/draw" - _ "image/jpeg" - "image/png" - "log" - "os" - - "rescribe.xyz/go.git/preproc" -) - -// TODO: do more testing to see how good this assumption is -func autowsize(bounds image.Rectangle) int { - return bounds.Dx() / 60 -} - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: binarize [-k num] [-t type] [-w num] inimg outimg\n") - flag.PrintDefaults() - } - wsize := flag.Int("w", 0, "Window size for sauvola algorithm. Set automatically based on resolution if not set.") - ksize := flag.Float64("k", 0.5, "K for sauvola algorithm. This controls the overall threshold level. Set it lower for very light text (try 0.1 or 0.2).") - btype := flag.String("t", "binary", "Type of threshold. binary or zeroinv are currently implemented.") - flag.Parse() - if flag.NArg() < 2 { - flag.Usage() - os.Exit(1) - } - - f, err := os.Open(flag.Arg(0)) - defer f.Close() - if err != nil { - log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) - } - img, _, err := image.Decode(f) - if err != nil { - log.Fatalf("Could not decode image: %v\n", err) - } - b := img.Bounds() - gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) - draw.Draw(gray, b, img, b.Min, draw.Src) - - if *wsize == 0 { - *wsize = autowsize(b) - log.Printf("Set window size to %d\n", *wsize) - } - - if *wsize%2 == 0 { - *wsize++ - } - - // TODO: come up with a way to set a good ksize automatically - - var thresh image.Image - thresh = preproc.IntegralSauvola(gray, *ksize, *wsize) - - if *btype == "zeroinv" { - thresh, err = preproc.BinToZeroInv(thresh.(*image.Gray), img.(*image.RGBA)) - if err != nil { - log.Fatal(err) - } - } - - f, err = os.Create(flag.Arg(1)) - if err != nil { - log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err) - } - defer f.Close() - err = png.Encode(f, thresh) - if err != nil { - log.Fatalf("Could not encode image: %v\n", err) - } -} diff --git a/preproc/cmd/preproc/main.go b/preproc/cmd/preproc/main.go deleted file mode 100644 index 1c248e0..0000000 --- a/preproc/cmd/preproc/main.go +++ /dev/null @@ -1,90 +0,0 @@ -package main - -// TODO: come up with a way to set a good ksize automatically - -import ( - "flag" - "fmt" - "image" - "image/draw" - _ "image/jpeg" - "image/png" - "log" - "os" - - "rescribe.xyz/go.git/preproc" -) - -// TODO: do more testing to see how good this assumption is -func autowsize(bounds image.Rectangle) int { - return bounds.Dx() / 60 -} - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: preproc [-bt bintype] [-bw winsize] [-k num] [-m minperc] [-nowipe] [-wt wipethresh] [-ws wipesize] inimg outimg\n") - fmt.Fprintf(os.Stderr, "Binarize and preprocess an image\n") - flag.PrintDefaults() - } - binwsize := flag.Int("bw", 0, "Window size for sauvola binarization algorithm. Set automatically based on resolution if not set.") - ksize := flag.Float64("k", 0.5, "K for sauvola binarization algorithm. This controls the overall threshold level. Set it lower for very light text (try 0.1 or 0.2).") - btype := flag.String("bt", "binary", "Type of binarization threshold. binary or zeroinv are currently implemented.") - min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.") - nowipe := flag.Bool("nowipe", false, "Disable wiping completely.") - wipewsize := flag.Int("ws", 5, "Window size for wiping algorithm.") - thresh := flag.Float64("wt", 0.05, "Threshold for the wiping algorithm to determine the proportion of black pixels below which a window is determined to be the edge.") - flag.Parse() - if flag.NArg() < 2 { - flag.Usage() - os.Exit(1) - } - - f, err := os.Open(flag.Arg(0)) - defer f.Close() - if err != nil { - log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) - } - img, _, err := image.Decode(f) - if err != nil { - log.Fatalf("Could not decode image: %v\n", err) - } - b := img.Bounds() - gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) - draw.Draw(gray, b, img, b.Min, draw.Src) - - if *binwsize == 0 { - *binwsize = autowsize(b) - } - - if *binwsize%2 == 0 { - *binwsize++ - } - - log.Print("Binarising") - var clean, threshimg image.Image - threshimg = preproc.IntegralSauvola(gray, *ksize, *binwsize) - - if *btype == "zeroinv" { - threshimg, err = preproc.BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA)) - if err != nil { - log.Fatal(err) - } - } - - if !*nowipe { - log.Print("Wiping sides") - clean = preproc.Wipe(threshimg.(*image.Gray), *wipewsize, *thresh, *min) - } else { - clean = threshimg - } - - f, err = os.Create(flag.Arg(1)) - if err != nil { - log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err) - } - defer f.Close() - err = png.Encode(f, clean) - if err != nil { - log.Fatalf("Could not encode image: %v\n", err) - } -} diff --git a/preproc/cmd/preprocmulti/main.go b/preproc/cmd/preprocmulti/main.go deleted file mode 100644 index c6c9fe4..0000000 --- a/preproc/cmd/preprocmulti/main.go +++ /dev/null @@ -1,101 +0,0 @@ -package main - -// TODO: come up with a way to set a good ksize automatically - -import ( - "flag" - "fmt" - "image" - "image/draw" - _ "image/jpeg" - "image/png" - "log" - "os" - - "rescribe.xyz/go.git/integralimg" - "rescribe.xyz/go.git/preproc" -) - -// TODO: do more testing to see how good this assumption is -func autowsize(bounds image.Rectangle) int { - return bounds.Dx() / 60 -} - -func main() { - ksizes := []float64{0.1, 0.2, 0.4, 0.5} - - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: preprocmulti [-bt bintype] [-bw winsize] [-m minperc] [-nowipe] [-ws wipesize] inimg outbase\n") - fmt.Fprintf(os.Stderr, "Binarize and preprocess an image, with multiple binarisation levels,\n") - fmt.Fprintf(os.Stderr, "saving images to outbase_bin{k}.png.\n") - fmt.Fprintf(os.Stderr, "Binarises with these levels for k: %v.\n", ksizes) - flag.PrintDefaults() - } - binwsize := flag.Int("bw", 0, "Window size for sauvola binarization algorithm. Set automatically based on resolution if not set.") - btype := flag.String("bt", "binary", "Type of binarization threshold. binary or zeroinv are currently implemented.") - min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.") - nowipe := flag.Bool("nowipe", false, "Disable wiping completely.") - wipewsize := flag.Int("ws", 5, "Window size for wiping algorithm.") - flag.Parse() - if flag.NArg() < 2 { - flag.Usage() - os.Exit(1) - } - - log.Printf("Opening %s\n", flag.Arg(0)) - f, err := os.Open(flag.Arg(0)) - defer f.Close() - if err != nil { - log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) - } - img, _, err := image.Decode(f) - if err != nil { - log.Fatalf("Could not decode image: %v\n", err) - } - b := img.Bounds() - gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) - draw.Draw(gray, b, img, b.Min, draw.Src) - - if *binwsize == 0 { - *binwsize = autowsize(b) - } - - if *binwsize%2 == 0 { - *binwsize++ - } - - var clean, threshimg image.Image - log.Print("Precalculating integral images") - integrals := integralimg.ToAllIntegralImg(gray) - - for _, k := range ksizes { - log.Print("Binarising") - threshimg = preproc.PreCalcedSauvola(integrals, gray, k, *binwsize) - - if *btype == "zeroinv" { - threshimg, err = preproc.BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA)) - if err != nil { - log.Fatal(err) - } - } - - if !*nowipe { - log.Print("Wiping sides") - clean = preproc.Wipe(threshimg.(*image.Gray), *wipewsize, k*0.02, *min) - } else { - clean = threshimg - } - - savefn := fmt.Sprintf("%s_bin%0.1f.png", flag.Arg(1), k) - log.Printf("Saving %s\n", savefn) - f, err = os.Create(savefn) - if err != nil { - log.Fatalf("Could not create file %s: %v\n", savefn, err) - } - defer f.Close() - err = png.Encode(f, clean) - if err != nil { - log.Fatalf("Could not encode image: %v\n", err) - } - } -} diff --git a/preproc/cmd/wipe/main.go b/preproc/cmd/wipe/main.go deleted file mode 100644 index e5c039d..0000000 --- a/preproc/cmd/wipe/main.go +++ /dev/null @@ -1,55 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "image" - "image/draw" - _ "image/jpeg" - "image/png" - "log" - "os" - - "rescribe.xyz/go.git/preproc" -) - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: wipe [-m minperc] [-t thresh] [-w winsize] inimg outimg\n") - fmt.Fprintf(os.Stderr, "Wipes the sections of an image which are outside the content area.\n") - flag.PrintDefaults() - } - min := flag.Int("m", 30, "Minimum percentage of the image width for the content width calculation to be considered valid.") - thresh := flag.Float64("t", 0.05, "Threshold for the proportion of black pixels below which a window is determined to be the edge. Higher means more aggressive wiping.") - wsize := flag.Int("w", 5, "Window size for mask finding algorithm.") - flag.Parse() - if flag.NArg() < 2 { - flag.Usage() - os.Exit(1) - } - - f, err := os.Open(flag.Arg(0)) - defer f.Close() - if err != nil { - log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) - } - img, _, err := image.Decode(f) - if err != nil { - log.Fatalf("Could not decode image: %v\n", err) - } - b := img.Bounds() - gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) - draw.Draw(gray, b, img, b.Min, draw.Src) - - clean := preproc.Wipe(gray, *wsize, *thresh, *min) - - f, err = os.Create(flag.Arg(1)) - if err != nil { - log.Fatalf("Could not create file %s: %v\n", flag.Arg(1), err) - } - defer f.Close() - err = png.Encode(f, clean) - if err != nil { - log.Fatalf("Could not encode image: %v\n", err) - } -} diff --git a/preproc/preprocmulti.go b/preproc/preprocmulti.go deleted file mode 100644 index 2e7cb06..0000000 --- a/preproc/preprocmulti.go +++ /dev/null @@ -1,94 +0,0 @@ -package preproc - -// TODO: come up with a way to set a good ksize automatically - -import ( - "fmt" - "image" - "image/draw" - _ "image/jpeg" - "image/png" - "os" - "strings" - - "rescribe.xyz/go.git/integralimg" -) - -// TODO: do more testing to see how good this assumption is -func autowsize(bounds image.Rectangle) int { - return bounds.Dx() / 60 -} - -// PreProcMulti binarizes and preprocesses an image with multiple binarisation levels. -// inPath: Path of input image. -// ksizes: Slice of k values to pass to Sauvola algorithm -// binType: Type of binarization threshold. binary or zeroinv are currently implemented. -// binWsize: Window size for sauvola binarization algorithm. Set automatically based on resolution if 0. -// wipe: Whether to wipe (clear sides) the image -// wipeWsize: Window size for wiping algorithm -// wipeMinWidthPerc: Minimum percentage of the image width for the content width calculation to be considered valid -// Note: copied from cmd/preprocmulti/main.go, should think about the best way -// to organise this code later. -// TODO: return errors that encapsulate the err describing where it was encountered -// TODO: do the post-integral image stuff in separate goroutines for speed -func PreProcMulti(inPath string, ksizes []float64, binType string, binWsize int, wipe bool, wipeWsize int, wipeMinWidthPerc int) ([]string, error) { - // Make outBase inPath up to final . - s := strings.Split(inPath, ".") - outBase := strings.Join(s[:len(s)-1], "") - - var donePaths []string - - f, err := os.Open(inPath) - if err != nil { - return donePaths, err - } - defer f.Close() - img, _, err := image.Decode(f) - if err != nil { - return donePaths, err - } - b := img.Bounds() - gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) - draw.Draw(gray, b, img, b.Min, draw.Src) - - if binWsize == 0 { - binWsize = autowsize(b) - } - - if binWsize%2 == 0 { - binWsize++ - } - - var clean, threshimg image.Image - integrals := integralimg.ToAllIntegralImg(gray) - - for _, k := range ksizes { - threshimg = PreCalcedSauvola(integrals, gray, k, binWsize) - - if binType == "zeroinv" { - threshimg, err = BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA)) - if err != nil { - return donePaths, err - } - } - - if wipe { - clean = Wipe(threshimg.(*image.Gray), wipeWsize, k*0.02, wipeMinWidthPerc) - } else { - clean = threshimg - } - - savefn := fmt.Sprintf("%s_bin%0.1f.png", outBase, k) - f, err = os.Create(savefn) - if err != nil { - return donePaths, err - } - defer f.Close() - err = png.Encode(f, clean) - if err != nil { - return donePaths, err - } - donePaths = append(donePaths, savefn) - } - return donePaths, nil -} diff --git a/preproc/sauvola.go b/preproc/sauvola.go deleted file mode 100644 index 046bb7d..0000000 --- a/preproc/sauvola.go +++ /dev/null @@ -1,76 +0,0 @@ -package preproc - -import ( - "image" - "image/color" - - "rescribe.xyz/go.git/integralimg" -) - -// Implements Sauvola's algorithm for text binarization, see paper -// "Adaptive document image binarization" (2000) -func Sauvola(img *image.Gray, ksize float64, windowsize int) *image.Gray { - b := img.Bounds() - new := image.NewGray(b) - - for y := b.Min.Y; y < b.Max.Y; y++ { - for x := b.Min.X; x < b.Max.X; x++ { - window := surrounding(img, x, y, windowsize) - m, dev := meanstddev(window) - threshold := m * (1 + ksize*((dev/128)-1)) - if img.GrayAt(x, y).Y < uint8(threshold) { - new.SetGray(x, y, color.Gray{0}) - } else { - new.SetGray(x, y, color.Gray{255}) - } - } - } - - return new -} - -// Implements Sauvola's algorithm using Integral Images, see paper -// "Efficient Implementation of Local Adaptive Thresholding Techniques Using Integral Images" -// and -// https://stackoverflow.com/questions/13110733/computing-image-integral -func IntegralSauvola(img *image.Gray, ksize float64, windowsize int) *image.Gray { - b := img.Bounds() - new := image.NewGray(b) - - integrals := integralimg.ToAllIntegralImg(img) - - for y := b.Min.Y; y < b.Max.Y; y++ { - for x := b.Min.X; x < b.Max.X; x++ { - m, dev := integrals.MeanStdDevWindow(x, y, windowsize) - threshold := m * (1 + ksize*((dev/128)-1)) - if img.GrayAt(x, y).Y < uint8(threshold) { - new.SetGray(x, y, color.Gray{0}) - } else { - new.SetGray(x, y, color.Gray{255}) - } - } - } - - return new -} - -// PreCalcedSauvola Implements Sauvola's algorithm using precalculated Integral Images -// TODO: have this be the root function that the other two reference -func PreCalcedSauvola(integrals integralimg.WithSq, img *image.Gray, ksize float64, windowsize int) *image.Gray { - b := img.Bounds() - new := image.NewGray(b) - - for y := b.Min.Y; y < b.Max.Y; y++ { - for x := b.Min.X; x < b.Max.X; x++ { - m, dev := integrals.MeanStdDevWindow(x, y, windowsize) - threshold := m * (1 + ksize*((dev/128)-1)) - if img.GrayAt(x, y).Y < uint8(threshold) { - new.SetGray(x, y, color.Gray{0}) - } else { - new.SetGray(x, y, color.Gray{255}) - } - } - } - - return new -} diff --git a/preproc/sauvola_test.go b/preproc/sauvola_test.go deleted file mode 100644 index 2331e10..0000000 --- a/preproc/sauvola_test.go +++ /dev/null @@ -1,70 +0,0 @@ -package preproc - -import ( - "flag" - "fmt" - "image" - "image/png" - "os" - "testing" -) - -func TestBinarization(t *testing.T) { - var slow = flag.Bool("slow", false, "include slow tests") - var update = flag.Bool("updatesauvola", false, "update golden files") - - cases := []struct { - name string - orig string - golden string - ksize float64 - wsize int - }{ - {"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.5_w41.png", 0.5, 41}, - {"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.5_w19.png", 0.5, 19}, - {"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.3_w19.png", 0.3, 19}, - {"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.5_w41.png", 0.5, 41}, - {"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.5_w19.png", 0.5, 19}, - {"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.3_w19.png", 0.3, 19}, - } - - for _, c := range cases { - t.Run(fmt.Sprintf("%s_%0.1f_%d", c.name, c.ksize, c.wsize), func(t *testing.T) { - var actual *image.Gray - orig, err := decode(c.orig) - if err != nil { - t.Fatalf("Could not open file %s: %v\n", c.orig, err) - } - switch c.name { - case "integralsauvola": - actual = IntegralSauvola(orig, c.ksize, c.wsize) - case "sauvola": - if *slow { - actual = Sauvola(orig, c.ksize, c.wsize) - } else { - t.Skip("Skipping slow test; use -slow to run it.\n") - } - default: - t.Fatalf("No method %s\n", c.name) - } - if *update { - f, err := os.Create(c.golden) - defer f.Close() - if err != nil { - t.Fatalf("Could not open file %s to update: %v\n", c.golden, err) - } - err = png.Encode(f, actual) - if err != nil { - t.Fatalf("Could not encode update of %s: %v\n", c.golden, err) - } - } - golden, err := decode(c.golden) - if err != nil { - t.Fatalf("Could not open file %s: %v\n", c.golden, err) - } - if !imgsequal(golden, actual) { - t.Errorf("Binarized %s differs to %s\n", c.orig, c.golden) - } - }) - } -} diff --git a/preproc/test_helpers.go b/preproc/test_helpers.go deleted file mode 100644 index 20de5b1..0000000 --- a/preproc/test_helpers.go +++ /dev/null @@ -1,53 +0,0 @@ -package preproc - -// TODO: add different pages as test cases -// TODO: test non integral img version - -import ( - "image" - "image/draw" - "image/png" - "os" -) - -func decode(s string) (*image.Gray, error) { - f, err := os.Open(s) - defer f.Close() - if err != nil { - return nil, err - } - img, err := png.Decode(f) - if err != nil { - return nil, err - } - b := img.Bounds() - gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) - draw.Draw(gray, b, img, b.Min, draw.Src) - return gray, nil -} - -func imgsequal(img1 *image.Gray, img2 *image.Gray) bool { - b := img1.Bounds() - if !b.Eq(img2.Bounds()) { - return false - } - for y := b.Min.Y; y < b.Max.Y; y++ { - for x := b.Min.X; x < b.Max.X; x++ { - r0, g0, b0, a0 := img1.At(x, y).RGBA() - r1, g1, b1, a1 := img2.At(x, y).RGBA() - if r0 != r1 { - return false - } - if g0 != g1 { - return false - } - if b0 != b1 { - return false - } - if a0 != a1 { - return false - } - } - } - return true -} diff --git a/preproc/testdata/pg1.png b/preproc/testdata/pg1.png deleted file mode 100644 index 2bcc4b1..0000000 Binary files a/preproc/testdata/pg1.png and /dev/null differ diff --git a/preproc/testdata/pg1_integralsauvola_k0.3_w19.png b/preproc/testdata/pg1_integralsauvola_k0.3_w19.png deleted file mode 100644 index bdf5712..0000000 Binary files a/preproc/testdata/pg1_integralsauvola_k0.3_w19.png and /dev/null differ diff --git a/preproc/testdata/pg1_integralsauvola_k0.5_w19.png b/preproc/testdata/pg1_integralsauvola_k0.5_w19.png deleted file mode 100644 index 5db2d9a..0000000 Binary files a/preproc/testdata/pg1_integralsauvola_k0.5_w19.png and /dev/null differ diff --git a/preproc/testdata/pg1_integralsauvola_k0.5_w41.png b/preproc/testdata/pg1_integralsauvola_k0.5_w41.png deleted file mode 100644 index 050d037..0000000 Binary files a/preproc/testdata/pg1_integralsauvola_k0.5_w41.png and /dev/null differ diff --git a/preproc/testdata/pg1_sauvola_k0.3_w19.png b/preproc/testdata/pg1_sauvola_k0.3_w19.png deleted file mode 100644 index bcd595f..0000000 Binary files a/preproc/testdata/pg1_sauvola_k0.3_w19.png and /dev/null differ diff --git a/preproc/testdata/pg1_sauvola_k0.5_w19.png b/preproc/testdata/pg1_sauvola_k0.5_w19.png deleted file mode 100644 index 8de596c..0000000 Binary files a/preproc/testdata/pg1_sauvola_k0.5_w19.png and /dev/null differ diff --git a/preproc/testdata/pg1_sauvola_k0.5_w41.png b/preproc/testdata/pg1_sauvola_k0.5_w41.png deleted file mode 100644 index b8f50e0..0000000 Binary files a/preproc/testdata/pg1_sauvola_k0.5_w41.png and /dev/null differ diff --git a/preproc/testdata/pg2.png b/preproc/testdata/pg2.png deleted file mode 100644 index c7c4249..0000000 Binary files a/preproc/testdata/pg2.png and /dev/null differ diff --git a/preproc/testdata/pg2_integralwipesides_t0.02_w5.png b/preproc/testdata/pg2_integralwipesides_t0.02_w5.png deleted file mode 100644 index 6b4ccb2..0000000 Binary files a/preproc/testdata/pg2_integralwipesides_t0.02_w5.png and /dev/null differ diff --git a/preproc/testdata/pg2_integralwipesides_t0.05_w25.png b/preproc/testdata/pg2_integralwipesides_t0.05_w25.png deleted file mode 100644 index 39dc88d..0000000 Binary files a/preproc/testdata/pg2_integralwipesides_t0.05_w25.png and /dev/null differ diff --git a/preproc/testdata/pg2_integralwipesides_t0.05_w5.png b/preproc/testdata/pg2_integralwipesides_t0.05_w5.png deleted file mode 100644 index 50df855..0000000 Binary files a/preproc/testdata/pg2_integralwipesides_t0.05_w5.png and /dev/null differ diff --git a/preproc/util.go b/preproc/util.go deleted file mode 100644 index e23829d..0000000 --- a/preproc/util.go +++ /dev/null @@ -1,95 +0,0 @@ -package preproc - -import ( - "errors" - "image" - "math" -) - -// TODO: name better; maybe verb, x-er -// TODO: implement these for regular image, and use them to make -// image functions generic for integral and non- images -type UsefulImg interface { - MeanWindow() - MeanStdDevWindow() -} - -func mean(i []int) float64 { - sum := 0 - for _, n := range i { - sum += n - } - return float64(sum) / float64(len(i)) -} - -func stddev(i []int) float64 { - m := mean(i) - - var sum float64 - for _, n := range i { - sum += (float64(n) - m) * (float64(n) - m) - } - variance := sum / float64(len(i)-1) - return math.Sqrt(variance) -} - -func meanstddev(i []int) (float64, float64) { - m := mean(i) - - var sum float64 - for _, n := range i { - sum += (float64(n) - m) * (float64(n) - m) - } - variance := float64(sum) / float64(len(i)-1) - return m, math.Sqrt(variance) -} - -// gets the pixel values surrounding a point in the image -func surrounding(img *image.Gray, x int, y int, size int) []int { - b := img.Bounds() - step := size / 2 - - miny := y - step - if miny < b.Min.Y { - miny = b.Min.Y - } - minx := x - step - if minx < b.Min.X { - minx = b.Min.X - } - maxy := y + step - if maxy > b.Max.Y { - maxy = b.Max.Y - } - maxx := x + step - if maxx > b.Max.X { - maxx = b.Max.X - } - - var s []int - for yi := miny; yi <= maxy; yi++ { - for xi := minx; xi <= maxx; xi++ { - s = append(s, int(img.GrayAt(xi, yi).Y)) - } - } - return s -} - -func BinToZeroInv(bin *image.Gray, orig *image.RGBA) (*image.RGBA, error) { - b := bin.Bounds() - if !b.Eq(orig.Bounds()) { - return orig, errors.New("bin and orig images need to be the same dimensions") - } - newimg := image.NewRGBA(image.Rect(0, 0, b.Dx(), b.Dy())) - for y := b.Min.Y; y < b.Max.Y; y++ { - for x := b.Min.X; x < b.Max.X; x++ { - if bin.GrayAt(x, y).Y == 255 { - newimg.Set(x, y, bin.GrayAt(x, y)) - } else { - newimg.Set(x, y, orig.At(x, y)) - } - } - } - - return newimg, nil -} diff --git a/preproc/wipesides.go b/preproc/wipesides.go deleted file mode 100644 index 3d08053..0000000 --- a/preproc/wipesides.go +++ /dev/null @@ -1,160 +0,0 @@ -package preproc - -// TODO: add minimum size variable (default ~30%?) -// TODO: switch to an interface rather than integralimg.I - -import ( - "errors" - "fmt" - "image" - "image/color" - "image/draw" - _ "image/jpeg" - "image/png" - "os" - - "rescribe.xyz/go.git/integralimg" -) - -// returns the proportion of the given window that is black pixels -func proportion(i integralimg.I, x int, size int) float64 { - w := i.GetVerticalWindow(x, size) - return w.Proportion() -} - -// findbestedge goes through every vertical line from x to x+w to -// find the one with the lowest proportion of black pixels. -func findbestedge(img integralimg.I, x int, w int) int { - var bestx int - var best float64 - - if w == 1 { - return x - } - - right := x + w - for ; x < right; x++ { - prop := proportion(img, x, 1) - if prop > best { - best = prop - bestx = x - } - } - - return bestx -} - -// findedges finds the edges of the main content, by moving a window of wsize -// from near the middle of the image to the left and right, stopping when it reaches -// a point at which there is a lower proportion of black pixels than thresh. -func findedges(img integralimg.I, wsize int, thresh float64) (int, int) { - maxx := len(img[0]) - 1 - var lowedge, highedge int = 0, maxx - - // don't start at the middle, as this will fail for 2 column layouts, - // start 10% left or right of the middle - notcentre := maxx / 10 - - for x := maxx/2 + notcentre; x < maxx-wsize; x++ { - if proportion(img, x, wsize) <= thresh { - highedge = findbestedge(img, x, wsize) - break - } - } - - for x := maxx/2 - notcentre; x > 0; x-- { - if proportion(img, x, wsize) <= thresh { - lowedge = findbestedge(img, x, wsize) - break - } - } - - return lowedge, highedge -} - -// wipesides fills the sections of image not within the boundaries -// of lowedge and highedge with white -func wipesides(img *image.Gray, lowedge int, highedge int) *image.Gray { - b := img.Bounds() - new := image.NewGray(b) - - // set left edge white - for x := b.Min.X; x < lowedge; x++ { - for y := b.Min.Y; y < b.Max.Y; y++ { - new.SetGray(x, y, color.Gray{255}) - } - } - // copy middle - for x := lowedge; x < highedge; x++ { - for y := b.Min.Y; y < b.Max.Y; y++ { - new.SetGray(x, y, img.GrayAt(x, y)) - } - } - // set right edge white - for x := highedge; x < b.Max.X; x++ { - for y := b.Min.Y; y < b.Max.Y; y++ { - new.SetGray(x, y, color.Gray{255}) - } - } - - return new -} - -// toonarrow checks whether the area between lowedge and highedge is -// less than min % of the total image width -func toonarrow(img *image.Gray, lowedge int, highedge int, min int) bool { - b := img.Bounds() - imgw := b.Max.X - b.Min.X - wipew := highedge - lowedge - if float64(wipew)/float64(imgw)*100 < float64(min) { - return true - } - return false -} - -// Wipe fills the sections of image which fall outside the content -// area with white, providing the content area is above min % -func Wipe(img *image.Gray, wsize int, thresh float64, min int) *image.Gray { - integral := integralimg.ToIntegralImg(img) - lowedge, highedge := findedges(integral, wsize, thresh) - if toonarrow(img, lowedge, highedge, min) { - return img - } - return wipesides(img, lowedge, highedge) -} - -// WipeFile wipes an image file, filling the sections of the image -// which fall outside the content area with white, providing the -// content area is above min %. -// inPath: path of the input image. -// outPath: path to save the output image. -// wsize: window size for wipe algorithm. -// thresh: threshold for wipe algorithm. -// min: minimum % of content area width to consider valid. -func WipeFile(inPath string, outPath string, wsize int, thresh float64, min int) error { - f, err := os.Open(inPath) - defer f.Close() - if err != nil { - return errors.New(fmt.Sprintf("Could not open file %s: %v", inPath, err)) - } - img, _, err := image.Decode(f) - if err != nil { - return errors.New(fmt.Sprintf("Could not decode image: %v", err)) - } - b := img.Bounds() - gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) - draw.Draw(gray, b, img, b.Min, draw.Src) - - clean := Wipe(gray, wsize, thresh, min) - - f, err = os.Create(outPath) - if err != nil { - return errors.New(fmt.Sprintf("Could not create file %s: %v", outPath, err)) - } - defer f.Close() - err = png.Encode(f, clean) - if err != nil { - return errors.New(fmt.Sprintf("Could not encode image: %v", err)) - } - return nil -} diff --git a/preproc/wipesides_test.go b/preproc/wipesides_test.go deleted file mode 100644 index d5464e0..0000000 --- a/preproc/wipesides_test.go +++ /dev/null @@ -1,57 +0,0 @@ -package preproc - -// TODO: add different pages as test cases -// TODO: test non integral img version - -import ( - "flag" - "fmt" - "image" - "image/png" - "os" - "testing" -) - -func TestWipeSides(t *testing.T) { - var update = flag.Bool("updatewipe", false, "update golden files") - cases := []struct { - name string - orig string - golden string - thresh float64 - wsize int - }{ - {"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.02_w5.png", 0.02, 5}, - {"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.05_w5.png", 0.05, 5}, - {"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.05_w25.png", 0.05, 25}, - } - - for _, c := range cases { - t.Run(fmt.Sprintf("%s_%0.2f_%d", c.name, c.thresh, c.wsize), func(t *testing.T) { - var actual *image.Gray - orig, err := decode(c.orig) - if err != nil { - t.Fatalf("Could not open file %s: %v\n", c.orig, err) - } - actual = Wipe(orig, c.wsize, c.thresh) - if *update { - f, err := os.Create(c.golden) - defer f.Close() - if err != nil { - t.Fatalf("Could not open file %s to update: %v\n", c.golden, err) - } - err = png.Encode(f, actual) - if err != nil { - t.Fatalf("Could not encode update of %s: %v\n", c.golden, err) - } - } - golden, err := decode(c.golden) - if err != nil { - t.Fatalf("Could not open file %s: %v\n", c.golden, err) - } - if !imgsequal(golden, actual) { - t.Errorf("Processed %s differs to %s\n", c.orig, c.golden) - } - }) - } -} diff --git a/preprocmulti.go b/preprocmulti.go new file mode 100644 index 0000000..7d5cbf5 --- /dev/null +++ b/preprocmulti.go @@ -0,0 +1,94 @@ +package preproc + +// TODO: come up with a way to set a good ksize automatically + +import ( + "fmt" + "image" + "image/draw" + _ "image/jpeg" + "image/png" + "os" + "strings" + + "rescribe.xyz/preproc/integralimg" +) + +// TODO: do more testing to see how good this assumption is +func autowsize(bounds image.Rectangle) int { + return bounds.Dx() / 60 +} + +// PreProcMulti binarizes and preprocesses an image with multiple binarisation levels. +// inPath: Path of input image. +// ksizes: Slice of k values to pass to Sauvola algorithm +// binType: Type of binarization threshold. binary or zeroinv are currently implemented. +// binWsize: Window size for sauvola binarization algorithm. Set automatically based on resolution if 0. +// wipe: Whether to wipe (clear sides) the image +// wipeWsize: Window size for wiping algorithm +// wipeMinWidthPerc: Minimum percentage of the image width for the content width calculation to be considered valid +// Note: copied from cmd/preprocmulti/main.go, should think about the best way +// to organise this code later. +// TODO: return errors that encapsulate the err describing where it was encountered +// TODO: do the post-integral image stuff in separate goroutines for speed +func PreProcMulti(inPath string, ksizes []float64, binType string, binWsize int, wipe bool, wipeWsize int, wipeMinWidthPerc int) ([]string, error) { + // Make outBase inPath up to final . + s := strings.Split(inPath, ".") + outBase := strings.Join(s[:len(s)-1], "") + + var donePaths []string + + f, err := os.Open(inPath) + if err != nil { + return donePaths, err + } + defer f.Close() + img, _, err := image.Decode(f) + if err != nil { + return donePaths, err + } + b := img.Bounds() + gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) + draw.Draw(gray, b, img, b.Min, draw.Src) + + if binWsize == 0 { + binWsize = autowsize(b) + } + + if binWsize%2 == 0 { + binWsize++ + } + + var clean, threshimg image.Image + integrals := integralimg.ToAllIntegralImg(gray) + + for _, k := range ksizes { + threshimg = PreCalcedSauvola(integrals, gray, k, binWsize) + + if binType == "zeroinv" { + threshimg, err = BinToZeroInv(threshimg.(*image.Gray), img.(*image.RGBA)) + if err != nil { + return donePaths, err + } + } + + if wipe { + clean = Wipe(threshimg.(*image.Gray), wipeWsize, k*0.02, wipeMinWidthPerc) + } else { + clean = threshimg + } + + savefn := fmt.Sprintf("%s_bin%0.1f.png", outBase, k) + f, err = os.Create(savefn) + if err != nil { + return donePaths, err + } + defer f.Close() + err = png.Encode(f, clean) + if err != nil { + return donePaths, err + } + donePaths = append(donePaths, savefn) + } + return donePaths, nil +} diff --git a/sauvola.go b/sauvola.go new file mode 100644 index 0000000..3ba4359 --- /dev/null +++ b/sauvola.go @@ -0,0 +1,76 @@ +package preproc + +import ( + "image" + "image/color" + + "rescribe.xyz/preproc/integralimg" +) + +// Implements Sauvola's algorithm for text binarization, see paper +// "Adaptive document image binarization" (2000) +func Sauvola(img *image.Gray, ksize float64, windowsize int) *image.Gray { + b := img.Bounds() + new := image.NewGray(b) + + for y := b.Min.Y; y < b.Max.Y; y++ { + for x := b.Min.X; x < b.Max.X; x++ { + window := surrounding(img, x, y, windowsize) + m, dev := meanstddev(window) + threshold := m * (1 + ksize*((dev/128)-1)) + if img.GrayAt(x, y).Y < uint8(threshold) { + new.SetGray(x, y, color.Gray{0}) + } else { + new.SetGray(x, y, color.Gray{255}) + } + } + } + + return new +} + +// Implements Sauvola's algorithm using Integral Images, see paper +// "Efficient Implementation of Local Adaptive Thresholding Techniques Using Integral Images" +// and +// https://stackoverflow.com/questions/13110733/computing-image-integral +func IntegralSauvola(img *image.Gray, ksize float64, windowsize int) *image.Gray { + b := img.Bounds() + new := image.NewGray(b) + + integrals := integralimg.ToAllIntegralImg(img) + + for y := b.Min.Y; y < b.Max.Y; y++ { + for x := b.Min.X; x < b.Max.X; x++ { + m, dev := integrals.MeanStdDevWindow(x, y, windowsize) + threshold := m * (1 + ksize*((dev/128)-1)) + if img.GrayAt(x, y).Y < uint8(threshold) { + new.SetGray(x, y, color.Gray{0}) + } else { + new.SetGray(x, y, color.Gray{255}) + } + } + } + + return new +} + +// PreCalcedSauvola Implements Sauvola's algorithm using precalculated Integral Images +// TODO: have this be the root function that the other two reference +func PreCalcedSauvola(integrals integralimg.WithSq, img *image.Gray, ksize float64, windowsize int) *image.Gray { + b := img.Bounds() + new := image.NewGray(b) + + for y := b.Min.Y; y < b.Max.Y; y++ { + for x := b.Min.X; x < b.Max.X; x++ { + m, dev := integrals.MeanStdDevWindow(x, y, windowsize) + threshold := m * (1 + ksize*((dev/128)-1)) + if img.GrayAt(x, y).Y < uint8(threshold) { + new.SetGray(x, y, color.Gray{0}) + } else { + new.SetGray(x, y, color.Gray{255}) + } + } + } + + return new +} diff --git a/sauvola_test.go b/sauvola_test.go new file mode 100644 index 0000000..2331e10 --- /dev/null +++ b/sauvola_test.go @@ -0,0 +1,70 @@ +package preproc + +import ( + "flag" + "fmt" + "image" + "image/png" + "os" + "testing" +) + +func TestBinarization(t *testing.T) { + var slow = flag.Bool("slow", false, "include slow tests") + var update = flag.Bool("updatesauvola", false, "update golden files") + + cases := []struct { + name string + orig string + golden string + ksize float64 + wsize int + }{ + {"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.5_w41.png", 0.5, 41}, + {"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.5_w19.png", 0.5, 19}, + {"integralsauvola", "testdata/pg1.png", "testdata/pg1_integralsauvola_k0.3_w19.png", 0.3, 19}, + {"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.5_w41.png", 0.5, 41}, + {"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.5_w19.png", 0.5, 19}, + {"sauvola", "testdata/pg1.png", "testdata/pg1_sauvola_k0.3_w19.png", 0.3, 19}, + } + + for _, c := range cases { + t.Run(fmt.Sprintf("%s_%0.1f_%d", c.name, c.ksize, c.wsize), func(t *testing.T) { + var actual *image.Gray + orig, err := decode(c.orig) + if err != nil { + t.Fatalf("Could not open file %s: %v\n", c.orig, err) + } + switch c.name { + case "integralsauvola": + actual = IntegralSauvola(orig, c.ksize, c.wsize) + case "sauvola": + if *slow { + actual = Sauvola(orig, c.ksize, c.wsize) + } else { + t.Skip("Skipping slow test; use -slow to run it.\n") + } + default: + t.Fatalf("No method %s\n", c.name) + } + if *update { + f, err := os.Create(c.golden) + defer f.Close() + if err != nil { + t.Fatalf("Could not open file %s to update: %v\n", c.golden, err) + } + err = png.Encode(f, actual) + if err != nil { + t.Fatalf("Could not encode update of %s: %v\n", c.golden, err) + } + } + golden, err := decode(c.golden) + if err != nil { + t.Fatalf("Could not open file %s: %v\n", c.golden, err) + } + if !imgsequal(golden, actual) { + t.Errorf("Binarized %s differs to %s\n", c.orig, c.golden) + } + }) + } +} diff --git a/test_helpers.go b/test_helpers.go new file mode 100644 index 0000000..20de5b1 --- /dev/null +++ b/test_helpers.go @@ -0,0 +1,53 @@ +package preproc + +// TODO: add different pages as test cases +// TODO: test non integral img version + +import ( + "image" + "image/draw" + "image/png" + "os" +) + +func decode(s string) (*image.Gray, error) { + f, err := os.Open(s) + defer f.Close() + if err != nil { + return nil, err + } + img, err := png.Decode(f) + if err != nil { + return nil, err + } + b := img.Bounds() + gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) + draw.Draw(gray, b, img, b.Min, draw.Src) + return gray, nil +} + +func imgsequal(img1 *image.Gray, img2 *image.Gray) bool { + b := img1.Bounds() + if !b.Eq(img2.Bounds()) { + return false + } + for y := b.Min.Y; y < b.Max.Y; y++ { + for x := b.Min.X; x < b.Max.X; x++ { + r0, g0, b0, a0 := img1.At(x, y).RGBA() + r1, g1, b1, a1 := img2.At(x, y).RGBA() + if r0 != r1 { + return false + } + if g0 != g1 { + return false + } + if b0 != b1 { + return false + } + if a0 != a1 { + return false + } + } + } + return true +} diff --git a/testdata/pg1.png b/testdata/pg1.png new file mode 100644 index 0000000..2bcc4b1 Binary files /dev/null and b/testdata/pg1.png differ diff --git a/testdata/pg1_integralsauvola_k0.3_w19.png b/testdata/pg1_integralsauvola_k0.3_w19.png new file mode 100644 index 0000000..bdf5712 Binary files /dev/null and b/testdata/pg1_integralsauvola_k0.3_w19.png differ diff --git a/testdata/pg1_integralsauvola_k0.5_w19.png b/testdata/pg1_integralsauvola_k0.5_w19.png new file mode 100644 index 0000000..5db2d9a Binary files /dev/null and b/testdata/pg1_integralsauvola_k0.5_w19.png differ diff --git a/testdata/pg1_integralsauvola_k0.5_w41.png b/testdata/pg1_integralsauvola_k0.5_w41.png new file mode 100644 index 0000000..050d037 Binary files /dev/null and b/testdata/pg1_integralsauvola_k0.5_w41.png differ diff --git a/testdata/pg1_sauvola_k0.3_w19.png b/testdata/pg1_sauvola_k0.3_w19.png new file mode 100644 index 0000000..bcd595f Binary files /dev/null and b/testdata/pg1_sauvola_k0.3_w19.png differ diff --git a/testdata/pg1_sauvola_k0.5_w19.png b/testdata/pg1_sauvola_k0.5_w19.png new file mode 100644 index 0000000..8de596c Binary files /dev/null and b/testdata/pg1_sauvola_k0.5_w19.png differ diff --git a/testdata/pg1_sauvola_k0.5_w41.png b/testdata/pg1_sauvola_k0.5_w41.png new file mode 100644 index 0000000..b8f50e0 Binary files /dev/null and b/testdata/pg1_sauvola_k0.5_w41.png differ diff --git a/testdata/pg2.png b/testdata/pg2.png new file mode 100644 index 0000000..c7c4249 Binary files /dev/null and b/testdata/pg2.png differ diff --git a/testdata/pg2_integralwipesides_t0.02_w5.png b/testdata/pg2_integralwipesides_t0.02_w5.png new file mode 100644 index 0000000..6b4ccb2 Binary files /dev/null and b/testdata/pg2_integralwipesides_t0.02_w5.png differ diff --git a/testdata/pg2_integralwipesides_t0.05_w25.png b/testdata/pg2_integralwipesides_t0.05_w25.png new file mode 100644 index 0000000..39dc88d Binary files /dev/null and b/testdata/pg2_integralwipesides_t0.05_w25.png differ diff --git a/testdata/pg2_integralwipesides_t0.05_w5.png b/testdata/pg2_integralwipesides_t0.05_w5.png new file mode 100644 index 0000000..50df855 Binary files /dev/null and b/testdata/pg2_integralwipesides_t0.05_w5.png differ diff --git a/util.go b/util.go new file mode 100644 index 0000000..e23829d --- /dev/null +++ b/util.go @@ -0,0 +1,95 @@ +package preproc + +import ( + "errors" + "image" + "math" +) + +// TODO: name better; maybe verb, x-er +// TODO: implement these for regular image, and use them to make +// image functions generic for integral and non- images +type UsefulImg interface { + MeanWindow() + MeanStdDevWindow() +} + +func mean(i []int) float64 { + sum := 0 + for _, n := range i { + sum += n + } + return float64(sum) / float64(len(i)) +} + +func stddev(i []int) float64 { + m := mean(i) + + var sum float64 + for _, n := range i { + sum += (float64(n) - m) * (float64(n) - m) + } + variance := sum / float64(len(i)-1) + return math.Sqrt(variance) +} + +func meanstddev(i []int) (float64, float64) { + m := mean(i) + + var sum float64 + for _, n := range i { + sum += (float64(n) - m) * (float64(n) - m) + } + variance := float64(sum) / float64(len(i)-1) + return m, math.Sqrt(variance) +} + +// gets the pixel values surrounding a point in the image +func surrounding(img *image.Gray, x int, y int, size int) []int { + b := img.Bounds() + step := size / 2 + + miny := y - step + if miny < b.Min.Y { + miny = b.Min.Y + } + minx := x - step + if minx < b.Min.X { + minx = b.Min.X + } + maxy := y + step + if maxy > b.Max.Y { + maxy = b.Max.Y + } + maxx := x + step + if maxx > b.Max.X { + maxx = b.Max.X + } + + var s []int + for yi := miny; yi <= maxy; yi++ { + for xi := minx; xi <= maxx; xi++ { + s = append(s, int(img.GrayAt(xi, yi).Y)) + } + } + return s +} + +func BinToZeroInv(bin *image.Gray, orig *image.RGBA) (*image.RGBA, error) { + b := bin.Bounds() + if !b.Eq(orig.Bounds()) { + return orig, errors.New("bin and orig images need to be the same dimensions") + } + newimg := image.NewRGBA(image.Rect(0, 0, b.Dx(), b.Dy())) + for y := b.Min.Y; y < b.Max.Y; y++ { + for x := b.Min.X; x < b.Max.X; x++ { + if bin.GrayAt(x, y).Y == 255 { + newimg.Set(x, y, bin.GrayAt(x, y)) + } else { + newimg.Set(x, y, orig.At(x, y)) + } + } + } + + return newimg, nil +} diff --git a/wipesides.go b/wipesides.go new file mode 100644 index 0000000..8cd2060 --- /dev/null +++ b/wipesides.go @@ -0,0 +1,160 @@ +package preproc + +// TODO: add minimum size variable (default ~30%?) +// TODO: switch to an interface rather than integralimg.I + +import ( + "errors" + "fmt" + "image" + "image/color" + "image/draw" + _ "image/jpeg" + "image/png" + "os" + + "rescribe.xyz/preproc/integralimg" +) + +// returns the proportion of the given window that is black pixels +func proportion(i integralimg.I, x int, size int) float64 { + w := i.GetVerticalWindow(x, size) + return w.Proportion() +} + +// findbestedge goes through every vertical line from x to x+w to +// find the one with the lowest proportion of black pixels. +func findbestedge(img integralimg.I, x int, w int) int { + var bestx int + var best float64 + + if w == 1 { + return x + } + + right := x + w + for ; x < right; x++ { + prop := proportion(img, x, 1) + if prop > best { + best = prop + bestx = x + } + } + + return bestx +} + +// findedges finds the edges of the main content, by moving a window of wsize +// from near the middle of the image to the left and right, stopping when it reaches +// a point at which there is a lower proportion of black pixels than thresh. +func findedges(img integralimg.I, wsize int, thresh float64) (int, int) { + maxx := len(img[0]) - 1 + var lowedge, highedge int = 0, maxx + + // don't start at the middle, as this will fail for 2 column layouts, + // start 10% left or right of the middle + notcentre := maxx / 10 + + for x := maxx/2 + notcentre; x < maxx-wsize; x++ { + if proportion(img, x, wsize) <= thresh { + highedge = findbestedge(img, x, wsize) + break + } + } + + for x := maxx/2 - notcentre; x > 0; x-- { + if proportion(img, x, wsize) <= thresh { + lowedge = findbestedge(img, x, wsize) + break + } + } + + return lowedge, highedge +} + +// wipesides fills the sections of image not within the boundaries +// of lowedge and highedge with white +func wipesides(img *image.Gray, lowedge int, highedge int) *image.Gray { + b := img.Bounds() + new := image.NewGray(b) + + // set left edge white + for x := b.Min.X; x < lowedge; x++ { + for y := b.Min.Y; y < b.Max.Y; y++ { + new.SetGray(x, y, color.Gray{255}) + } + } + // copy middle + for x := lowedge; x < highedge; x++ { + for y := b.Min.Y; y < b.Max.Y; y++ { + new.SetGray(x, y, img.GrayAt(x, y)) + } + } + // set right edge white + for x := highedge; x < b.Max.X; x++ { + for y := b.Min.Y; y < b.Max.Y; y++ { + new.SetGray(x, y, color.Gray{255}) + } + } + + return new +} + +// toonarrow checks whether the area between lowedge and highedge is +// less than min % of the total image width +func toonarrow(img *image.Gray, lowedge int, highedge int, min int) bool { + b := img.Bounds() + imgw := b.Max.X - b.Min.X + wipew := highedge - lowedge + if float64(wipew)/float64(imgw)*100 < float64(min) { + return true + } + return false +} + +// Wipe fills the sections of image which fall outside the content +// area with white, providing the content area is above min % +func Wipe(img *image.Gray, wsize int, thresh float64, min int) *image.Gray { + integral := integralimg.ToIntegralImg(img) + lowedge, highedge := findedges(integral, wsize, thresh) + if toonarrow(img, lowedge, highedge, min) { + return img + } + return wipesides(img, lowedge, highedge) +} + +// WipeFile wipes an image file, filling the sections of the image +// which fall outside the content area with white, providing the +// content area is above min %. +// inPath: path of the input image. +// outPath: path to save the output image. +// wsize: window size for wipe algorithm. +// thresh: threshold for wipe algorithm. +// min: minimum % of content area width to consider valid. +func WipeFile(inPath string, outPath string, wsize int, thresh float64, min int) error { + f, err := os.Open(inPath) + defer f.Close() + if err != nil { + return errors.New(fmt.Sprintf("Could not open file %s: %v", inPath, err)) + } + img, _, err := image.Decode(f) + if err != nil { + return errors.New(fmt.Sprintf("Could not decode image: %v", err)) + } + b := img.Bounds() + gray := image.NewGray(image.Rect(0, 0, b.Dx(), b.Dy())) + draw.Draw(gray, b, img, b.Min, draw.Src) + + clean := Wipe(gray, wsize, thresh, min) + + f, err = os.Create(outPath) + if err != nil { + return errors.New(fmt.Sprintf("Could not create file %s: %v", outPath, err)) + } + defer f.Close() + err = png.Encode(f, clean) + if err != nil { + return errors.New(fmt.Sprintf("Could not encode image: %v", err)) + } + return nil +} diff --git a/wipesides_test.go b/wipesides_test.go new file mode 100644 index 0000000..d5464e0 --- /dev/null +++ b/wipesides_test.go @@ -0,0 +1,57 @@ +package preproc + +// TODO: add different pages as test cases +// TODO: test non integral img version + +import ( + "flag" + "fmt" + "image" + "image/png" + "os" + "testing" +) + +func TestWipeSides(t *testing.T) { + var update = flag.Bool("updatewipe", false, "update golden files") + cases := []struct { + name string + orig string + golden string + thresh float64 + wsize int + }{ + {"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.02_w5.png", 0.02, 5}, + {"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.05_w5.png", 0.05, 5}, + {"integralwipesides", "testdata/pg2.png", "testdata/pg2_integralwipesides_t0.05_w25.png", 0.05, 25}, + } + + for _, c := range cases { + t.Run(fmt.Sprintf("%s_%0.2f_%d", c.name, c.thresh, c.wsize), func(t *testing.T) { + var actual *image.Gray + orig, err := decode(c.orig) + if err != nil { + t.Fatalf("Could not open file %s: %v\n", c.orig, err) + } + actual = Wipe(orig, c.wsize, c.thresh) + if *update { + f, err := os.Create(c.golden) + defer f.Close() + if err != nil { + t.Fatalf("Could not open file %s to update: %v\n", c.golden, err) + } + err = png.Encode(f, actual) + if err != nil { + t.Fatalf("Could not encode update of %s: %v\n", c.golden, err) + } + } + golden, err := decode(c.golden) + if err != nil { + t.Fatalf("Could not open file %s: %v\n", c.golden, err) + } + if !imgsequal(golden, actual) { + t.Errorf("Processed %s differs to %s\n", c.orig, c.golden) + } + }) + } +} -- cgit v1.2.1-24-ge1ad