diff options
| -rw-r--r-- | avg-lines/html.go | 61 | ||||
| -rw-r--r-- | avg-lines/main.go | 69 | ||||
| -rw-r--r-- | bookpipeline/aws.go | 322 | ||||
| -rw-r--r-- | bookpipeline/cmd/bookpipeline/main.go | 488 | ||||
| -rw-r--r-- | bookpipeline/cmd/booktopipeline/main.go | 140 | ||||
| -rw-r--r-- | bookpipeline/cmd/confgraph/main.go | 71 | ||||
| -rw-r--r-- | bookpipeline/cmd/getpipelinebook/main.go | 122 | ||||
| -rw-r--r-- | bookpipeline/cmd/lspipeline/main.go | 250 | ||||
| -rw-r--r-- | bookpipeline/cmd/mkpipeline/main.go | 79 | ||||
| -rw-r--r-- | bookpipeline/graph.go | 155 | ||||
| -rw-r--r-- | bucket-lines/bucket.go | 131 | ||||
| -rw-r--r-- | bucket-lines/main.go | 87 | ||||
| -rw-r--r-- | cmd/binarize/main.go (renamed from preproc/cmd/binarize/main.go) | 2 | ||||
| -rw-r--r-- | cmd/preproc/main.go (renamed from preproc/cmd/preproc/main.go) | 2 | ||||
| -rw-r--r-- | cmd/preprocmulti/main.go (renamed from preproc/cmd/preprocmulti/main.go) | 4 | ||||
| -rw-r--r-- | cmd/wipe/main.go (renamed from preproc/cmd/wipe/main.go) | 2 | ||||
| -rw-r--r-- | dehyphenate/main.go | 63 | ||||
| -rw-r--r-- | eeboxmltohocr/main.go | 135 | ||||
| -rw-r--r-- | hocrtotxt/main.go | 30 | ||||
| -rw-r--r-- | lib/hocr/hocr.go | 129 | ||||
| -rw-r--r-- | lib/hocr/lines.go | 131 | ||||
| -rw-r--r-- | lib/line/line.go | 57 | ||||
| -rw-r--r-- | lib/prob/prob.go | 69 | ||||
| -rw-r--r-- | pgconf/main.go | 30 | ||||
| -rw-r--r-- | preprocmulti.go (renamed from preproc/preprocmulti.go) | 2 | ||||
| -rw-r--r-- | sauvola.go (renamed from preproc/sauvola.go) | 2 | ||||
| -rw-r--r-- | sauvola_test.go (renamed from preproc/sauvola_test.go) | 0 | ||||
| -rw-r--r-- | test_helpers.go (renamed from preproc/test_helpers.go) | 0 | ||||
| -rw-r--r-- | testdata/pg1.png (renamed from preproc/testdata/pg1.png) | bin | 651071 -> 651071 bytes | |||
| -rw-r--r-- | testdata/pg1_integralsauvola_k0.3_w19.png (renamed from preproc/testdata/pg1_integralsauvola_k0.3_w19.png) | bin | 19456 -> 19456 bytes | |||
| -rw-r--r-- | testdata/pg1_integralsauvola_k0.5_w19.png (renamed from preproc/testdata/pg1_integralsauvola_k0.5_w19.png) | bin | 18241 -> 18241 bytes | |||
| -rw-r--r-- | testdata/pg1_integralsauvola_k0.5_w41.png (renamed from preproc/testdata/pg1_integralsauvola_k0.5_w41.png) | bin | 18260 -> 18260 bytes | |||
| -rw-r--r-- | testdata/pg1_sauvola_k0.3_w19.png (renamed from preproc/testdata/pg1_sauvola_k0.3_w19.png) | bin | 19447 -> 19447 bytes | |||
| -rw-r--r-- | testdata/pg1_sauvola_k0.5_w19.png (renamed from preproc/testdata/pg1_sauvola_k0.5_w19.png) | bin | 18231 -> 18231 bytes | |||
| -rw-r--r-- | testdata/pg1_sauvola_k0.5_w41.png (renamed from preproc/testdata/pg1_sauvola_k0.5_w41.png) | bin | 18275 -> 18275 bytes | |||
| -rw-r--r-- | testdata/pg2.png (renamed from preproc/testdata/pg2.png) | bin | 30803 -> 30803 bytes | |||
| -rw-r--r-- | testdata/pg2_integralwipesides_t0.02_w5.png (renamed from preproc/testdata/pg2_integralwipesides_t0.02_w5.png) | bin | 33595 -> 33595 bytes | |||
| -rw-r--r-- | testdata/pg2_integralwipesides_t0.05_w25.png (renamed from preproc/testdata/pg2_integralwipesides_t0.05_w25.png) | bin | 33432 -> 33432 bytes | |||
| -rw-r--r-- | testdata/pg2_integralwipesides_t0.05_w5.png (renamed from preproc/testdata/pg2_integralwipesides_t0.05_w5.png) | bin | 14546 -> 14546 bytes | |||
| -rw-r--r-- | util.go (renamed from preproc/util.go) | 0 | ||||
| -rw-r--r-- | wipesides.go (renamed from preproc/wipesides.go) | 2 | ||||
| -rw-r--r-- | wipesides_test.go (renamed from preproc/wipesides_test.go) | 0 | 
42 files changed, 8 insertions, 2627 deletions
diff --git a/avg-lines/html.go b/avg-lines/html.go deleted file mode 100644 index 443cc4a..0000000 --- a/avg-lines/html.go +++ /dev/null @@ -1,61 +0,0 @@ -package main - -import ( -	"fmt" -	"os" -	"path/filepath" - -	"rescribe.xyz/go.git/lib/line" -) - -func copylineimg(fn string, l line.Detail) error { -	f, err := os.Create(fn) -	if err != nil { -		return err -	} -	defer f.Close() - -	return l.Img.CopyLineTo(f) -} - -func htmlout(dir string, lines line.Details) error { -	err := os.MkdirAll(dir, 0700) -	if err != nil { -		return err -	} - -	fn := filepath.Join(dir, "index.html") -	f, err := os.Create(fn) -	if err != nil { -		return err -	} -	defer f.Close() - -	_, err = fmt.Fprintf(f, "<!DOCTYPE html><html><head><meta charset='UTF-8'><title></title>"+ -		"<style>td {border: 1px solid #444}</style></head><body>\n<table>\n") -	if err != nil { -		return err -	} -	for _, l := range lines { -		fn = filepath.Base(l.OcrName) + "_" + l.Name + ".png" -		err = copylineimg(filepath.Join(dir, fn), l) -		if err != nil { -			return err -		} -		_, err = fmt.Fprintf(f, "<tr>\n"+ -			"<td><h1>%.4f%%</h1></td>\n"+ -			"<td>%s %s</td>\n"+ -			"<td><img src='%s' width='100%%' /><br />%s</td>\n"+ -			"</tr>\n", -			l.Avgconf, l.OcrName, l.Name, fn, l.Text) -		if err != nil { -			return err -		} -	} -	_, err = fmt.Fprintf(f, "</table>\n</body></html>\n") -	if err != nil { -		return err -	} - -	return nil -} diff --git a/avg-lines/main.go b/avg-lines/main.go deleted file mode 100644 index 14b21bd..0000000 --- a/avg-lines/main.go +++ /dev/null @@ -1,69 +0,0 @@ -package main - -import ( -	"flag" -	"fmt" -	"log" -	"os" -	"path/filepath" -	"sort" - -	"rescribe.xyz/go.git/lib/hocr" -	"rescribe.xyz/go.git/lib/line" -	"rescribe.xyz/go.git/lib/prob" -) - -func main() { -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: avg-lines [-html dir] [-nosort] [prob1] [hocr1] [prob2] [...]\n") -		fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n") -		fmt.Fprintf(os.Stderr, "from worst to best.\n") -		fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") -		fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") -		fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") -		fmt.Fprintf(os.Stderr, "option.\n\n") -		flag.PrintDefaults() -	} -	var html = flag.String("html", "", "Output in html format to the specified directory") -	var nosort = flag.Bool("nosort", false, "Don't sort lines by confidence") -	flag.Parse() -	if flag.NArg() < 1 { -		flag.Usage() -		os.Exit(1) -	} - -	var err error -	lines := make(line.Details, 0) - -	for _, f := range flag.Args() { -		var newlines line.Details -		switch ext := filepath.Ext(f); ext { -		case ".prob": -			newlines, err = prob.GetLineDetails(f) -		case ".hocr": -			newlines, err = hocr.GetLineDetails(f) -		default: -			log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) -			continue -		} -		if err != nil { -			log.Fatal(err) -		} - -		for _, l := range newlines { -			lines = append(lines, l) -		} -	} - -	if *nosort == false { -		sort.Sort(lines) -	} - -	if *html == "" { -		for _, l := range lines { -			fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf) -		} -	} else { -		htmlout(*html, lines) -	} -} diff --git a/bookpipeline/aws.go b/bookpipeline/aws.go deleted file mode 100644 index 0127d6e..0000000 --- a/bookpipeline/aws.go +++ /dev/null @@ -1,322 +0,0 @@ -package bookpipeline - -import ( -	"errors" -	"fmt" -	"log" -	"os" -	"time" - -	"github.com/aws/aws-sdk-go/aws" -	"github.com/aws/aws-sdk-go/aws/awserr" -	"github.com/aws/aws-sdk-go/aws/session" -	"github.com/aws/aws-sdk-go/service/ec2" -	"github.com/aws/aws-sdk-go/service/s3" -	"github.com/aws/aws-sdk-go/service/s3/s3manager" -	"github.com/aws/aws-sdk-go/service/sqs" -) - -const PreprocPattern = `_bin[0-9].[0-9].png` -const heartbeatRetry = 10 - -type Qmsg struct { -	Id, Handle, Body string -} - -type InstanceDetails struct { -	Id, Name, Ip, Spot, Type, State, LaunchTime string -} - -type AwsConn struct { -	// these need to be set before running Init() -	Region string -	Logger *log.Logger - -	// these are used internally -	sess                                    *session.Session -	ec2svc                                  *ec2.EC2 -	s3svc                                   *s3.S3 -	sqssvc                                  *sqs.SQS -	downloader                              *s3manager.Downloader -	uploader                                *s3manager.Uploader -	wipequrl, prequrl, ocrqurl, analysequrl string -	wipstorageid                            string -} - -// TODO: split this up, as not everything is needed for different uses -func (a *AwsConn) Init() error { -	if a.Region == "" { -		return errors.New("No Region set") -	} -	if a.Logger == nil { -		return errors.New("No logger set") -	} - -	var err error -	a.sess, err = session.NewSession(&aws.Config{ -		Region: aws.String(a.Region), -	}) -	if err != nil { -		return errors.New(fmt.Sprintf("Failed to set up aws session: %s", err)) -	} -	a.ec2svc = ec2.New(a.sess) -	a.s3svc = s3.New(a.sess) -	a.sqssvc = sqs.New(a.sess) -	a.downloader = s3manager.NewDownloader(a.sess) -	a.uploader = s3manager.NewUploader(a.sess) - -	a.Logger.Println("Getting preprocess queue URL") -	result, err := a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ -		QueueName: aws.String("rescribepreprocess"), -	}) -	if err != nil { -		return errors.New(fmt.Sprintf("Error getting preprocess queue URL: %s", err)) -	} -	a.prequrl = *result.QueueUrl - -	a.Logger.Println("Getting wipeonly queue URL") -	result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ -		QueueName: aws.String("rescribewipeonly"), -	}) -	if err != nil { -		return errors.New(fmt.Sprintf("Error getting wipeonly queue URL: %s", err)) -	} -	a.wipequrl = *result.QueueUrl - -	a.Logger.Println("Getting OCR queue URL") -	result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ -		QueueName: aws.String("rescribeocr"), -	}) -	if err != nil { -		return errors.New(fmt.Sprintf("Error getting OCR queue URL: %s", err)) -	} -	a.ocrqurl = *result.QueueUrl - -	a.Logger.Println("Getting analyse queue URL") -	result, err = a.sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ -		QueueName: aws.String("rescribeanalyse"), -	}) -	if err != nil { -		return errors.New(fmt.Sprintf("Error getting analyse queue URL: %s", err)) -	} -	a.analysequrl = *result.QueueUrl - -	a.wipstorageid = "rescribeinprogress" - -	return nil -} - -func (a *AwsConn) CheckQueue(url string, timeout int64) (Qmsg, error) { -	msgResult, err := a.sqssvc.ReceiveMessage(&sqs.ReceiveMessageInput{ -		MaxNumberOfMessages: aws.Int64(1), -		VisibilityTimeout:   &timeout, -		WaitTimeSeconds:     aws.Int64(20), -		QueueUrl:            &url, -	}) -	if err != nil { -		return Qmsg{}, err -	} - -	if len(msgResult.Messages) > 0 { -		msg := Qmsg{Id: *msgResult.Messages[0].MessageId, -			Handle: *msgResult.Messages[0].ReceiptHandle, -			Body:   *msgResult.Messages[0].Body} -		a.Logger.Println("Message received:", msg.Body) -		return msg, nil -	} else { -		return Qmsg{}, nil -	} -} - -// QueueHeartbeat updates the visibility timeout of a message. This -// ensures that the message remains "in flight", meaning that it -// cannot be seen by other processes, but if this process fails the -// timeout will expire and it will go back to being available for -// any other process to retrieve and process. -// -// SQS only allows messages to be "in flight" for up to 12 hours, so -// this will detect if the request for an update to visibility timeout -// fails, and if so will attempt to find the message on the queue, and -// return it, as the handle will have changed. -func (a *AwsConn) QueueHeartbeat(msg Qmsg, qurl string, duration int64) (Qmsg, error) { -	_, err := a.sqssvc.ChangeMessageVisibility(&sqs.ChangeMessageVisibilityInput{ -		ReceiptHandle:     &msg.Handle, -		QueueUrl:          &qurl, -		VisibilityTimeout: &duration, -	}) -	if err != nil { -		aerr, ok := err.(awserr.Error) - -		// Check if the visibility timeout has exceeded the maximum allowed, -		// and if so try to find the message again to get a new handle. -		if ok && aerr.Code() == "InvalidParameterValue" { -			// Try heartbeatRetry times to find the message -			for range [heartbeatRetry]bool{} { -				// Wait a little in case existing visibilitytimeout needs to expire -				time.Sleep((time.Duration(duration) / heartbeatRetry) * time.Second) - -				msgResult, err := a.sqssvc.ReceiveMessage(&sqs.ReceiveMessageInput{ -					MaxNumberOfMessages: aws.Int64(10), -					VisibilityTimeout:   &duration, -					WaitTimeSeconds:     aws.Int64(20), -					QueueUrl:            &qurl, -				}) -				if err != nil { -					return Qmsg{}, errors.New(fmt.Sprintf("Heartbeat error looking for message to update heartbeat: %s", err)) -				} -				for _, m := range msgResult.Messages { -					if *m.MessageId == msg.Id { -						return Qmsg{ -							Id:     *m.MessageId, -							Handle: *m.ReceiptHandle, -							Body:   *m.Body, -						}, nil -					} -				} -			} -			return Qmsg{}, errors.New("Heartbeat error failed to find message to update heartbeat") -		} else { -			return Qmsg{}, errors.New(fmt.Sprintf("Heartbeat error updating queue duration: %s", err)) -		} -	} -	return Qmsg{}, nil -} - -// GetQueueDetails gets the number of in progress and available -// messages for a queue. These are returned as strings. -func (a *AwsConn) GetQueueDetails(url string) (string, string, error) { -	numAvailable := "ApproximateNumberOfMessages" -	numInProgress := "ApproximateNumberOfMessagesNotVisible" -	attrs, err := a.sqssvc.GetQueueAttributes(&sqs.GetQueueAttributesInput{ -		AttributeNames: []*string{&numAvailable, &numInProgress}, -		QueueUrl:       &url, -	}) -	if err != nil { -		return "", "", errors.New(fmt.Sprintf("Failed to get queue attributes: %s", err)) -	} -	return *attrs.Attributes[numAvailable], *attrs.Attributes[numInProgress], nil -} - -func (a *AwsConn) PreQueueId() string { -	return a.prequrl -} - -func (a *AwsConn) WipeQueueId() string { -	return a.wipequrl -} - -func (a *AwsConn) OCRQueueId() string { -	return a.ocrqurl -} - -func (a *AwsConn) AnalyseQueueId() string { -	return a.analysequrl -} - -func (a *AwsConn) WIPStorageId() string { -	return a.wipstorageid -} - -func (a *AwsConn) ListObjects(bucket string, prefix string) ([]string, error) { -	var names []string -	err := a.s3svc.ListObjectsV2Pages(&s3.ListObjectsV2Input{ -		Bucket: aws.String(bucket), -		Prefix: aws.String(prefix), -	}, func(page *s3.ListObjectsV2Output, last bool) bool { -		for _, r := range page.Contents { -			names = append(names, *r.Key) -		} -		return true -	}) -	return names, err -} - -func (a *AwsConn) AddToQueue(url string, msg string) error { -	_, err := a.sqssvc.SendMessage(&sqs.SendMessageInput{ -		MessageBody: &msg, -		QueueUrl:    &url, -	}) -	return err -} - -func (a *AwsConn) DelFromQueue(url string, handle string) error { -	_, err := a.sqssvc.DeleteMessage(&sqs.DeleteMessageInput{ -		QueueUrl:      &url, -		ReceiptHandle: &handle, -	}) -	return err -} - -func (a *AwsConn) Download(bucket string, key string, path string) error { -	f, err := os.Create(path) -	if err != nil { -		return err -	} -	defer f.Close() - -	_, err = a.downloader.Download(f, -		&s3.GetObjectInput{ -			Bucket: aws.String(bucket), -			Key:    &key, -		}) -	return err -} - -func (a *AwsConn) Upload(bucket string, key string, path string) error { -	file, err := os.Open(path) -	if err != nil { -		log.Fatalln("Failed to open file", path, err) -	} -	defer file.Close() - -	_, err = a.uploader.Upload(&s3manager.UploadInput{ -		Bucket: aws.String(bucket), -		Key:    aws.String(key), -		Body:   file, -	}) -	return err -} - -func (a *AwsConn) GetLogger() *log.Logger { -	return a.Logger -} - -func instanceDetailsFromPage(page *ec2.DescribeInstancesOutput) []InstanceDetails { -	var details []InstanceDetails -	for _, r := range page.Reservations { -		for _, i := range r.Instances { -			var d InstanceDetails - -			for _, t := range i.Tags { -				if *t.Key == "Name" { -					d.Name = *t.Value -				} -			} -			if i.PublicIpAddress != nil { -				d.Ip = *i.PublicIpAddress -			} -			if i.SpotInstanceRequestId != nil { -				d.Spot = *i.SpotInstanceRequestId -			} -			d.Type = *i.InstanceType -			d.Id = *i.InstanceId -			d.LaunchTime = i.LaunchTime.String() -			d.State = *i.State.Name - -			details = append(details, d) -		} -	} - -	return details -} - -func (a *AwsConn) GetInstanceDetails() ([]InstanceDetails, error) { -	var details []InstanceDetails -	err := a.ec2svc.DescribeInstancesPages(&ec2.DescribeInstancesInput{}, func(page *ec2.DescribeInstancesOutput, lastPage bool) bool { -		for _, d := range instanceDetailsFromPage(page) { -			details = append(details, d) -		} -		return !lastPage -	}) -	return details, err -} diff --git a/bookpipeline/cmd/bookpipeline/main.go b/bookpipeline/cmd/bookpipeline/main.go deleted file mode 100644 index 59ece72..0000000 --- a/bookpipeline/cmd/bookpipeline/main.go +++ /dev/null @@ -1,488 +0,0 @@ -package main - -// TODO: check if images are prebinarised and if so skip multiple binarisation - -import ( -	"errors" -	"flag" -	"fmt" -	"log" -	"os" -	"os/exec" -	"path/filepath" -	"regexp" -	"strings" -	"time" - -	"rescribe.xyz/go.git/bookpipeline" -	"rescribe.xyz/go.git/lib/hocr" -	"rescribe.xyz/go.git/preproc" -) - -const usage = `Usage: bookpipeline [-v] [-np] [-nw] [-no] [-na] [-t training] - -Watches the preprocess, ocr and analyse queues for book names. When -one is found this general process is followed: - -- The book name is hidden from the queue, and a 'heartbeat' is -  started which keeps it hidden (this will time out after 2 minutes -  if the program is terminated) -- The necessary files from bookname/ are downloaded -- The files are processed -- The resulting files are uploaded to bookname/ -- The heartbeat is stopped -- The book name is removed from the queue it was taken from, and -  added to the next queue for future processing - -` - -const PauseBetweenChecks = 3 * time.Minute -const HeartbeatTime = 60 - -// null writer to enable non-verbose logging to be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { -	return len(p), nil -} - -type Clouder interface { -	Init() error -	ListObjects(bucket string, prefix string) ([]string, error) -	Download(bucket string, key string, fn string) error -	Upload(bucket string, key string, path string) error -	CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error) -	AddToQueue(url string, msg string) error -	DelFromQueue(url string, handle string) error -	QueueHeartbeat(msg bookpipeline.Qmsg, qurl string, duration int64) (bookpipeline.Qmsg, error) -} - -type Pipeliner interface { -	Clouder -	PreQueueId() string -	WipeQueueId() string -	OCRQueueId() string -	AnalyseQueueId() string -	WIPStorageId() string -	GetLogger() *log.Logger -} - -func download(dl chan string, process chan string, conn Pipeliner, dir string, errc chan error, logger *log.Logger) { -	for key := range dl { -		fn := filepath.Join(dir, filepath.Base(key)) -		logger.Println("Downloading", key) -		err := conn.Download(conn.WIPStorageId(), key, fn) -		if err != nil { -			for range dl { -			} // consume the rest of the receiving channel so it isn't blocked -			close(process) -			errc <- err -			return -		} -		process <- fn -	} -	close(process) -} - -func up(c chan string, done chan bool, conn Pipeliner, bookname string, errc chan error, logger *log.Logger) { -	for path := range c { -		name := filepath.Base(path) -		key := filepath.Join(bookname, name) -		logger.Println("Uploading", key) -		err := conn.Upload(conn.WIPStorageId(), key, path) -		if err != nil { -			for range c { -			} // consume the rest of the receiving channel so it isn't blocked -			errc <- err -			return -		} -	} - -	done <- true -} - -func preprocess(pre chan string, up chan string, errc chan error, logger *log.Logger) { -	for path := range pre { -		logger.Println("Preprocessing", path) -		done, err := preproc.PreProcMulti(path, []float64{0.1, 0.2, 0.4, 0.5}, "binary", 0, true, 5, 30) -		if err != nil { -			for range pre { -			} // consume the rest of the receiving channel so it isn't blocked -			close(up) -			errc <- err -			return -		} -		for _, p := range done { -			up <- p -		} -	} -	close(up) -} - -func wipe(towipe chan string, up chan string, errc chan error, logger *log.Logger) { -	for path := range towipe { -		logger.Println("Wiping", path) -		s := strings.Split(path, ".") -		base := strings.Join(s[:len(s)-1], "") -		outpath := base + "_bin0.0.png" -		err := preproc.WipeFile(path, outpath, 5, 0.03, 30) -		if err != nil { -			for range towipe { -			} // consume the rest of the receiving channel so it isn't blocked -			close(up) -			errc <- err -			return -		} -		up <- outpath -	} -	close(up) -} - -func ocr(training string) func(chan string, chan string, chan error, *log.Logger) { -	return func(toocr chan string, up chan string, errc chan error, logger *log.Logger) { -		for path := range toocr { -			logger.Println("OCRing", path) -			name := strings.Replace(path, ".png", "", 1) -			cmd := exec.Command("tesseract", "-l", training, path, name, "hocr") -			err := cmd.Run() -			if err != nil { -				for range toocr { -				} // consume the rest of the receiving channel so it isn't blocked -				close(up) -				errc <- errors.New(fmt.Sprintf("Error ocring %s: %s", path, err)) -				return -			} -			up <- name + ".hocr" -		} -		close(up) -	} -} - -func analyse(toanalyse chan string, up chan string, errc chan error, logger *log.Logger) { -	confs := make(map[string][]*bookpipeline.Conf) -	bestconfs := make(map[string]*bookpipeline.Conf) -	savedir := "" - -	for path := range toanalyse { -		if savedir == "" { -			savedir = filepath.Dir(path) -		} -		logger.Println("Calculating confidence for", path) -		avg, err := hocr.GetAvgConf(path) -		if err != nil && err.Error() == "No words found" { -			continue -		} -		if err != nil { -			for range toanalyse { -			} // consume the rest of the receiving channel so it isn't blocked -			close(up) -			errc <- errors.New(fmt.Sprintf("Error retreiving confidence for %s: %s", path, err)) -			return -		} -		base := filepath.Base(path) -		codestart := strings.Index(base, "_bin") -		name := base[0:codestart] -		var c bookpipeline.Conf -		c.Path = path -		c.Code = base[codestart:] -		c.Conf = avg -		confs[name] = append(confs[name], &c) - -	} - -	fn := filepath.Join(savedir, "conf") -	logger.Println("Saving confidences in file", fn) -	f, err := os.Create(fn) -	if err != nil { -		close(up) -		errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) -		return -	} -	defer f.Close() - -	logger.Println("Finding best confidence for each page, and saving all confidences") -	for base, conf := range confs { -		var best float64 -		for _, c := range conf { -			if c.Conf > best { -				best = c.Conf -				bestconfs[base] = c -			} -			_, err = fmt.Fprintf(f, "%s\t%02.f\n", c.Path, c.Conf) -			if err != nil { -				close(up) -				errc <- errors.New(fmt.Sprintf("Error writing confidences file: %s", err)) -				return -			} -		} -	} -	up <- fn - -	logger.Println("Creating best file listing the best file for each page") -	fn = filepath.Join(savedir, "best") -	f, err = os.Create(fn) -	if err != nil { -		close(up) -		errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) -		return -	} -	defer f.Close() -	for _, conf := range bestconfs { -		_, err = fmt.Fprintf(f, "%s\n", filepath.Base(conf.Path)) -	} -	up <- fn - -	logger.Println("Creating graph") -	fn = filepath.Join(savedir, "graph.png") -	f, err = os.Create(fn) -	if err != nil { -		close(up) -		errc <- errors.New(fmt.Sprintf("Error creating file %s: %s", fn, err)) -		return -	} -	defer f.Close() -	err = bookpipeline.Graph(bestconfs, filepath.Base(savedir), f) -	if err != nil { -		close(up) -		errc <- errors.New(fmt.Sprintf("Error rendering graph: %s", err)) -		return -	} -	up <- fn - -	close(up) -} - -func heartbeat(conn Pipeliner, t *time.Ticker, msg bookpipeline.Qmsg, queue string, msgc chan bookpipeline.Qmsg, errc chan error) { -	currentmsg := msg -	for range t.C { -		m, err := conn.QueueHeartbeat(currentmsg, queue, HeartbeatTime*2) -		if err != nil { -			errc <- err -			t.Stop() -			return -		} -		if m.Id != "" { -			conn.GetLogger().Println("Replaced message handle as visibilitytimeout limit was reached") -			currentmsg = m -			// TODO: maybe handle communicating new msg more gracefully than this -			for range msgc { -			} // throw away any old msgc -			msgc <- m -		} -	} -} - -func processBook(msg bookpipeline.Qmsg, conn Pipeliner, process func(chan string, chan string, chan error, *log.Logger), match *regexp.Regexp, fromQueue string, toQueue string) error { -	dl := make(chan string) -	msgc := make(chan bookpipeline.Qmsg) -	processc := make(chan string) -	upc := make(chan string) -	done := make(chan bool) -	errc := make(chan error) - -	bookname := msg.Body - -	d := filepath.Join(os.TempDir(), bookname) -	err := os.MkdirAll(d, 0755) -	if err != nil { -		return errors.New(fmt.Sprintf("Failed to create directory %s: %s", d, err)) -	} - -	t := time.NewTicker(HeartbeatTime * time.Second) -	go heartbeat(conn, t, msg, fromQueue, msgc, errc) - -	// these functions will do their jobs when their channels have data -	go download(dl, processc, conn, d, errc, conn.GetLogger()) -	go process(processc, upc, errc, conn.GetLogger()) -	go up(upc, done, conn, bookname, errc, conn.GetLogger()) - -	conn.GetLogger().Println("Getting list of objects to download") -	objs, err := conn.ListObjects(conn.WIPStorageId(), bookname) -	if err != nil { -		t.Stop() -		_ = os.RemoveAll(d) -		return errors.New(fmt.Sprintf("Failed to get list of files for book %s: %s", bookname, err)) -	} -	var todl []string -	for _, n := range objs { -		if !match.MatchString(n) { -			conn.GetLogger().Println("Skipping item that doesn't match target", n) -			continue -		} -		todl = append(todl, n) -	} -	for _, a := range todl { -		dl <- a -	} -	close(dl) - -	// wait for either the done or errc channel to be sent to -	select { -	case err = <-errc: -		t.Stop() -		_ = os.RemoveAll(d) -		return err -	case <-done: -	} - -	if toQueue != "" { -		conn.GetLogger().Println("Sending", bookname, "to queue", toQueue) -		err = conn.AddToQueue(toQueue, bookname) -		if err != nil { -			t.Stop() -			_ = os.RemoveAll(d) -			return errors.New(fmt.Sprintf("Error adding to queue %s: %s", bookname, err)) -		} -	} - -	t.Stop() - -	// check whether we're using a newer msg handle -	select { -	case m, ok := <-msgc: -		if ok { -			msg = m -			conn.GetLogger().Println("Using new message handle to delete message from old queue") -		} -	default: -		conn.GetLogger().Println("Using original message handle to delete message from old queue") -	} - -	conn.GetLogger().Println("Deleting original message from queue", fromQueue) -	err = conn.DelFromQueue(fromQueue, msg.Handle) -	if err != nil { -		_ = os.RemoveAll(d) -		return errors.New(fmt.Sprintf("Error deleting message from queue: %s", err)) -	} - -	err = os.RemoveAll(d) -	if err != nil { -		return errors.New(fmt.Sprintf("Failed to remove directory %s: %s", d, err)) -	} - -	return nil -} - -func main() { -	verbose := flag.Bool("v", false, "verbose") -	training := flag.String("t", "rescribealphav5", "tesseract training file to use") -	nopreproc := flag.Bool("np", false, "disable preprocessing") -	nowipe := flag.Bool("nw", false, "disable wipeonly") -	noocr := flag.Bool("no", false, "disable ocr") -	noanalyse := flag.Bool("na", false, "disable analysis") - -	flag.Usage = func() { -		fmt.Fprintf(flag.CommandLine.Output(), usage) -		flag.PrintDefaults() -	} -	flag.Parse() - -	var verboselog *log.Logger -	if *verbose { -		verboselog = log.New(os.Stdout, "", 0) -	} else { -		var n NullWriter -		verboselog = log.New(n, "", 0) -	} - -	origPattern := regexp.MustCompile(`[0-9]{4}.jpg$`) // TODO: match alternative file naming -	wipePattern := regexp.MustCompile(`[0-9]{4}.png$`) -	preprocessedPattern := regexp.MustCompile(`_bin[0-9].[0-9].png$`) -	ocredPattern := regexp.MustCompile(`.hocr$`) - -	var conn Pipeliner -	conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} - -	verboselog.Println("Setting up AWS session") -	err := conn.Init() -	if err != nil { -		log.Fatalln("Error setting up cloud connection:", err) -	} -	verboselog.Println("Finished setting up AWS session") - -	var checkPreQueue <-chan time.Time -	var checkWipeQueue <-chan time.Time -	var checkOCRQueue <-chan time.Time -	var checkAnalyseQueue <-chan time.Time -	if !*nopreproc { -		checkPreQueue = time.After(0) -	} -	if !*nowipe { -		checkWipeQueue = time.After(0) -	} -	if !*noocr { -		checkOCRQueue = time.After(0) -	} -	if !*noanalyse { -		checkAnalyseQueue = time.After(0) -	} - -	for { -		select { -		case <-checkPreQueue: -			msg, err := conn.CheckQueue(conn.PreQueueId(), HeartbeatTime*2) -			checkPreQueue = time.After(PauseBetweenChecks) -			if err != nil { -				log.Println("Error checking preprocess queue", err) -				continue -			} -			if msg.Handle == "" { -				verboselog.Println("No message received on preprocess queue, sleeping") -				continue -			} -			verboselog.Println("Message received on preprocess queue, processing", msg.Body) -			err = processBook(msg, conn, preprocess, origPattern, conn.PreQueueId(), conn.OCRQueueId()) -			if err != nil { -				log.Println("Error during preprocess", err) -			} -		case <-checkWipeQueue: -			msg, err := conn.CheckQueue(conn.WipeQueueId(), HeartbeatTime*2) -			checkWipeQueue = time.After(PauseBetweenChecks) -			if err != nil { -				log.Println("Error checking wipeonly queue", err) -				continue -			} -			if msg.Handle == "" { -				verboselog.Println("No message received on wipeonly queue, sleeping") -				continue -			} -			verboselog.Println("Message received on wipeonly queue, processing", msg.Body) -			err = processBook(msg, conn, wipe, wipePattern, conn.WipeQueueId(), conn.OCRQueueId()) -			if err != nil { -				log.Println("Error during wipe", err) -			} -		case <-checkOCRQueue: -			msg, err := conn.CheckQueue(conn.OCRQueueId(), HeartbeatTime*2) -			checkOCRQueue = time.After(PauseBetweenChecks) -			if err != nil { -				log.Println("Error checking OCR queue", err) -				continue -			} -			if msg.Handle == "" { -				verboselog.Println("No message received on OCR queue, sleeping") -				continue -			} -			verboselog.Println("Message received on OCR queue, processing", msg.Body) -			err = processBook(msg, conn, ocr(*training), preprocessedPattern, conn.OCRQueueId(), conn.AnalyseQueueId()) -			if err != nil { -				log.Println("Error during OCR process", err) -			} -		case <-checkAnalyseQueue: -			msg, err := conn.CheckQueue(conn.AnalyseQueueId(), HeartbeatTime*2) -			checkAnalyseQueue = time.After(PauseBetweenChecks) -			if err != nil { -				log.Println("Error checking analyse queue", err) -				continue -			} -			if msg.Handle == "" { -				verboselog.Println("No message received on analyse queue, sleeping") -				continue -			} -			verboselog.Println("Message received on analyse queue, processing", msg.Body) -			err = processBook(msg, conn, analyse, ocredPattern, conn.AnalyseQueueId(), "") -			if err != nil { -				log.Println("Error during analysis", err) -			} -		} -	} -} diff --git a/bookpipeline/cmd/booktopipeline/main.go b/bookpipeline/cmd/booktopipeline/main.go deleted file mode 100644 index 6d9f146..0000000 --- a/bookpipeline/cmd/booktopipeline/main.go +++ /dev/null @@ -1,140 +0,0 @@ -package main - -// TODO: use bookpipeline package to do aws stuff - -import ( -	"flag" -	"fmt" -	"log" -	"os" -	"path/filepath" - -	"github.com/aws/aws-sdk-go/aws" -	"github.com/aws/aws-sdk-go/aws/session" -	"github.com/aws/aws-sdk-go/service/s3/s3manager" -	"github.com/aws/aws-sdk-go/service/sqs" -) - -const usage = `Usage: booktopipeline [-prebinarised] [-v] bookdir [bookname] - -Uploads the book in bookdir to the S3 'inprogress' bucket and adds it -to the 'preprocess' SQS queue, or the 'wipeonly' queue if the -prebinarised flag is set. - -If bookname is omitted the last part of the bookdir is used. -` - -// null writer to enable non-verbose logging to be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { -	return len(p), nil -} - -var verboselog *log.Logger - -type fileWalk chan string - -func (f fileWalk) Walk(path string, info os.FileInfo, err error) error { -	if err != nil { -		return err -	} -	if !info.IsDir() { -		f <- path -	} -	return nil -} - -func main() { -	verbose := flag.Bool("v", false, "Verbose") -	wipeonly := flag.Bool("prebinarised", false, "Prebinarised: only preprocessing will be to wipe") - -	flag.Usage = func() { -		fmt.Fprintf(flag.CommandLine.Output(), usage) -		flag.PrintDefaults() -	} -	flag.Parse() -	if flag.NArg() < 1 { -		flag.Usage() -		return -	} - -	bookdir := flag.Arg(0) -	var bookname string -	if flag.NArg() > 2 { -		bookname = flag.Arg(1) -	} else { -		bookname = filepath.Base(bookdir) -	} - -	if *verbose { -		verboselog = log.New(os.Stdout, "", log.LstdFlags) -	} else { -		var n NullWriter -		verboselog = log.New(n, "", log.LstdFlags) -	} - -	verboselog.Println("Setting up AWS session") -	sess, err := session.NewSession(&aws.Config{ -		Region: aws.String("eu-west-2"), -	}) -	if err != nil { -		log.Fatalln("Error: failed to set up aws session:", err) -	} -	sqssvc := sqs.New(sess) -	uploader := s3manager.NewUploader(sess) - -	var qname string -	if *wipeonly { -		qname = "rescribewipeonly" -	} else { -		qname = "rescribepreprocess" -	} -	verboselog.Println("Getting Queue URL for", qname) -	result, err := sqssvc.GetQueueUrl(&sqs.GetQueueUrlInput{ -		QueueName: aws.String(qname), -	}) -	if err != nil { -		log.Fatalln("Error getting queue URL for", qname, ":", err) -	} -	qurl := *result.QueueUrl - -	// concurrent walking upload based on example at -	// https://docs.aws.amazon.com/sdk-for-go/v1/developer-guide/sdk-utilities.html -	verboselog.Println("Walking", bookdir) -	walker := make(fileWalk) -	go func() { -		err = filepath.Walk(bookdir, walker.Walk) -		if err != nil { -			log.Fatalln("Filesystem walk failed:", err) -		} -		close(walker) -	}() - -	for path := range walker { -		verboselog.Println("Uploading", path) -		name := filepath.Base(path) -		file, err := os.Open(path) -		if err != nil { -			log.Fatalln("Open file", path, "failed:", err) -		} -		defer file.Close() -		_, err = uploader.Upload(&s3manager.UploadInput{ -			Bucket: aws.String("rescribeinprogress"), -			Key:    aws.String(filepath.Join(bookname, name)), -			Body:   file, -		}) -		if err != nil { -			log.Fatalln("Failed to upload", path, err) -		} -	} - -	verboselog.Println("Sending message", bookname, "to queue", qurl) -	_, err = sqssvc.SendMessage(&sqs.SendMessageInput{ -		MessageBody: aws.String(bookname), -		QueueUrl:    &qurl, -	}) -	if err != nil { -		log.Fatalln("Error adding book to queue:", err) -	} -} diff --git a/bookpipeline/cmd/confgraph/main.go b/bookpipeline/cmd/confgraph/main.go deleted file mode 100644 index b60821e..0000000 --- a/bookpipeline/cmd/confgraph/main.go +++ /dev/null @@ -1,71 +0,0 @@ -package main - -import ( -	"flag" -	"fmt" -	"log" -	"os" -	"path/filepath" -	"strings" - -	"rescribe.xyz/go.git/bookpipeline" -	"rescribe.xyz/go.git/lib/hocr" -) - -func walker(confs *[]*bookpipeline.Conf) filepath.WalkFunc { -	return func(path string, info os.FileInfo, err error) error { -		if info.IsDir() { -			return nil -		} -		if !strings.HasSuffix(path, ".hocr") { -			return nil -		} -		avg, err := hocr.GetAvgConf(path) -		if err != nil { -			return err -		} -		c := bookpipeline.Conf{ -			Conf: avg, -			Path: path, -		} -		*confs = append(*confs, &c) -		return nil -	} -} - -func main() { -	flag.Usage = func() { -		fmt.Fprintln(flag.CommandLine.Output(), "Usage: bookpipeline hocrdir graph.png") -		flag.PrintDefaults() -	} -	flag.Parse() - -	if flag.NArg() != 2 { -		flag.Usage() -		return -	} - -	var confs []*bookpipeline.Conf -	err := filepath.Walk(flag.Arg(0), walker(&confs)) -	if err != nil { -		log.Fatalln("Failed to walk", flag.Arg(0), err) -	} - -	// Structure to fit what bookpipeline.Graph needs -	// TODO: probably reorganise bookpipeline to just need []*Conf -	cconfs := make(map[string]*bookpipeline.Conf) -	for _, c := range confs { -		cconfs[c.Path] = c -	} - -	fn := flag.Arg(1) -	f, err := os.Create(fn) -	if err != nil { -		log.Fatalln("Error creating file", fn, err) -	} -	defer f.Close() -	err = bookpipeline.Graph(cconfs, filepath.Base(flag.Arg(0)), f) -	if err != nil { -		log.Fatalln("Error creating graph", err) -	} -} diff --git a/bookpipeline/cmd/getpipelinebook/main.go b/bookpipeline/cmd/getpipelinebook/main.go deleted file mode 100644 index 66e3f70..0000000 --- a/bookpipeline/cmd/getpipelinebook/main.go +++ /dev/null @@ -1,122 +0,0 @@ -package main - -import ( -	"bufio" -	"flag" -	"fmt" -	"log" -	"os" -	"path/filepath" - -	"rescribe.xyz/go.git/bookpipeline" -) - -const usage = "Usage: getpipelinebook [-a] [-v] bookname\n\nDownloads the pipeline results for a book.\n" - -// null writer to enable non-verbose logging to be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { -	return len(p), nil -} - -type Pipeliner interface { -	Init() error -	ListObjects(bucket string, prefix string) ([]string, error) -	Download(bucket string, key string, fn string) error -	Upload(bucket string, key string, path string) error -	CheckQueue(url string, timeout int64) (bookpipeline.Qmsg, error) -	AddToQueue(url string, msg string) error -	DelFromQueue(url string, handle string) error -	WIPStorageId() string -} - -func main() { -	all := flag.Bool("a", false, "Get all files for book, not just hOCR and analysis files") -	verbose := flag.Bool("v", false, "Verbose") -	flag.Usage = func() { -		fmt.Fprintf(flag.CommandLine.Output(), usage) -		flag.PrintDefaults() -	} -	flag.Parse() - -	if flag.NArg() < 1 { -		flag.Usage() -		return -	} - -	var verboselog *log.Logger -	if *verbose { -		verboselog = log.New(os.Stdout, "", log.LstdFlags) -	} else { -		var n NullWriter -		verboselog = log.New(n, "", log.LstdFlags) -	} - -	var conn Pipeliner -	conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} - -	verboselog.Println("Setting up AWS session") -	err := conn.Init() -	if err != nil { -		log.Fatalln("Error setting up cloud connection:", err) -	} -	verboselog.Println("Finished setting up AWS session") - -	bookname := flag.Arg(0) - -	err = os.MkdirAll(bookname, 0755) -	if err != nil { -		log.Fatalln("Failed to create directory", bookname, err) -	} - -	if *all { -		verboselog.Println("Downloading all files for", bookname) -		objs, err := conn.ListObjects(conn.WIPStorageId(), bookname) -		if err != nil { -			log.Fatalln("Failed to get list of files for book", bookname, err) -		} -		for _, i := range objs { -			verboselog.Println("Downloading", i) -			err = conn.Download(conn.WIPStorageId(), i, i) -			if err != nil { -				log.Fatalln("Failed to download file", i, err) -			} -		} -		return -	} - -	verboselog.Println("Downloading best file") -	fn := filepath.Join(bookname, "best") -	err = conn.Download(conn.WIPStorageId(), fn, fn) -	if err != nil { -		log.Fatalln("Failed to download 'best' file", err) -	} -	f, err := os.Open(fn) -	if err != nil { -		log.Fatalln("Failed to open best file", err) -	} -	defer f.Close() - -	verboselog.Println("Downloading HOCR files") -	s := bufio.NewScanner(f) -	for s.Scan() { -		fn = filepath.Join(bookname, s.Text()) -		verboselog.Println("Downloading file", fn) -		err = conn.Download(conn.WIPStorageId(), fn, fn) -		if err != nil { -			log.Fatalln("Failed to download file", fn, err) -		} -	} - -	analyses := []string{"conf", "graph.png"} -	verboselog.Println("Downloading analysis files") -	for _, a := range analyses { -		fn = filepath.Join(bookname, a) -		verboselog.Println("Downloading file", fn) -		err = conn.Download(conn.WIPStorageId(), fn, fn) -		if err != nil { -			log.Fatalln("Failed to download file", fn, err) -		} -	} -} diff --git a/bookpipeline/cmd/lspipeline/main.go b/bookpipeline/cmd/lspipeline/main.go deleted file mode 100644 index 46a1d63..0000000 --- a/bookpipeline/cmd/lspipeline/main.go +++ /dev/null @@ -1,250 +0,0 @@ -package main - -import ( -	"flag" -	"fmt" -	"log" -	"os/exec" -	"strings" - -	"rescribe.xyz/go.git/bookpipeline" -) - -const usage = `Usage: lspipeline [-i key] [-n num] - -Lists useful things related to the pipeline. - -- Instances running -- Messages in each queue -- Books not completed -- Books done -- Last n lines of bookpipeline logs from each running instance -` - -type LsPipeliner interface { -	Init() error -	PreQueueId() string -	WipeQueueId() string -	OCRQueueId() string -	AnalyseQueueId() string -	GetQueueDetails(url string) (string, string, error) -	GetInstanceDetails() ([]bookpipeline.InstanceDetails, error) -	ListObjects(bucket string, prefix string) ([]string, error) -	WIPStorageId() string -} - -// NullWriter is used so non-verbose logging may be discarded -type NullWriter bool - -func (w NullWriter) Write(p []byte) (n int, err error) { -	return len(p), nil -} - -type queueDetails struct { -	name, numAvailable, numInProgress string -} - -func getInstances(conn LsPipeliner, detailsc chan bookpipeline.InstanceDetails) { -	details, err := conn.GetInstanceDetails() -	if err != nil { -		log.Println("Error getting instance details:", err) -	} -	for _, d := range details { -		detailsc <- d -	} -	close(detailsc) -} - -func getQueueDetails(conn LsPipeliner, qdetails chan queueDetails) { -	queues := []struct{ name, id string }{ -		{"preprocess", conn.PreQueueId()}, -		{"wipeonly", conn.WipeQueueId()}, -		{"ocr", conn.OCRQueueId()}, -		{"analyse", conn.AnalyseQueueId()}, -	} -	for _, q := range queues { -		avail, inprog, err := conn.GetQueueDetails(q.id) -		if err != nil { -			log.Println("Error getting queue details:", err) -		} -		var qd queueDetails -		qd.name = q.name -		qd.numAvailable = avail -		qd.numInProgress = inprog -		qdetails <- qd -	} -	close(qdetails) -} - -// getBookStatus returns a list of in progress and done books. -// It determines this by listing all objects, and splitting the -// prefixes into two lists, those which have a 'graph.png' file, -// which are classed as done, and those which are not. -func getBookStatus(conn LsPipeliner) (inprogress []string, done []string, err error) { -	allfiles, err := conn.ListObjects(conn.WIPStorageId(), "") -	if err != nil { -		log.Println("Error getting list of objects:", err) -		return inprogress, done, err -	} -	for _, f := range allfiles { -		parts := strings.Split(f, "/") -		if parts[1] != "graph.png" { -			continue -		} -		prefix := parts[0] -		found := false -		for _, i := range done { -			if i == prefix { -				found = true -				continue -			} -		} -		if !found { -			done = append(done, prefix) -		} -	} - -	for _, f := range allfiles { -		parts := strings.Split(f, "/") -		prefix := parts[0] -		found := false -		for _, i := range done { -			if i == prefix { -				found = true -				continue -			} -		} -		for _, i := range inprogress { -			if i == prefix { -				found = true -				continue -			} -		} -		if !found { -			inprogress = append(inprogress, prefix) -		} -	} - -	return inprogress, done, err -} - -func getBookStatusChan(conn LsPipeliner, inprogressc chan string, donec chan string) { -	inprogress, done, err := getBookStatus(conn) -	if err != nil { -		log.Println("Error getting book status:", err) -		close(inprogressc) -		close(donec) -		return -	} -	for _, i := range inprogress { -		inprogressc <- i -	} -	close(inprogressc) -	for _, i := range done { -		donec <- i -	} -	close(donec) -} - -func getRecentSSHLogs(ip string, id string, n int) (string, error) { -	addr := fmt.Sprintf("%s@%s", "admin", ip) -	logcmd := fmt.Sprintf("journalctl -n %d -u bookpipeline", n) -	var cmd *exec.Cmd -	if id == "" { -		cmd = exec.Command("ssh", "-o", "StrictHostKeyChecking no", addr, logcmd) -	} else { -		cmd = exec.Command("ssh", "-o", "StrictHostKeyChecking no", "-i", id, addr, logcmd) -	} -	out, err := cmd.Output() -	if err != nil { -		return "", err -	} -	return string(out), nil -} - -func getRecentSSHLogsChan(ips []string, id string, lognum int, logs chan string) { -	for _, ip := range ips { -		sshlog, err := getRecentSSHLogs(ip, id, lognum) -		if err != nil { -			log.Printf("Error getting SSH logs for %s: %s\n", ip, err) -			continue -		} -		logs <- fmt.Sprintf("%s\n%s", ip, sshlog) -	} -	close(logs) -} - -func main() { -	keyfile := flag.String("i", "", "private key file for SSH") -	lognum := flag.Int("n", 5, "number of lines to include in SSH logs") -	flag.Usage = func() { -		fmt.Fprintf(flag.CommandLine.Output(), usage) -		flag.PrintDefaults() -	} -	flag.Parse() - -	var verboselog *log.Logger -	var n NullWriter -	verboselog = log.New(n, "", 0) - -	var conn LsPipeliner -	conn = &bookpipeline.AwsConn{Region: "eu-west-2", Logger: verboselog} -	err := conn.Init() -	if err != nil { -		log.Fatalln("Failed to set up cloud connection:", err) -	} - -	instances := make(chan bookpipeline.InstanceDetails, 100) -	queues := make(chan queueDetails) -	inprogress := make(chan string, 100) -	done := make(chan string, 100) -	logs := make(chan string, 10) - -	go getInstances(conn, instances) -	go getQueueDetails(conn, queues) -	go getBookStatusChan(conn, inprogress, done) - -	var ips []string - -	fmt.Println("# Instances") -	for i := range instances { -		fmt.Printf("ID: %s, Type: %s, LaunchTime: %s, State: %s", i.Id, i.Type, i.LaunchTime, i.State) -		if i.Name != "" { -			fmt.Printf(", Name: %s", i.Name) -		} -		if i.Ip != "" { -			fmt.Printf(", IP: %s", i.Ip) -			if i.State == "running" && i.Name != "workhorse" { -				ips = append(ips, i.Ip) -			} -		} -		if i.Spot != "" { -			fmt.Printf(", SpotRequest: %s", i.Spot) -		} -		fmt.Printf("\n") -	} - -	go getRecentSSHLogsChan(ips, *keyfile, *lognum, logs) - -	fmt.Println("\n# Queues") -	for i := range queues { -		fmt.Printf("%s: %s available, %s in progress\n", i.name, i.numAvailable, i.numInProgress) -	} - -	fmt.Println("\n# Books not completed") -	for i := range inprogress { -		fmt.Println(i) -	} - -	fmt.Println("\n# Books done") -	for i := range done { -		fmt.Println(i) -	} - -	if len(ips) > 0 { -		fmt.Println("\n# Recent logs") -		for i := range logs { -			fmt.Printf("\n%s", i) -		} -	} -} diff --git a/bookpipeline/cmd/mkpipeline/main.go b/bookpipeline/cmd/mkpipeline/main.go deleted file mode 100644 index e37a56d..0000000 --- a/bookpipeline/cmd/mkpipeline/main.go +++ /dev/null @@ -1,79 +0,0 @@ -package main - -// TODO: use the bookpipeline package for aws stuff -// TODO: set up iam role and policy needed for ec2 instances to access this stuff; -//       see arn:aws:iam::557852942063:policy/pipelinestorageandqueue -//       and arn:aws:iam::557852942063:role/pipeliner -// TODO: set up launch template for ec2 instances -// NOTE: potentially use json templates to define things, ala aws cli - -import ( -	"log" -	"os" - -	"github.com/aws/aws-sdk-go/aws" -	"github.com/aws/aws-sdk-go/aws/awserr" -	"github.com/aws/aws-sdk-go/aws/session" -	"github.com/aws/aws-sdk-go/service/s3" -	"github.com/aws/aws-sdk-go/service/sqs" -) - -func main() { -	if len(os.Args) != 1 { -		log.Fatal("Usage: mkpipeline\n\nSets up necessary S3 buckets and SQS queues for our AWS pipeline\n") -	} - -	sess, err := session.NewSession(&aws.Config{ -		Region: aws.String("eu-west-2"), -	}) -	if err != nil { -		log.Fatalf("Error: failed to set up aws session: %v\n", err) -	} -	s3svc := s3.New(sess) -	sqssvc := sqs.New(sess) - -	prefix := "rescribe" -	buckets := []string{"inprogress", "done"} -	queues := []string{"preprocess", "wipeonly", "ocr", "analyse"} - -	for _, bucket := range buckets { -		bname := prefix + bucket -		log.Printf("Creating bucket %s\n", bname) -		_, err = s3svc.CreateBucket(&s3.CreateBucketInput{ -			Bucket: aws.String(bname), -		}) -		if err != nil { -			aerr, ok := err.(awserr.Error) -			if ok && (aerr.Code() == s3.ErrCodeBucketAlreadyExists || aerr.Code() == s3.ErrCodeBucketAlreadyOwnedByYou) { -				log.Printf("Bucket %s already exists\n", bname) -			} else { -				log.Fatalf("Error creating bucket %s: %v\n", bname, err) -			} -		} -	} - -	for _, queue := range queues { -		qname := prefix + queue -		log.Printf("Creating queue %s\n", qname) -		_, err = sqssvc.CreateQueue(&sqs.CreateQueueInput{ -			QueueName: aws.String(qname), -			Attributes: map[string]*string{ -				"VisibilityTimeout":             aws.String("120"),     // 2 minutes -				"MessageRetentionPeriod":        aws.String("1209600"), // 14 days; max allowed by sqs -				"ReceiveMessageWaitTimeSeconds": aws.String("20"), -			}, -		}) -		if err != nil { -			aerr, ok := err.(awserr.Error) -			// Note the QueueAlreadyExists code is only emitted if an existing queue -			// has different attributes than the one that was being created. SQS just -			// quietly ignores the CreateQueue request if it is identical to an -			// existing queue. -			if ok && aerr.Code() == sqs.ErrCodeQueueNameExists { -				log.Fatalf("Error: Queue %s already exists but has different attributes\n", qname) -			} else { -				log.Fatalf("Error creating queue %s: %v\n", qname, err) -			} -		} -	} -} diff --git a/bookpipeline/graph.go b/bookpipeline/graph.go deleted file mode 100644 index 955abbd..0000000 --- a/bookpipeline/graph.go +++ /dev/null @@ -1,155 +0,0 @@ -package bookpipeline - -import ( -	"fmt" -	"io" -	"path/filepath" -	"sort" -	"strconv" -	"strings" - -	"github.com/wcharczuk/go-chart" -	"github.com/wcharczuk/go-chart/drawing" -) - -const maxticks = 40 -const goodCutoff = 70 -const mediumCutoff = 65 -const badCutoff = 60 - -type Conf struct { -	Path, Code string -	Conf       float64 -} - -type GraphConf struct { -	Pgnum, Conf float64 -} - -func createLine(xvalues []float64, y float64, c drawing.Color) chart.ContinuousSeries { -	var yvalues []float64 -	for range xvalues { -		yvalues = append(yvalues, y) -	} -	return chart.ContinuousSeries{ -		XValues: xvalues, -		YValues: yvalues, -		Style: chart.Style{ -			StrokeColor: c, -		}, -	} -} - -func Graph(confs map[string]*Conf, bookname string, w io.Writer) error { -	// Organise confs to sort them by page -	var graphconf []GraphConf -	for _, conf := range confs { -		name := filepath.Base(conf.Path) -		var numend int -		numend = strings.Index(name, "_") -		if numend == -1 { -			numend = strings.Index(name, ".") -		} -		pgnum, err := strconv.ParseFloat(name[0:numend], 64) -		if err != nil { -			continue -		} -		var c GraphConf -		c.Pgnum = pgnum -		c.Conf = conf.Conf -		graphconf = append(graphconf, c) -	} -	sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].Pgnum < graphconf[j].Pgnum }) - -	// Create main xvalues and yvalues, annotations and ticks -	var xvalues, yvalues []float64 -	var annotations []chart.Value2 -	var ticks []chart.Tick -	tickevery := len(graphconf) / maxticks -	if tickevery < 1 { -		tickevery = 1 -	} -	for i, c := range graphconf { -		xvalues = append(xvalues, c.Pgnum) -		yvalues = append(yvalues, c.Conf) -		if c.Conf < goodCutoff { -			annotations = append(annotations, chart.Value2{Label: fmt.Sprintf("%.0f", c.Pgnum), XValue: c.Pgnum, YValue: c.Conf}) -		} -		if i%tickevery == 0 { -			ticks = append(ticks, chart.Tick{c.Pgnum, fmt.Sprintf("%.0f", c.Pgnum)}) -		} -	} -	// make last tick the final page -	final := graphconf[len(graphconf)-1] -	ticks[len(ticks)-1] = chart.Tick{final.Pgnum, fmt.Sprintf("%.0f", final.Pgnum)} -	mainSeries := chart.ContinuousSeries{ -		XValues: xvalues, -		YValues: yvalues, -	} - -	// Create lines -	goodCutoffSeries := createLine(xvalues, goodCutoff, chart.ColorAlternateGreen) -	mediumCutoffSeries := createLine(xvalues, mediumCutoff, chart.ColorOrange) -	badCutoffSeries := createLine(xvalues, badCutoff, chart.ColorRed) - -	// Create lines marking top and bottom 10% confidence -	sort.Slice(graphconf, func(i, j int) bool { return graphconf[i].Conf < graphconf[j].Conf }) -	lowconf := graphconf[int(len(graphconf)/10)].Conf -	highconf := graphconf[int((len(graphconf)/10)*9)].Conf -	yvalues = []float64{} -	for range graphconf { -		yvalues = append(yvalues, lowconf) -	} -	minSeries := &chart.ContinuousSeries{ -		Style: chart.Style{ -			StrokeColor:     chart.ColorAlternateGray, -			StrokeDashArray: []float64{5.0, 5.0}, -		}, -		XValues: xvalues, -		YValues: yvalues, -	} -	yvalues = []float64{} -	for _ = range graphconf { -		yvalues = append(yvalues, highconf) -	} -	maxSeries := &chart.ContinuousSeries{ -		Style: chart.Style{ -			StrokeColor:     chart.ColorAlternateGray, -			StrokeDashArray: []float64{5.0, 5.0}, -		}, -		XValues: xvalues, -		YValues: yvalues, -	} - -	graph := chart.Chart{ -		Title:      bookname, -		Width:      3840, -		Height:     2160, -		XAxis: chart.XAxis{ -			Name:      "Page number", -			Range: &chart.ContinuousRange{ -				Min: 0.0, -			}, -			Ticks: ticks, -		}, -		YAxis: chart.YAxis{ -			Name:      "Confidence", -			Range: &chart.ContinuousRange{ -				Min: 0.0, -				Max: 100.0, -			}, -		}, -		Series: []chart.Series{ -			mainSeries, -			minSeries, -			maxSeries, -			goodCutoffSeries, -			mediumCutoffSeries, -			badCutoffSeries, -			chart.AnnotationSeries{ -				Annotations: annotations, -			}, -		}, -	} -	return graph.Render(chart.PNG, w) -} diff --git a/bucket-lines/bucket.go b/bucket-lines/bucket.go deleted file mode 100644 index 9f98887..0000000 --- a/bucket-lines/bucket.go +++ /dev/null @@ -1,131 +0,0 @@ -package main - -import ( -	"fmt" -	"io" -	"os" -	"path/filepath" -	"sort" -	"strconv" - -	"rescribe.xyz/go.git/lib/line" -) - -type BucketSpec struct { -	Min  float64 -	Name string -} -type BucketSpecs []BucketSpec - -func (b BucketSpecs) Len() int           { return len(b) } -func (b BucketSpecs) Swap(i, j int)      { b[i], b[j] = b[j], b[i] } -func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } - -type BucketStat struct { -	name string -	num  int -} -type BucketStats []BucketStat - -func (b BucketStats) Len() int           { return len(b) } -func (b BucketStats) Swap(i, j int)      { b[i], b[j] = b[j], b[i] } -func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } - -// Copies the image and text for a line into a directory based on -// the line confidence, as defined by the buckets struct -func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) { -	var bucket string - -	todir := "" -	for _, b := range buckets { -		if l.Avgconf >= b.Min { -			todir = b.Name -			bucket = b.Name -		} -	} - -	if todir == "" { -		return bucket, nil -	} - -	avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) -	if len(avgstr) > 2 { -		avgstr = avgstr[2:] -	} - -	base := filepath.Join(dirname, todir, l.OcrName+"_"+l.Name+"_"+avgstr) - -	err := os.MkdirAll(filepath.Join(dirname, todir), 0700) -	if err != nil { -		return bucket, err -	} - -	f, err := os.Create(base + ".png") -	if err != nil { -		return bucket, err -	} -	defer f.Close() - -	err = l.Img.CopyLineTo(f) -	if err != nil { -		return bucket, err -	} - -	f, err = os.Create(base + ".txt") -	if err != nil { -		return bucket, err -	} -	defer f.Close() - -	_, err = io.WriteString(f, l.Text) -	if err != nil { -		return bucket, err -	} - -	return bucket, err -} - -// Copies line images and text into directories based on their -// confidence, as defined by the buckets struct, and returns -// statistics of whire lines went in the process. -func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) { -	var all []string -	var stats BucketStats - -	sort.Sort(lines) -	sort.Sort(buckets) -	for _, l := range lines { -		bname, err := bucketLine(l, buckets, dirname) -		if err != nil { -			return stats, err -		} -		all = append(all, bname) -	} - -	for _, b := range all { -		i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) -		if i == len(stats) { -			newstat := BucketStat{b, 0} -			stats = append(stats, newstat) -			i = len(stats) - 1 -		} -		stats[i].num++ -	} - -	return stats, nil -} - -// Prints statistics of where lines went when bucketing -func PrintBucketStats(w io.Writer, stats BucketStats) { -	var total int -	for _, s := range stats { -		total += s.num -	} - -	fmt.Fprintf(w, "Copied %d lines\n", total) -	fmt.Fprintf(w, "---------------------------------\n") -	sort.Sort(stats) -	for _, s := range stats { -		fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100*s.num/total) -	} -} diff --git a/bucket-lines/main.go b/bucket-lines/main.go deleted file mode 100644 index 990e84c..0000000 --- a/bucket-lines/main.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( -	"encoding/json" -	"flag" -	"fmt" -	"io/ioutil" -	"log" -	"os" -	"path/filepath" - -	"rescribe.xyz/go.git/lib/hocr" -	"rescribe.xyz/go.git/lib/line" -	"rescribe.xyz/go.git/lib/prob" -) - -func main() { -	b := BucketSpecs{ -		// minimum confidence, name -		{0, "bad"}, -		{0.95, "95to98"}, -		{0.98, "98plus"}, -	} - -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") -		fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") -		fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") -		fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") -		fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") -		fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") -		fmt.Fprintf(os.Stderr, "option.\n") -		fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") -		fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") -		flag.PrintDefaults() -		fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") -		fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") -	} -	dir := flag.String("d", "buckets", "Directory to store the buckets") -	specs := flag.String("s", "", "JSON file describing specs to bucket into") -	flag.Parse() -	if flag.NArg() < 1 { -		flag.Usage() -		os.Exit(1) -	} - -	if *specs != "" { -		js, err := ioutil.ReadFile(*specs) -		if err != nil { -			log.Fatal(err) -		} -		err = json.Unmarshal(js, &b) -		if err != nil { -			log.Fatal(err) -		} -	} - -	var err error -	lines := make(line.Details, 0) - -	for _, f := range flag.Args() { -		var newlines line.Details -		switch ext := filepath.Ext(f); ext { -		case ".prob": -			newlines, err = prob.GetLineDetails(f) -		case ".hocr": -			newlines, err = hocr.GetLineDetails(f) -		default: -			log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) -			continue -		} -		if err != nil { -			log.Fatal(err) -		} - -		for _, l := range newlines { -			lines = append(lines, l) -		} -	} - -	stats, err := BucketUp(lines, b, *dir) -	if err != nil { -		log.Fatal(err) -	} - -	PrintBucketStats(os.Stdout, stats) -} diff --git a/preproc/cmd/binarize/main.go b/cmd/binarize/main.go index e7f677e..301e42b 100644 --- a/preproc/cmd/binarize/main.go +++ b/cmd/binarize/main.go @@ -10,7 +10,7 @@ import (  	"log"  	"os" -	"rescribe.xyz/go.git/preproc" +	"rescribe.xyz/preproc"  )  // TODO: do more testing to see how good this assumption is diff --git a/preproc/cmd/preproc/main.go b/cmd/preproc/main.go index 1c248e0..5d71a62 100644 --- a/preproc/cmd/preproc/main.go +++ b/cmd/preproc/main.go @@ -12,7 +12,7 @@ import (  	"log"  	"os" -	"rescribe.xyz/go.git/preproc" +	"rescribe.xyz/preproc"  )  // TODO: do more testing to see how good this assumption is diff --git a/preproc/cmd/preprocmulti/main.go b/cmd/preprocmulti/main.go index c6c9fe4..eb9c018 100644 --- a/preproc/cmd/preprocmulti/main.go +++ b/cmd/preprocmulti/main.go @@ -12,8 +12,8 @@ import (  	"log"  	"os" -	"rescribe.xyz/go.git/integralimg" -	"rescribe.xyz/go.git/preproc" +	"rescribe.xyz/preproc" +	"rescribe.xyz/preproc/integralimg"  )  // TODO: do more testing to see how good this assumption is diff --git a/preproc/cmd/wipe/main.go b/cmd/wipe/main.go index e5c039d..6254946 100644 --- a/preproc/cmd/wipe/main.go +++ b/cmd/wipe/main.go @@ -10,7 +10,7 @@ import (  	"log"  	"os" -	"rescribe.xyz/go.git/preproc" +	"rescribe.xyz/preproc"  )  func main() { diff --git a/dehyphenate/main.go b/dehyphenate/main.go deleted file mode 100644 index 4393c8f..0000000 --- a/dehyphenate/main.go +++ /dev/null @@ -1,63 +0,0 @@ -package main - -import ( -	"encoding/xml" -	"flag" -	"fmt" -	"io/ioutil" -	"log" -	"os" - -	"rescribe.xyz/go.git/lib/hocr" -) - -// BUGS: -// - loses all elements not captured in hocr structure such as html headings -//   might be best to copy the header and footer separately and put the hocr in between, but would still need to ensure all elements are captured -// - loses any formatting; doesn't need to be identical, but e.g. linebreaks after elements would be handy -// - need to handle OcrChar - -func main() { -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: dehyphenate hocrin hocrout\n") -		fmt.Fprintf(os.Stderr, "Dehyphenates a hocr file.\n") -		flag.PrintDefaults() -	} -	flag.Parse() -	if flag.NArg() != 2 { -		flag.Usage() -		os.Exit(1) -	} - -	in, err := ioutil.ReadFile(flag.Arg(0)) -	if err != nil { -		log.Fatalf("Error reading %s: %v", flag.Arg(1), err) -	} -	h, err := hocr.Parse(in) -	if err != nil { -		log.Fatal(err) -	} - -	for i, l := range h.Lines { -		w := l.Words[len(l.Words)-1] -		if len(w.Chars) == 0 { -			if len(w.Text) > 0 && w.Text[len(w.Text) - 1] == '-' { -				h.Lines[i].Words[len(l.Words)-1].Text = w.Text[0:len(w.Text)-1] + h.Lines[i+1].Words[0].Text -				h.Lines[i+1].Words[0].Text = "" -			} -		} else { -			log.Printf("TODO: handle OcrChar") -		} -	} - -	f, err := os.Create(flag.Arg(1)) -	if err != nil { -		log.Fatalf("Error creating file %s: %v", flag.Arg(1), err) -	} -	defer f.Close() -	e := xml.NewEncoder(f) -	err = e.Encode(h) -	if err != nil { -		log.Fatalf("Error encoding XML: %v", err) -	} -} diff --git a/eeboxmltohocr/main.go b/eeboxmltohocr/main.go deleted file mode 100644 index 2761cd9..0000000 --- a/eeboxmltohocr/main.go +++ /dev/null @@ -1,135 +0,0 @@ -package main - -import ( -	"bufio" -	"flag" -	"fmt" -	"io" -	"log" -	"os" -	"regexp" -	"strconv" -	"strings" -) - -// splitByPb is a split function for the scanner that splits by the -// '<pb' token. -func splitByPb(data []byte, atEOF bool) (advance int, token []byte, err error) { -	if atEOF && len(data) == 0 { -		return 0, nil, nil -	} -	if i := strings.Index(string(data[:]), "<pb"); i >= 0 { -		return i + 1, data[0:i], nil -	} -	// If we're at EOF, we have a final section, so just return the lot. -	if atEOF { -		return len(data), data, nil -	} -	// Request more data. -	return 0, nil, nil -} - -type Page struct { -	number int -	text   string -} - -func addPage(pgs *[]Page, number int, text string) { -	added := 0 -	for i, pg := range *pgs { -		if pg.number == number { -			(*pgs)[i].text = pg.text + text -			added = 1 -		} -	} -	if added == 0 { -		newpg := Page{number, text} -		*pgs = append(*pgs, newpg) -	}	 -} - -func main() { -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: eeboxmltohocr in.xml outbase\n") -		flag.PrintDefaults() -	} -	flag.Parse() -	if flag.NArg() < 2 { -		flag.Usage() -		os.Exit(1) -	} - -	f, err := os.Open(flag.Arg(0)) -	defer f.Close() -	if err != nil { -		log.Fatalf("Could not open file %s: %v\n", flag.Arg(0), err) -	} -	scanner := bufio.NewScanner(f) - -	scanner.Split(splitByPb) - -	var pgs []Page - -	for scanner.Scan() { -		t := scanner.Text() -		r := regexp.MustCompile(`pb [^>]*facs="tcp:.*?:(.*?)"`).FindStringSubmatch(t) -		if len(r) <= 1 { -			continue -		} -		pgnum, err := strconv.Atoi(r[1]) -		if err != nil { -			continue -		} - -		content := t[strings.Index(t, ">")+1:] -		ungap := regexp.MustCompile(`(?s)<gap[ >].+?</gap>`).ReplaceAllString(content, "") -		unxml := regexp.MustCompile(`<.+?>`).ReplaceAllString(ungap, "") - -		finaltxt := strings.TrimLeft(unxml, " \n") -		if len(finaltxt) == 0 { -			continue -		} - -		addPage(&pgs, pgnum, finaltxt) -	} - -	for _, pg := range pgs { -		fn := fmt.Sprintf("%s-%03d.hocr", flag.Arg(1), pg.number - 1) -		f, err := os.Create(fn) -		if err != nil { -			log.Fatalf("Could not create file %s: %v\n", fn, err) -		} -		defer f.Close() - -		_, err = io.WriteString(f, hocrHeader + pg.text + hocrFooter) -		if err != nil { -			log.Fatalf("Could not write file %s: %v\n", fn, err) -		} -	} -} - -const hocrHeader = `<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" -    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> - <head> -  <title></title> -  <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/> -  <meta name='ocr-system' content='tesseract 4.0.0' /> -  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/> - </head> - <body> -  <div class='ocr_page' id='page_1' title='bbox 0 0 600 1200'> -   <div class='ocr_carea' id='block_1_1' title="bbox 0 0 600 1200"> -    <p class='ocr_par' id='par_1_1' lang='lat' title="bbox 0 0 600 1200"> -     <span class='ocr_line' id='line_1_1' title="bbox 0 0 600 1200" -> -      <span class='ocrx_word' id='word_1_1' title='bbox 0 0 600 1200'>` - -const hocrFooter = `</span> -     </span> -    </p> -   </div> -  </div> - </body> -</html>` diff --git a/hocrtotxt/main.go b/hocrtotxt/main.go deleted file mode 100644 index 6821a9e..0000000 --- a/hocrtotxt/main.go +++ /dev/null @@ -1,30 +0,0 @@ -package main - -import ( -	"flag" -	"fmt" -	"log" -	"os" - -	"rescribe.xyz/go.git/lib/hocr" -) - -func main() { -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: hocrtotxt hocrfile\n") -		fmt.Fprintf(os.Stderr, "Prints the text from a hocr file.\n") -		flag.PrintDefaults() -	} -	flag.Parse() -	if flag.NArg() != 1 { -		flag.Usage() -		os.Exit(1) -	} - -	text, err := hocr.GetText(flag.Arg(0)) -	if err != nil { -		log.Fatal(err) -	} - -	fmt.Printf("%s\n", text) -} diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go deleted file mode 100644 index dcd0494..0000000 --- a/lib/hocr/hocr.go +++ /dev/null @@ -1,129 +0,0 @@ -package hocr - -import ( -	"encoding/xml" -	"errors" -	"io/ioutil" -	"regexp" -	"strconv" -	"strings" -) - -type Hocr struct { -	Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { -	Class string    `xml:"class,attr"` -	Id    string    `xml:"id,attr"` -	Title string    `xml:"title,attr"` -	Words []OcrWord `xml:"span"` -	Text  string    `xml:",chardata"` -} - -type OcrWord struct { -	Class string    `xml:"class,attr"` -	Id    string    `xml:"id,attr"` -	Title string    `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text  string    `xml:",chardata"` -} - -type OcrChar struct { -	Class string    `xml:"class,attr"` -	Id    string    `xml:"id,attr"` -	Title string    `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text  string    `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { -	re, err := regexp.Compile(`x_wconf ([0-9.]+)`) -	if err != nil { -		return 0.0, err -	} -	conf := re.FindStringSubmatch(s) -	return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { -	var coords [4]int -	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) -	if err != nil { -		return coords, err -	} -	coordstr := re.FindStringSubmatch(s) -	for i := range coords { -		c, err := strconv.Atoi(coordstr[i+1]) -		if err != nil { -			return coords, err -		} -		coords[i] = c -	} -	return coords, nil -} - -func noText(s string) bool { -	t := strings.Trim(s, " \n") -	return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { -	var hocr Hocr - -	err := xml.Unmarshal(b, &hocr) -	if err != nil { -		return hocr, err -	} - -	return hocr, nil -} - -func GetText(hocrfn string) (string, error) { -	var s string - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return s, err -	} - -	h, err := Parse(file) -	if err != nil { -		return s, err -	} - - -	for _, l := range h.Lines { -		s += getLineText(l) -	} -	return s, nil -} - -func GetAvgConf(hocrfn string) (float64, error) { -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return 0, err -	} - -	h, err := Parse(file) -	if err != nil { -		return 0, err -	} - -	var total, num float64 -	for _, l := range h.Lines { -		for _, w := range l.Words { -			c, err := wordConf(w.Title) -			if err != nil { -				return 0, err -			} -			total += c -			num++ -		} -	} -	if num == 0 { -		return 0, errors.New("No words found") -	} -	return total / num, nil -} diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go deleted file mode 100644 index 74e8f9a..0000000 --- a/lib/hocr/lines.go +++ /dev/null @@ -1,131 +0,0 @@ -package hocr - -// TODO: Parse line name to zero pad line numbers, so they can -//       be sorted easily - -import ( -	"image" -	"image/png" -	"io/ioutil" -	"log" -	"os" -	"path/filepath" -	"strings" - -	"rescribe.xyz/go.git/lib/line" -) - -func getLineText(l OcrLine) (string) { -	linetext := "" - -	linetext = l.Text -	if noText(linetext) { -		linetext = "" -		for _, w := range l.Words { -			if w.Class != "ocrx_word" { -				continue -			} -			linetext += w.Text + " " -		} -	} -	if noText(linetext) { -		linetext = "" -		for _, w := range l.Words { -			if w.Class != "ocrx_word" { -				continue -			} -			for _, c := range w.Chars { -				if c.Class != "ocrx_cinfo" { -					continue -				} -				linetext += c.Text -			} -			linetext += " " -		} -	} -	linetext = strings.TrimRight(linetext, " ") -	linetext += "\n" -	return linetext -} - -func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) { -	lines := make(line.Details, 0) - -	for _, l := range h.Lines { -		totalconf := float64(0) -		num := 0 -		for _, w := range l.Words { -			c, err := wordConf(w.Title) -			if err != nil { -				return lines, err -			} -			num++ -			totalconf += c -		} - -		coords, err := boxCoords(l.Title) -		if err != nil { -			return lines, err -		} - -		var ln line.Detail -		ln.Name = l.Id -		ln.Avgconf = (totalconf / float64(num)) / 100 -		ln.Text = getLineText(l) -		ln.OcrName = name -		if i != nil { -			var imgd line.ImgDirect -			imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) -			ln.Img = imgd -		} -		lines = append(lines, ln) -	} -	return lines, nil -} - -func GetLineDetails(hocrfn string) (line.Details, error) { -	var newlines line.Details - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return newlines, err -	} - -	h, err := Parse(file) -	if err != nil { -		return newlines, err -	} - -	var img image.Image -	pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) -	pngf, err := os.Open(pngfn) -	if err != nil { -		log.Println("Warning: can't open image %s\n", pngfn) -	} else { -		defer pngf.Close() -		img, err = png.Decode(pngf) -		if err != nil { -			log.Println("Warning: can't load image %s\n", pngfn) -		} -	} - -	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) -	return parseLineDetails(h, img, n) -} - -func GetLineBasics(hocrfn string) (line.Details, error) { -	var newlines line.Details - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return newlines, err -	} - -	h, err := Parse(file) -	if err != nil { -		return newlines, err -	} - -	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) -	return parseLineDetails(h, nil, n) -} diff --git a/lib/line/line.go b/lib/line/line.go deleted file mode 100644 index d4e3e44..0000000 --- a/lib/line/line.go +++ /dev/null @@ -1,57 +0,0 @@ -package line - -import ( -	"image" -	"image/png" -	"io" -	"os" -) - -type Detail struct { -	Name    string -	Avgconf float64 -	Img     CopyableImg -	Text    string -	OcrName string -} - -type CopyableImg interface { -	CopyLineTo(io.Writer) error -} - -type Details []Detail - -func (l Details) Len() int           { return len(l) } -func (l Details) Less(i, j int) bool { return l[i].Avgconf < l[j].Avgconf } -func (l Details) Swap(i, j int)      { l[i], l[j] = l[j], l[i] } - -// This is an implementation of the CopyableImg interface that -// stores the image directly as an image.Image -type ImgDirect struct { -	Img image.Image -} - -func (i ImgDirect) CopyLineTo(w io.Writer) error { -	err := png.Encode(w, i.Img) -	if err != nil { -		return err -	} -	return nil -} - -// This is an implementation of the CopyableImg interface that -// stores the path of an image -type ImgPath struct { -	Path string -} - -func (i ImgPath) CopyLineTo(w io.Writer) error { -	f, err := os.Open(i.Path) -	if err != nil { -		return err -	} -	defer f.Close() - -	_, err = io.Copy(w, f) -	return err -} diff --git a/lib/prob/prob.go b/lib/prob/prob.go deleted file mode 100644 index 31a484d..0000000 --- a/lib/prob/prob.go +++ /dev/null @@ -1,69 +0,0 @@ -package prob - -import ( -	"io/ioutil" -	"path/filepath" -	"strconv" -	"strings" - -	"rescribe.xyz/go.git/lib/line" -) - -func getLineAvg(f string) (float64, error) { -	totalconf := float64(0) -	num := 0 - -	prob, err := ioutil.ReadFile(f) -	if err != nil { -		return 0, err -	} - -	for _, l := range strings.Split(string(prob), "\n") { -		fields := strings.Fields(l) - -		if len(fields) == 2 { -			conf, err := strconv.ParseFloat(fields[1], 64) -			if err != nil { -				continue -			} -			totalconf += conf -			num += 1 -		} -	} -	if num <= 0 { -		return 0, nil -	} -	avg := totalconf / float64(num) -	return avg, nil -} - -// Note this only processes one line at a time -func GetLineDetails(probfn string) (line.Details, error) { -	var l line.Detail -	lines := make(line.Details, 0) - -	avg, err := getLineAvg(probfn) -	if err != nil { -		return lines, err -	} - -	filebase := strings.Replace(probfn, ".prob", "", 1) - -	txt, err := ioutil.ReadFile(filebase + ".txt") -	if err != nil { -		return lines, err -	} - -	l.Name = filepath.Base(filebase) -	l.Avgconf = avg -	l.Text = string(txt) -	l.OcrName = filepath.Base(filepath.Dir(filebase)) - -	var imgfn line.ImgPath -	imgfn.Path = filebase + ".bin.png" -	l.Img = imgfn - -	lines = append(lines, l) - -	return lines, nil -} diff --git a/pgconf/main.go b/pgconf/main.go deleted file mode 100644 index bc09c23..0000000 --- a/pgconf/main.go +++ /dev/null @@ -1,30 +0,0 @@ -package main - -import ( -	"flag" -	"fmt" -	"log" -	"os" - -	"rescribe.xyz/go.git/lib/hocr" -) - -func main() { -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: pgconf hocr\n") -		fmt.Fprintf(os.Stderr, "Prints the total confidence for a page, as an average of the confidence of each word.\n") -		flag.PrintDefaults() -	} -	flag.Parse() -	if flag.NArg() != 1 { -		flag.Usage() -		os.Exit(1) -	} - -	avg, err := hocr.GetAvgConf(flag.Arg(0)) -	if err != nil { -		log.Fatalf("Error retreiving confidence for %s: %v\n", flag.Arg(0), err) -	} - -	fmt.Printf("%0.0f\n", avg) -} diff --git a/preproc/preprocmulti.go b/preprocmulti.go index 2e7cb06..7d5cbf5 100644 --- a/preproc/preprocmulti.go +++ b/preprocmulti.go @@ -11,7 +11,7 @@ import (  	"os"  	"strings" -	"rescribe.xyz/go.git/integralimg" +	"rescribe.xyz/preproc/integralimg"  )  // TODO: do more testing to see how good this assumption is diff --git a/preproc/sauvola.go b/sauvola.go index 046bb7d..3ba4359 100644 --- a/preproc/sauvola.go +++ b/sauvola.go @@ -4,7 +4,7 @@ import (  	"image"  	"image/color" -	"rescribe.xyz/go.git/integralimg" +	"rescribe.xyz/preproc/integralimg"  )  // Implements Sauvola's algorithm for text binarization, see paper diff --git a/preproc/sauvola_test.go b/sauvola_test.go index 2331e10..2331e10 100644 --- a/preproc/sauvola_test.go +++ b/sauvola_test.go diff --git a/preproc/test_helpers.go b/test_helpers.go index 20de5b1..20de5b1 100644 --- a/preproc/test_helpers.go +++ b/test_helpers.go diff --git a/preproc/testdata/pg1.png b/testdata/pg1.png Binary files differindex 2bcc4b1..2bcc4b1 100644 --- a/preproc/testdata/pg1.png +++ b/testdata/pg1.png diff --git a/preproc/testdata/pg1_integralsauvola_k0.3_w19.png b/testdata/pg1_integralsauvola_k0.3_w19.png Binary files differindex bdf5712..bdf5712 100644 --- a/preproc/testdata/pg1_integralsauvola_k0.3_w19.png +++ b/testdata/pg1_integralsauvola_k0.3_w19.png diff --git a/preproc/testdata/pg1_integralsauvola_k0.5_w19.png b/testdata/pg1_integralsauvola_k0.5_w19.png Binary files differindex 5db2d9a..5db2d9a 100644 --- a/preproc/testdata/pg1_integralsauvola_k0.5_w19.png +++ b/testdata/pg1_integralsauvola_k0.5_w19.png diff --git a/preproc/testdata/pg1_integralsauvola_k0.5_w41.png b/testdata/pg1_integralsauvola_k0.5_w41.png Binary files differindex 050d037..050d037 100644 --- a/preproc/testdata/pg1_integralsauvola_k0.5_w41.png +++ b/testdata/pg1_integralsauvola_k0.5_w41.png diff --git a/preproc/testdata/pg1_sauvola_k0.3_w19.png b/testdata/pg1_sauvola_k0.3_w19.png Binary files differindex bcd595f..bcd595f 100644 --- a/preproc/testdata/pg1_sauvola_k0.3_w19.png +++ b/testdata/pg1_sauvola_k0.3_w19.png diff --git a/preproc/testdata/pg1_sauvola_k0.5_w19.png b/testdata/pg1_sauvola_k0.5_w19.png Binary files differindex 8de596c..8de596c 100644 --- a/preproc/testdata/pg1_sauvola_k0.5_w19.png +++ b/testdata/pg1_sauvola_k0.5_w19.png diff --git a/preproc/testdata/pg1_sauvola_k0.5_w41.png b/testdata/pg1_sauvola_k0.5_w41.png Binary files differindex b8f50e0..b8f50e0 100644 --- a/preproc/testdata/pg1_sauvola_k0.5_w41.png +++ b/testdata/pg1_sauvola_k0.5_w41.png diff --git a/preproc/testdata/pg2.png b/testdata/pg2.png Binary files differindex c7c4249..c7c4249 100644 --- a/preproc/testdata/pg2.png +++ b/testdata/pg2.png diff --git a/preproc/testdata/pg2_integralwipesides_t0.02_w5.png b/testdata/pg2_integralwipesides_t0.02_w5.png Binary files differindex 6b4ccb2..6b4ccb2 100644 --- a/preproc/testdata/pg2_integralwipesides_t0.02_w5.png +++ b/testdata/pg2_integralwipesides_t0.02_w5.png diff --git a/preproc/testdata/pg2_integralwipesides_t0.05_w25.png b/testdata/pg2_integralwipesides_t0.05_w25.png Binary files differindex 39dc88d..39dc88d 100644 --- a/preproc/testdata/pg2_integralwipesides_t0.05_w25.png +++ b/testdata/pg2_integralwipesides_t0.05_w25.png diff --git a/preproc/testdata/pg2_integralwipesides_t0.05_w5.png b/testdata/pg2_integralwipesides_t0.05_w5.png Binary files differindex 50df855..50df855 100644 --- a/preproc/testdata/pg2_integralwipesides_t0.05_w5.png +++ b/testdata/pg2_integralwipesides_t0.05_w5.png diff --git a/preproc/util.go b/util.go index e23829d..e23829d 100644 --- a/preproc/util.go +++ b/util.go diff --git a/preproc/wipesides.go b/wipesides.go index 3d08053..8cd2060 100644 --- a/preproc/wipesides.go +++ b/wipesides.go @@ -13,7 +13,7 @@ import (  	"image/png"  	"os" -	"rescribe.xyz/go.git/integralimg" +	"rescribe.xyz/preproc/integralimg"  )  // returns the proportion of the given window that is black pixels diff --git a/preproc/wipesides_test.go b/wipesides_test.go index d5464e0..d5464e0 100644 --- a/preproc/wipesides_test.go +++ b/wipesides_test.go  | 
