summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README4
-rw-r--r--aws.go1
-rw-r--r--cloudsettings.go12
-rw-r--r--doc.go20
4 files changed, 17 insertions, 20 deletions
diff --git a/README b/README
index e5e918f..5da6c6b 100644
--- a/README
+++ b/README
@@ -11,7 +11,9 @@ by running `go get rescribe.xyz/bookpipeline/...`
The commands in the cmd/ directory are at the heart of this
package. For more details on their usage, use `go doc` or read
-doc.go in the package repository. The key commands are:
+doc.go in the package repository.
+
+The key commands for the virtual server side are:
- bookpipeline : processes items from queues, doing preprocessing,
ocr and postprocessing, and moving items on to
diff --git a/aws.go b/aws.go
index 2015d9c..70189ea 100644
--- a/aws.go
+++ b/aws.go
@@ -450,6 +450,7 @@ func (a *AwsConn) Log(v ...interface{}) {
}
// mkpipeline sets up necessary buckets and queues for the pipeline
+// TODO: also set up the necessary security group and IAM profile
func (a *AwsConn) MkPipeline() error {
buckets := []string{storageWip}
queues := []string{queuePreProc, queueWipeOnly, queueAnalyse, queueOcrPage}
diff --git a/cloudsettings.go b/cloudsettings.go
index 0cf1777..fa60238 100644
--- a/cloudsettings.go
+++ b/cloudsettings.go
@@ -7,7 +7,13 @@ package bookpipeline
// This file contains various cloud account specific stuff; change this if
// you want to use the cloud functionality on your own site.
-// Spot instance details
+// Spot instance details.
+// The profile needs to grant access to the S3 buckets and SQS queues
+// listed below. The Sg (security group) doesn't need any permissions
+// beyond SSH, which is optional. The image should have bookpipeline
+// installed and, ideally, auto-updating.
+// TODO: release the Ansible repository which creates the AMI.
+// TODO: create profile and security group with mkpipeline
const (
spotProfile = "arn:aws:iam::557852942063:instance-profile/pipeliner"
spotImage = "ami-0bc6ef6900f6da5d3"
@@ -15,7 +21,7 @@ const (
spotSg = "sg-0be8a3ab89e7136b9"
)
-// Queue names
+// Queue names. Can be anything unique in SQS.
const (
queuePreProc = "rescribepreprocess"
queueWipeOnly = "rescribewipeonly"
@@ -23,7 +29,7 @@ const (
queueAnalyse = "rescribeanalyse"
)
-// Storage bucket names
+// Storage bucket names. Can be anything unique in S3.
const (
storageWip = "rescribeinprogress"
)
diff --git a/doc.go b/doc.go
index b7d9d99..59037cd 100644
--- a/doc.go
+++ b/doc.go
@@ -5,6 +5,8 @@
/*
Package bookpipeline contains various tools and functions for the OCR of
books, with a focus on distributed OCR using short-lived virtual servers.
+It also contains several tools that are useful standalone; read the
+accompanying README for more details.
Introduction
@@ -25,8 +27,8 @@ what they do and how they work with the '-h' flag, so for example to get usage
information on the booktopipeline tool simply run the following:
booktopipeline -h
-You'll also need to set up your ~/.aws/credentials appropriately so that the
-tools work.
+To get the pipeline tools to work for you, you'll need to change the settings
+in cloudsettings.go, and set up your ~/.aws/credentials appropriately.
Managing servers
@@ -127,20 +129,6 @@ which have been prebinarised.
example message: APolishGentleman_MemoirByAdamKruczkiewicz
example message: APolishGentleman_MemoirByAdamKruczkiewicz rescribefrav2
-rescribeocr
-
-This queue is no longer used, as it could result in processes that took more
-than 12 hours to complete, which was unreliable with SQS. Instead pages are
-submitted individually to the rescribeocrpage by the preprocess and wipe
-functions, which has the added advantage that different pages can be processed
-in parallel on different servers, enabling books to be processed significantly
-faster. The code for processing books from the rescribeocr queue is still
-present in bookpipeline, and the queue is still checked, but it is not
-expected to be used.
-
- example message: APolishGentleman_MemoirByAdamKruczkiewicz
- example message: APolishGentleman_MemoirByAdamKruczkiewicz rescribefrav2
-
rescribeocrpage
This queue contains the path of individual pages, optionally followed by