summaryrefslogtreecommitdiff
path: root/pipelinepreprocess/main.go
diff options
context:
space:
mode:
authorNick White <git@njw.name>2019-08-13 18:33:32 +0100
committerNick White <git@njw.name>2019-08-13 18:33:32 +0100
commit312dcbe96e45330e933f7d542e3b2ef2bf76ec08 (patch)
tree2ea6d0c9d47d26f3ff2261b97a6727bc8efb469d /pipelinepreprocess/main.go
parente9587728ff52846cb43e42531b4c4772e3516f13 (diff)
Various improvements to pipelinepreprocess
- Ensure temporary directory already being present isn't an issue - Remove temporary directory when done with it - Ensure any already preprocessed files aren't preprocessed themselves (this could happen in the case of a run stopping half way through)
Diffstat (limited to 'pipelinepreprocess/main.go')
-rw-r--r--pipelinepreprocess/main.go17
1 files changed, 15 insertions, 2 deletions
diff --git a/pipelinepreprocess/main.go b/pipelinepreprocess/main.go
index fd73725..6c58d98 100644
--- a/pipelinepreprocess/main.go
+++ b/pipelinepreprocess/main.go
@@ -9,6 +9,7 @@ import (
"log"
"os"
"path/filepath"
+ "regexp"
"time"
"github.com/aws/aws-sdk-go/aws"
@@ -31,6 +32,7 @@ var verboselog *log.Logger
const HeartbeatTime = 60
const PauseBetweenChecks = 60 * time.Second
+const PreprocPattern = `_bin[0-9].[0-9].png`
// TODO: could restructure like so:
// have the goroutine functions run outside of the main loop in the program,
@@ -127,6 +129,8 @@ func main() {
verboselog = log.New(n, "", log.LstdFlags)
}
+ alreadydone := regexp.MustCompile(PreprocPattern)
+
verboselog.Println("Setting up AWS session")
sess, err := session.NewSession(&aws.Config{
Region: aws.String("eu-west-2"),
@@ -187,7 +191,7 @@ func main() {
d := filepath.Join(os.TempDir(), bookname)
- err = os.Mkdir(d, 0755)
+ err = os.MkdirAll(d, 0755)
if err != nil {
log.Fatalln("Failed to create directory", d, err)
}
@@ -203,12 +207,16 @@ func main() {
go up(upc, done, uploader, bookname)
- verboselog.Println("Getting list of appropriate objects to download")
+ verboselog.Println("Getting list of objects to download")
err = s3svc.ListObjectsV2Pages(&s3.ListObjectsV2Input{
Bucket: aws.String("rescribeinprogress"),
Prefix: aws.String(bookname),
}, func(page *s3.ListObjectsV2Output, last bool) bool {
for _, r := range page.Contents {
+ if alreadydone.MatchString(*r.Key) {
+ verboselog.Println("Skipping item that looks like it has already been processed", *r.Key)
+ continue
+ }
dl <- *r.Key
}
return true
@@ -238,5 +246,10 @@ func main() {
if err != nil {
log.Fatalln("Error deleting message from queue", preqname, ":", err)
}
+
+ err = os.RemoveAll(d)
+ if err != nil {
+ log.Fatalln("Failed to remove directory", d, err)
+ }
}
}