summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2020-05-22 12:02:48 +0100
committerNick White <git@njw.name>2020-05-22 12:02:48 +0100
commit8d7f88c2caba3a40c50a04aff1072218b931e8bc (patch)
treee8dd38ec504abfb3fd09470d60890831c6271d28
parent60af66d9ba77a61e61ef71d02c8e30198c433c87 (diff)
[untested] Use less disk space (branch: minimisedisk)
There are several ways that disk usage is reduced with this patch:
- Files are deleted as soon as they have been uploaded.
- Once a page image has been added to a PDF, it is immediately deleted.
This should allow much larger books to be processed without needing bigger disks.
-rw-r--r--cmd/bookpipeline/main.go94
1 files changed, 70 insertions, 24 deletions
diff --git a/cmd/bookpipeline/main.go b/cmd/bookpipeline/main.go
index 3b9befb..3f55def 100644
--- a/cmd/bookpipeline/main.go
+++ b/cmd/bookpipeline/main.go
@@ -76,6 +76,10 @@ type Pipeliner interface {
Log(v ...interface{})
}
+type pageimg struct {
+ hocr, img string
+}
+
func download(dl chan string, process chan string, conn Pipeliner, dir string, errc chan error, logger *log.Logger) {
for key := range dl {
fn := filepath.Join(dir, filepath.Base(key))
@@ -105,6 +109,13 @@ func up(c chan string, done chan bool, conn Pipeliner, bookname string, errc cha
errc <- err
return
}
+ err = os.Remove(path)
+ if err != nil {
+ for range c {
+ } // consume the rest of the receiving channel so it isn't blocked
+ errc <- err
+ return
+ }
}
done <- true
@@ -122,6 +133,13 @@ func upAndQueue(c chan string, done chan bool, toQueue string, conn Pipeliner, b
errc <- err
return
}
+ err = os.Remove(path)
+ if err != nil {
+ for range c {
+ } // consume the rest of the receiving channel so it isn't blocked
+ errc <- err
+ return
+ }
logger.Println("Adding", key, training, "to queue", toQueue)
err = conn.AddToQueue(toQueue, key+" "+training)
if err != nil {
@@ -297,51 +315,89 @@ func analyse(conn Pipeliner) func(chan string, chan string, chan error, *log.Log
return
}
binhascontent, colourhascontent := false, false
+
+ var colourimgs, binimgs []pageimg
+
for _, pg := range pgs {
- var colourfn, binfn string
base := filepath.Base(pg)
nosuffix := strings.TrimSuffix(base, ".hocr")
p := strings.SplitN(base, "_bin", 2)
- binfn = nosuffix + ".png"
+ var fn string
if len(p) > 1 {
- colourfn = p[0] + ".jpg"
+ fn = p[0] + ".jpg"
} else {
- colourfn = nosuffix + ".jpg"
+ fn = nosuffix + ".jpg"
}
- logger.Println("Downloading binarised page to add to PDF", binfn)
- err := conn.Download(conn.WIPStorageId(), bookname+"/"+binfn, filepath.Join(savedir, binfn))
+ binimgs = append(binimgs, pageimg{hocr: pg, img: nosuffix + ".png"})
+ colourimgs = append(colourimgs, pageimg{hocr: pg, img: fn})
+ }
+
+ for _, pg := range binimgs {
+ logger.Println("Downloading binarised page to add to PDF", pg.img)
+ err := conn.Download(conn.WIPStorageId(), bookname+"/"+pg.img, filepath.Join(savedir, pg.img))
if err != nil {
- logger.Println("Download failed; skipping page", binfn)
+ logger.Println("Download failed; skipping page", pg.img)
} else {
- err = binarisedpdf.AddPage(filepath.Join(savedir, binfn), pg, true)
+ err = binarisedpdf.AddPage(filepath.Join(savedir, pg.img), filepath.Join(savedir, pg.hocr), true)
if err != nil {
close(up)
- errc <- fmt.Errorf("Failed to add page %s to PDF: %s", binfn, err)
+ errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err)
return
}
binhascontent = true
+ err = os.Remove(filepath.Join(savedir, pg.img))
+ if err != nil {
+ close(up)
+ errc <- err
+ return
+ }
+ }
+ }
+
+ if binhascontent {
+ fn = filepath.Join(savedir, bookname+".binarised.pdf")
+ err = binarisedpdf.Save(fn)
+ if err != nil {
+ close(up)
+ errc <- fmt.Errorf("Failed to save binarised pdf: %s", err)
+ return
}
+ up <- fn
+ key := bookname + "/" + bookname + ".binarised.pdf"
+ conn.Log("Uploading", key)
+ err := conn.Upload(conn.WIPStorageId(), key, fn)
+ if err != nil {
+ }
+ }
- logger.Println("Downloading colour page to add to PDF", colourfn)
+ for _, pg := range colourimgs {
+ logger.Println("Downloading colour page to add to PDF", pg.img)
+ colourfn := pg.img
err = conn.Download(conn.WIPStorageId(), bookname+"/"+colourfn, filepath.Join(savedir, colourfn))
if err != nil {
- colourfn = strings.Replace(colourfn, ".jpg", ".png", 1)
+ colourfn = strings.Replace(pg.img, ".jpg", ".png", 1)
logger.Println("Download failed; trying", colourfn)
err = conn.Download(conn.WIPStorageId(), bookname+"/"+colourfn, filepath.Join(savedir, colourfn))
if err != nil {
- logger.Println("Download failed; skipping page", colourfn)
+ logger.Println("Download failed; skipping page", pg.img)
}
}
if err == nil {
- err = colourpdf.AddPage(filepath.Join(savedir, colourfn), pg, true)
+ err = colourpdf.AddPage(filepath.Join(savedir, colourfn), pg.hocr, true)
if err != nil {
close(up)
- errc <- fmt.Errorf("Failed to add page %s to PDF: %s", colourfn, err)
+ errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err)
return
}
colourhascontent = true
+ err = os.Remove(filepath.Join(savedir, colourfn))
+ if err != nil {
+ close(up)
+ errc <- err
+ return
+ }
}
}
if colourhascontent {
@@ -354,16 +410,6 @@ func analyse(conn Pipeliner) func(chan string, chan string, chan error, *log.Log
}
up <- fn
}
- if binhascontent {
- fn = filepath.Join(savedir, bookname+".binarised.pdf")
- err = binarisedpdf.Save(fn)
- if err != nil {
- close(up)
- errc <- fmt.Errorf("Failed to save binarised pdf: %s", err)
- return
- }
- up <- fn
- }
logger.Println("Creating graph")
fn = filepath.Join(savedir, "graph.png")