From 8d7f88c2caba3a40c50a04aff1072218b931e8bc Mon Sep 17 00:00:00 2001
From: Nick White
Date: Fri, 22 May 2020 12:02:48 +0100
Subject: [untested] Use less disk space

There are several ways that disk usage is reduced with this patch:
- Files are deleted as soon as they have been uploaded
- Once a page image has been added to a PDF, immediately delete it

This should allow much larger books to be processed without needing
bigger disks.
---
 cmd/bookpipeline/main.go | 94 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 70 insertions(+), 24 deletions(-)

diff --git a/cmd/bookpipeline/main.go b/cmd/bookpipeline/main.go
index 3b9befb..3f55def 100644
--- a/cmd/bookpipeline/main.go
+++ b/cmd/bookpipeline/main.go
@@ -76,6 +76,10 @@ type Pipeliner interface {
 	Log(v ...interface{})
 }
 
+type pageimg struct {
+	hocr, img string
+}
+
 func download(dl chan string, process chan string, conn Pipeliner, dir string, errc chan error, logger *log.Logger) {
 	for key := range dl {
 		fn := filepath.Join(dir, filepath.Base(key))
@@ -105,6 +109,13 @@ func up(c chan string, done chan bool, conn Pipeliner, bookname string, errc cha
 			errc <- err
 			return
 		}
+		err = os.Remove(path)
+		if err != nil {
+			for range c {
+			} // consume the rest of the receiving channel so it isn't blocked
+			errc <- err
+			return
+		}
 	}
 
 	done <- true
@@ -122,6 +133,13 @@ func upAndQueue(c chan string, done chan bool, toQueue string, conn Pipeliner, b
 			errc <- err
 			return
 		}
+		err = os.Remove(path)
+		if err != nil {
+			for range c {
+			} // consume the rest of the receiving channel so it isn't blocked
+			errc <- err
+			return
+		}
 		logger.Println("Adding", key, training, "to queue", toQueue)
 		err = conn.AddToQueue(toQueue, key+" "+training)
 		if err != nil {
@@ -297,51 +315,89 @@ func analyse(conn Pipeliner) func(chan string, chan string, chan error, *log.Log
 			return
 		}
 		binhascontent, colourhascontent := false, false
+
+		var colourimgs, binimgs []pageimg
+
 		for _, pg := range pgs {
-			var colourfn, binfn string
 			base := filepath.Base(pg)
 			nosuffix := strings.TrimSuffix(base, ".hocr")
 			p := strings.SplitN(base, "_bin", 2)
 
-			binfn = nosuffix + ".png"
+			var fn string
 			if len(p) > 1 {
-				colourfn = p[0] + ".jpg"
+				fn = p[0] + ".jpg"
 			} else {
-				colourfn = nosuffix + ".jpg"
+				fn = nosuffix + ".jpg"
 			}
 
-			logger.Println("Downloading binarised page to add to PDF", binfn)
-			err := conn.Download(conn.WIPStorageId(), bookname+"/"+binfn, filepath.Join(savedir, binfn))
+			binimgs = append(binimgs, pageimg{hocr: pg, img: nosuffix + ".png"})
+			colourimgs = append(colourimgs, pageimg{hocr: pg, img: fn})
+		}
+
+		for _, pg := range binimgs {
+			logger.Println("Downloading binarised page to add to PDF", pg.img)
+			err := conn.Download(conn.WIPStorageId(), bookname+"/"+pg.img, filepath.Join(savedir, pg.img))
 			if err != nil {
-				logger.Println("Download failed; skipping page", binfn)
+				logger.Println("Download failed; skipping page", pg.img)
 			} else {
-				err = binarisedpdf.AddPage(filepath.Join(savedir, binfn), pg, true)
+				err = binarisedpdf.AddPage(filepath.Join(savedir, pg.img), filepath.Join(savedir, pg.hocr), true)
 				if err != nil {
 					close(up)
-					errc <- fmt.Errorf("Failed to add page %s to PDF: %s", binfn, err)
+					errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err)
 					return
 				}
 				binhascontent = true
+				err = os.Remove(filepath.Join(savedir, pg.img))
+				if err != nil {
+					close(up)
+					errc <- err
+					return
+				}
+			}
+		}
+
+		if binhascontent {
+			fn = filepath.Join(savedir, bookname+".binarised.pdf")
+			err = binarisedpdf.Save(fn)
+			if err != nil {
+				close(up)
+				errc <- fmt.Errorf("Failed to save binarised pdf: %s", err)
+				return
 			}
+			up <- fn
+			key := bookname + "/" + bookname + ".binarised.pdf"
+			conn.Log("Uploading", key)
+			err := conn.Upload(conn.WIPStorageId(), key, fn)
+			if err != nil {
+			}
+		}
 
-			logger.Println("Downloading colour page to add to PDF", colourfn)
+		for _, pg := range colourimgs {
+			logger.Println("Downloading colour page to add to PDF", pg.img)
+			colourfn := pg.img
 			err = conn.Download(conn.WIPStorageId(), bookname+"/"+colourfn, filepath.Join(savedir, colourfn))
 			if err != nil {
-				colourfn = strings.Replace(colourfn, ".jpg", ".png", 1)
+				colourfn = strings.Replace(pg.img, ".jpg", ".png", 1)
 				logger.Println("Download failed; trying", colourfn)
 				err = conn.Download(conn.WIPStorageId(), bookname+"/"+colourfn, filepath.Join(savedir, colourfn))
 				if err != nil {
-					logger.Println("Download failed; skipping page", colourfn)
+					logger.Println("Download failed; skipping page", pg.img)
 				}
 			}
 			if err == nil {
-				err = colourpdf.AddPage(filepath.Join(savedir, colourfn), pg, true)
+				err = colourpdf.AddPage(filepath.Join(savedir, colourfn), pg.hocr, true)
 				if err != nil {
 					close(up)
-					errc <- fmt.Errorf("Failed to add page %s to PDF: %s", colourfn, err)
+					errc <- fmt.Errorf("Failed to add page %s to PDF: %s", pg.img, err)
 					return
 				}
 				colourhascontent = true
+				err = os.Remove(filepath.Join(savedir, colourfn))
+				if err != nil {
+					close(up)
+					errc <- err
+					return
+				}
 			}
 		}
 		if colourhascontent {
@@ -354,16 +410,6 @@ func analyse(conn Pipeliner) func(chan string, chan string, chan error, *log.Log
 			}
 			up <- fn
 		}
-		if binhascontent {
-			fn = filepath.Join(savedir, bookname+".binarised.pdf")
-			err = binarisedpdf.Save(fn)
-			if err != nil {
-				close(up)
-				errc <- fmt.Errorf("Failed to save binarised pdf: %s", err)
-				return
-			}
-			up <- fn
-		}
 
 		logger.Println("Creating graph")
 		fn = filepath.Join(savedir, "graph.png")
-- 
cgit v1.2.1-24-ge1ad