From ced7a8b54710a590c38cbbc9f64dec7920cc27a8 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 1 Oct 2019 19:00:54 +0100 Subject: Add first draft of blog to git --- config.toml | 9 ++++++ content/posts/binarisation-01.md | 64 ++++++++++++++++++++++++++++++++++++++++ content/posts/who-we-are.md | 8 +++++ 3 files changed, 81 insertions(+) create mode 100644 config.toml create mode 100644 content/posts/binarisation-01.md create mode 100644 content/posts/who-we-are.md diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..0015559 --- /dev/null +++ b/config.toml @@ -0,0 +1,9 @@ +baseURL = "http://example.org/" +languageCode = "en-us" +theme = "hyde" +title = "Rescribe Blog" + +[params] +description = "Historical OCR, and all things around it" +themeColor = "theme-rescribe" +# TODO: move away from google-hosted fonts diff --git a/content/posts/binarisation-01.md b/content/posts/binarisation-01.md new file mode 100644 index 0000000..79e33e0 --- /dev/null +++ b/content/posts/binarisation-01.md @@ -0,0 +1,64 @@ +--- +title: "Binarisation 1" +date: 2019-02-11T08:32:42Z +draft: true +--- +Binarisation is the process of turning a colour or grayscale image into +a black and white image. It's called binarisation as once you're done, +each pixel will either be white (0) or black (1), a binary option. +Binarisation is necessary for various types of image analysis, as it +makes various image manipulation tasks much more straightforward. OCR is +one such process, and all major OCR engines today work on binarised +images. + +Binarisation sounds pretty straightforward, and in the ideal case it is. +You can pick a number, and go through each pixel in the image, checking +if the pixel is lighter than the number, and if so declaring it to be +white, otherwise black. + +( INSERT IMAGE DEMONSTRATING THIS ) + +The first issue with this is deciding what number to pick to determine +whether a pixel is white or black. 
This number is called the threshold, +and the whole process of binarisation can also be called thresholding, as +it's so fundamental to the binarising process. Picking a threshold that +is too high will result in too few pixels being marked as black, which +in the case of OCR means losing parts of characters, which will make it +harder for an OCR engine to correctly recognise text. Picking a +threshold that is too low will result in too many pixels being marked as +black, which for OCR means that various non-text noise will be included +and considered by the OCR engine, again reducing accuracy. + +( INSERT IMAGES DEMONSTRATING EACH ) + +If all page images were printed exactly the same way, and scanned the +same way, we could probably get away with just picking an appropriate +threshold number for everything. However sadly that is not the case, and +the variances can be significantly greater for historical documents. + +There are various algorithms to find an appropriate threshold number for +a given page. A particularly well-known and reasonable one is called the +[Otsu algorithm](https://en.wikipedia.org/wiki/Otsu%27s_method). This +works by splitting the pixels in the image into two classes, one for +background and one for foreground, with the threshold calculated to +minimise the "spread" of both classes. Spread here means how much +variation in pixel intensity there is, so by trying to minimise the +spread for each class, the threshold aims to find two clusters of +similar pixel intensities, one being a common background, the other a +common foreground. + +Otsu's algorithm works well for well printed material, on good paper, +which has been well scanned, as the brightness of the background and +foreground pixels is consistent. It works less well for pages which +have been scanned with uneven lighting, as the background brightness +may be quite different for one corner of a page than another. 
It is +also not too good at handling paper or ink inconsistencies, such as +blemishes, splotches or page grain, as they may well have parts +which are darker than the threshold. + +( INSERT IMAGES DEMONSTRATING OTSU FAILING ON BAD LIGHTING AND WITH + SPLOTCHES IN PAGE BEING BLACKENED ) + +Both of these criticisms could be addressed by using an algorithm that +could alter the threshold according to the conditions of the region on +the page. diff --git a/content/posts/who-we-are.md b/content/posts/who-we-are.md new file mode 100644 index 0000000..629bdcf --- /dev/null +++ b/content/posts/who-we-are.md @@ -0,0 +1,8 @@ +--- +title: "Who we are" +date: 2019-02-11T08:28:20Z +draft: true +--- +Rescribe is a research collective focused on improving the state of OCR and related technologies for historical books and documents. Free and open source software is key to the work we do, and we release all the code and training data we can on [github](https://github.com/rescribe). + +We work with a variety of academic and archiving projects to make historical works more accessible, searchable and discoverable, and to enable researchers to work with them and find connections in new ways. -- cgit v1.2.1-24-ge1ad