|
|
<!DOCTYPE html>
|
|
|
|
|
|
<html lang="en">
|
|
|
<head>
|
|
|
<meta charset="utf-8">
|
|
|
<title>Tasks of the Contingent Librarian</title>
|
|
|
<link rel="stylesheet" type="text/css" href="tasks.css">
|
|
|
<script src="tasks.js"></script>
|
|
|
</head>
|
|
|
<body>
|
|
|
|
|
|
<div class="cardback"><DOCUMENT_FRAGMENT><div class="mw-parser-output"><div class="thumb tright"><div class="thumbinner" style="width:152px;"><a class="image" href="https://pzwiki.wdka.nl/mw-mediadesign/images/thumb/3/3c/Rereferencing_Open_work_OCR.jpeg/720px-Rereferencing_Open_work_OCR.jpeg"><img alt="Larger view of a page from a bootleg copy of The Open Work by Umberto Eco" class="thumbimage" decoding="async" src="https://pzwiki.wdka.nl/mw-mediadesign/images/thumb/3/3c/Rereferencing_Open_work_OCR.jpeg/320px-Rereferencing_Open_work_OCR.jpeg"></a> <div class="thumbcaption"><div class="magnify"><a class="internal" href="File:Rereferencing_Open_work_OCR.jpeg.html" title="Enlarge"></a></div>A bootleg copy of The Open Work by Umberto Eco. OCR software has mistaken the page number (page 80) for the word “So”.</div></div></div>
|
|
|
<h2><span class="mw-headline" id="Pre-processing_for_OCR">Pre-processing for OCR</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/mw-mediadesign/index.php?title=User:Simon/self_directed_research/OCR_preprocessing&amp;action=edit&amp;section=T-1" title="Edit section: Pre-processing for OCR">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
|
|
|
<p>This script applies a transformation (thresholding or median blur) to the image before running OCR, which gives a clearer result:
|
|
|
</p>
|
|
|
<div class="mw-highlight mw-content-ltr" dir="ltr"><pre><span></span><span class="c1"># import the necessary packages</span>
|
|
|
<span class="c1">#from PIL </span>
|
|
|
<span class="kn">import</span> <span class="nn">Image</span>
|
|
|
<span class="kn">import</span> <span class="nn">pytesseract</span>
|
|
|
<span class="kn">import</span> <span class="nn">argparse</span>
|
|
|
<span class="kn">import</span> <span class="nn">cv2</span>
|
|
|
<span class="kn">import</span> <span class="nn">os</span>
|
|
|
|
|
|
<span class="c1"># construct the argument parser and parse the arguments</span>
|
|
|
<span class="n">ap</span> <span class="o">=</span> <span class="n">argparse</span><span class="o">.</span><span class="n">ArgumentParser</span><span class="p">()</span>
|
|
|
<span class="n">ap</span><span class="o">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s2">"-i"</span><span class="p">,</span> <span class="s2">"--image"</span><span class="p">,</span> <span class="n">required</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
|
|
|
<span class="n">help</span><span class="o">=</span><span class="s2">"path to input image to be OCR'd"</span><span class="p">)</span>
|
|
|
<span class="n">ap</span><span class="o">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s2">"-p"</span><span class="p">,</span> <span class="s2">"--preprocess"</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="nb">str</span><span class="p">,</span> <span class="n">default</span><span class="o">=</span><span class="s2">"thresh"</span><span class="p">,</span>
|
|
|
<span class="n">help</span><span class="o">=</span><span class="s2">"type of preprocessing to be done"</span><span class="p">)</span>
|
|
|
<span class="n">args</span> <span class="o">=</span> <span class="nb">vars</span><span class="p">(</span><span class="n">ap</span><span class="o">.</span><span class="n">parse_args</span><span class="p">())</span>
|
|
|
|
|
|
<span class="c1"># load the example image and convert it to grayscale</span>
|
|
|
<span class="n">image</span> <span class="o">=</span> <span class="n">cv2</span><span class="o">.</span><span class="n">imread</span><span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="s2">"image"</span><span class="p">])</span>
|
|
|
<span class="n">gray</span> <span class="o">=</span> <span class="n">cv2</span><span class="o">.</span><span class="n">cvtColor</span><span class="p">(</span><span class="n">image</span><span class="p">,</span> <span class="n">cv2</span><span class="o">.</span><span class="n">COLOR_BGR2GRAY</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># check to see if we should apply thresholding to preprocess the</span>
|
|
|
<span class="c1"># image</span>
|
|
|
<span class="k">if</span> <span class="n">args</span><span class="p">[</span><span class="s2">"preprocess"</span><span class="p">]</span> <span class="o">==</span> <span class="s2">"thresh"</span><span class="p">:</span>
|
|
|
    <span class="n">gray</span> <span class="o">=</span> <span class="n">cv2</span><span class="o">.</span><span class="n">threshold</span><span class="p">(</span><span class="n">gray</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">255</span><span class="p">,</span>
|
|
|
        <span class="n">cv2</span><span class="o">.</span><span class="n">THRESH_BINARY</span> <span class="o">|</span> <span class="n">cv2</span><span class="o">.</span><span class="n">THRESH_OTSU</span><span class="p">)[</span><span class="mi">1</span><span class="p">]</span>
|
|
|
|
|
|
<span class="c1"># make a check to see if median blurring should be done to remove</span>
|
|
|
<span class="c1"># noise</span>
|
|
|
<span class="k">elif</span> <span class="n">args</span><span class="p">[</span><span class="s2">"preprocess"</span><span class="p">]</span> <span class="o">==</span> <span class="s2">"blur"</span><span class="p">:</span>
|
|
|
    <span class="n">gray</span> <span class="o">=</span> <span class="n">cv2</span><span class="o">.</span><span class="n">medianBlur</span><span class="p">(</span><span class="n">gray</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># write the grayscale image to disk as a temporary file so we can</span>
|
|
|
<span class="c1"># apply OCR to it</span>
|
|
|
<span class="n">filename</span> <span class="o">=</span> <span class="s2">"{}.png"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">getpid</span><span class="p">())</span>
|
|
|
<span class="n">cv2</span><span class="o">.</span><span class="n">imwrite</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="n">gray</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># load the image as a PIL/Pillow image, apply OCR, and then delete</span>
|
|
|
<span class="c1"># the temporary file</span>
|
|
|
<span class="n">text</span> <span class="o">=</span> <span class="n">pytesseract</span><span class="o">.</span><span class="n">image_to_string</span><span class="p">(</span><span class="n">Image</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">filename</span><span class="p">))</span>
|
|
|
<span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">filename</span><span class="p">)</span>
|
|
|
<span class="k">print</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># show the output images</span>
|
|
|
<span class="n">cv2</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="s2">"Image"</span><span class="p">,</span> <span class="n">image</span><span class="p">)</span>
|
|
|
<span class="n">cv2</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="s2">"Output"</span><span class="p">,</span> <span class="n">gray</span><span class="p">)</span>
|
|
|
<span class="n">cv2</span><span class="o">.</span><span class="n">waitKey</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
|
|
|
</pre></div>
|
|
|
<!--
|
|
|
NewPP limit report
|
|
|
Cached time: 20200620142346
|
|
|
Cache expiry: 86400
|
|
|
Dynamic content: false
|
|
|
CPU time usage: 0.028 seconds
|
|
|
Real time usage: 0.215 seconds
|
|
|
Preprocessor visited node count: 7/1000000
|
|
|
Preprocessor generated node count: 26/1000000
|
|
|
Post‐expand include size: 194/2097152 bytes
|
|
|
Template argument size: 0/2097152 bytes
|
|
|
Highest expansion depth: 2/40
|
|
|
Expensive parser function count: 0/100
|
|
|
Unstrip recursion depth: 0/20
|
|
|
Unstrip post‐expand size: 6360/5000000 bytes
|
|
|
-->
|
|
|
<!--
|
|
|
Transclusion expansion time report (%,ms,calls,template)
|
|
|
100.00% 191.534 1 User:Simon/self_directed_research/OCR_preprocessing
|
|
|
100.00% 191.534 1 -total
|
|
|
-->
|
|
|
|
|
|
<!-- Saved in parser cache with key wdka_mw_mediadesign-mw_:pcache:idhash:31704-0!canonical and timestamp 20200620142346 and revision id 175214
|
|
|
-->
|
|
|
</div></DOCUMENT_FRAGMENT></div>
|
|
|
|
|
|
</body>
|
|
|
</html>
|