import html5lib
from PIL import Image
from nltk import FreqDist
from nltk.corpus import stopwords
import glob
import os
from fpdf import FPDF
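
# This script pairs scanned pages (TIFF) with their hOCR files, collects every
# recognised word, filters out stopwords, and finds the least common words up
# to each page. Those words are erased from a white copy of each scan, the
# remaining word images are pasted back in place, and the pages are bundled
# into a single PDF.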
sr = set(stopwords.words('english'))  # requires the NLTK stopwords corpus: nltk.download('stopwords')
def cleanstopwords(words):
    "This cleans stopwords from a list of words"
    clean_words = words[:]
    for word in words:
        if word.lower() in sr:
            clean_words.remove(word)
    return clean_words
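
# For example, cleanstopwords(['the', 'quick', 'fox']) returns ['quick', 'fox'],
# since 'the' is in NLTK's English stopword list.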
def findleastcommon(words):
    "This finds the least common words and returns a list"
    fdist = FreqDist(word.lower() for word in words)
    leastcommon = fdist.most_common()
    for word, count in leastcommon:
        if count <= limit:
            leastcommon_list.append(word)
    return leastcommon_list
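
# Note: findleastcommon reads the module-level 'limit' (set per page in the
# main loop below) and appends to the shared 'leastcommon_list', so the list
# accumulates across pages.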
def coordinates(attribute):
    "This extracts the box coordinates of a word from an hocr / html element tree"
    r = attribute  # the 'title' attribute of the hOCR word element
    r, c = r.split(";")  # split the attribute into the bbox part and the rest
    r = r.split(" ")[1:]  # drop the leading 'bbox' keyword
    r = [int(x) for x in r]  # put the coordinates into a list as integers
    return r
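
# Example, with illustrative values in the hOCR 'title' format that Tesseract
# writes for ocrx_word elements:
#   coordinates("bbox 409 336 532 371; x_wconf 95") -> [409, 336, 532, 371]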
def filternone(word_raw):
    "This replaces missing words with a placeholder and strips punctuation"
    if word_raw is None:
        word = 'y'  # placeholder so later string operations don't fail on None
    else:
        word = word_raw.strip(',".!:;()')
    return word
x = -1
leastcommon_list = []
allwords = []

scanimg = sorted(glob.glob('images-tiff/*.tiff'))  # sorted so each scan lines up with its hocr file
hocr = sorted(glob.glob('hocr/*.html'))
maximum = 20 / len(scanimg)  # scales the per-page threshold to the number of pages scanned
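
# Since limit = x * maximum and maximum = 20 / len(scanimg), the threshold
# climbs from 0 on the first page towards roughly 20 on the last, so each page
# loses more of its least common words than the one before.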
# loop through every image in the scanimg folder
for i in scanimg:
    x = x + 1
    limit = x * maximum  # threshold used to pick the least common words (and cap erased word length)
    iim = Image.open(i)  # iim is the initial image
    oim = Image.new("RGB", iim.size, (255, 255, 255))  # oim is the output image

    # parse the corresponding hocr file
    print('Reading scanned image, filtering least common words.')
    print('')
    with open(hocr[x]) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)
    # loop through every word in the hocr file to collect words for the frequency count
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords)  # clean stopwords
    findleastcommon(clean_words)  # find the least common words and add them to leastcommon_list

    print("The least common words up to text", x + 1, "are:", leastcommon_list)
    print('')
    print('Processing word coordinates and erasing least common words.')
    print('')
    # loop through every word again to extract its coordinates, then erase it
    # or paste it into the output image
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        wim = iim.crop(c)  # wim is the word image
        if word.lower() in leastcommon_list and len(word) < limit:
            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))  # erase: leave white space
        else:
            oim.paste(wim, (c[0], c[1], c[2], c[3]))  # keep: paste the word image back
    #-------------------------------------------------------------------------------#
    # save the output image for this page
    n = i.replace("images-tiff/", "output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}erase.jpg".format(n, x))

#-------------------------------------------------------------------------------#
# collect the saved images into a PDF
outputs = sorted(glob.glob('output/erase-replace/*erase.jpg'))
print("Saving to PDF: output/erase-replace/Erase.pdf")
def makePdf(pdfFileName, listPages, directory=''):
    "This bundles a list of images into a single PDF, one image per page"
    if directory:
        directory += "/"
    cover = Image.open(directory + str(listPages[0]))
    width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for page in listPages:
        pdf.add_page()
        pdf.image(directory + str(page), 0, 0)
    pdf.output(directory + pdfFileName + ".pdf", "F")
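
# Note: every page of the PDF takes its size (in points) from the first image,
# so the scans are assumed to share the same dimensions.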
makePdf('output/erase-replace/Erase', outputs)

# clean up the intermediate jpg files now that they are in the PDF
files = glob.glob('./output/erase-replace/*erase.jpg')
for f in files:
    os.remove(f)