OuNuPo/src/erase_leastcommon.py

import html5lib
from PIL import Image
from nltk import FreqDist
from nltk.corpus import stopwords
import glob
import os
from fpdf import FPDF

sr = set(stopwords.words('english'))  # set of English stopwords, for fast membership tests

def cleanstopwords(wordlist):
    "This cleans stopwords from a list of words"
    clean_words = wordlist[:]  # work on a copy so items can be removed while iterating
    for word in wordlist:
        if word.lower() in sr:
            clean_words.remove(word)
    return clean_words
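
# Hypothetical example: with NLTK's English stopword list,
# cleanstopwords(['the', 'lighthouse', 'and', 'sea']) returns ['lighthouse', 'sea'].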

def findleastcommon(wordlist):
    "This finds the least common words and returns a list"
    # relies on the globals `limit` and `leastcommon_list` set in the main loop below
    fdist = FreqDist(word.lower() for word in wordlist)
    leastcommon = fdist.most_common()  # (word, count) pairs, most frequent first
    for i in leastcommon:
        if i[1] <= limit:  # a word counts as 'least common' if it occurs at most `limit` times
            leastcommon_list.append(i[0])
    return leastcommon_list

def coordinates(attribute):
    "This extracts the box coordinates of words from an hocr / html element tree"
    r = attribute  # the 'title' attribute of an hOCR word span holds its bounding box
    r, c = r.split(";")  # split the attribute into the bbox part and the confidence part
    r = r.split(" ")[1:]  # split again and discard the 'bbox' keyword
    r = [int(x) for x in r]  # put coordinates into a list as integers
    return r
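
# For a typical Tesseract hOCR word such as (hypothetical markup)
#   <span class='ocrx_word' title='bbox 230 885 293 909; x_wconf 93'>sea</span>
# coordinates("bbox 230 885 293 909; x_wconf 93") returns [230, 885, 293, 909].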

def filternone(word_raw):
    "Replaces empty OCR results with a placeholder so the later loops always get a string"
    if word_raw is None:
        word = 'y'  # placeholder for words tesseract could not read
    else:
        word = word_raw.strip(',".!:;()')  # strip surrounding punctuation
    return word

x = -1
leastcommon_list = []
allwords = []
scanimg = sorted(glob.glob('images-tiff/*.tiff'))  # sorted so each scan lines up with its hocr file
hocr = sorted(glob.glob('hocr/*.html'))
maximum = 20 / len(scanimg)  # scales the removal threshold to the number of pages scanned
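# e.g. for 10 scanned pages maximum = 2.0, so on the 4th page (x = 3) limit = 6.0:
# words occurring 6 times or fewer count as least common, and of those only words
# shorter than 6 characters get erased (see the length check in the erase loop below)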

# loop through every image in the scanimg folder
for i in scanimg:
    x = x + 1
    limit = x * maximum  # threshold grows with each page, so later pages lose more words
    iim = Image.open(i)  # iim is the input image
    oim = Image.new("RGB", iim.size, (255, 255, 255))  # oim is the output image, initially blank

    # open and parse the corresponding hocr file
    print('Reading scanned image, filtering least common words.')
    print('')
    with open(hocr[x]) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # loop through every word in the hocr file to analyse words and find the least common
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords)  # clean stopwords
    findleastcommon(clean_words)  # find least common words and add them to leastcommon_list
    print("The least common words until text", x + 1, "are:", leastcommon_list)
    print('')
    print('Processing word coordinates and erasing least common words.')
    print('')

    # loop through every word again to extract coordinates, then erase it or copy it to the output image
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        wim = iim.crop(c)  # wim is the word image
        if word.lower() in leastcommon_list and len(word) < limit:
            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))  # erase: paint the box white
        else:
            oim.paste(wim, (c[0], c[1], c[2], c[3]))  # keep: paste the cropped word back

    #-------------------------------------------------------------------------------#
    # save the output image for this page
    n = i.replace("images-tiff/", "output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}erase.jpg".format(n, x))
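    # e.g. a hypothetical scan 'images-tiff/page1.tiff' on the first pass (x = 0)
    # is saved as 'output/erase-replace/page1-0erase.jpg'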

#-------------------------------------------------------------------------------#
# save images into PDF
outputs = sorted(glob.glob('output/erase-replace/*erase.jpg'))  # sorted so pages stay in order
print("Saving to PDF: output/erase-replace/Erase.pdf")

def makePdf(pdfFileName, listPages, dir=''):
    "Builds a PDF sized to the first image, one page per image"
    if dir:
        dir += "/"
    cover = Image.open(dir + str(listPages[0]))
    width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)
    pdf.output(dir + pdfFileName + ".pdf", "F")

makePdf('output/erase-replace/Erase', outputs, dir='')

# clean up the intermediate jpg files now that they are inside the PDF
files = glob.glob('./output/erase-replace/*erase.jpg')
for f in files:
    os.remove(f)
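
# Usage sketch (an assumption based on the relative paths above, not project docs):
# run from a folder containing images-tiff/ (page scans as .tiff), hocr/ (matching
# Tesseract hOCR .html files, one per scan), and an output/erase-replace/ folder.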