# Erase: reads scanned pages and their hOCR files, finds the least common
# words, erases them from the page images, and binds the results into a PDF.

import glob
import os

import html5lib
from PIL import Image
from nltk import FreqDist
from nltk.corpus import stopwords
from fpdf import FPDF

sr = set(stopwords.words('english'))


def cleanstopwords(words):
    "This cleans stopwords from a list of words"
    clean_words = words[:]
    for word in words:
        if word.lower() in sr:
            clean_words.remove(word)
    return clean_words


def findleastcommon(words):
    "This finds the words that occur only once and adds them to leastcommon_list"
    fdist = FreqDist(word.lower() for word in words)
    for word, count in fdist.most_common():
        if count <= 1 and word not in leastcommon_list:
            leastcommon_list.append(word)
    return leastcommon_list


def coordinates(attribute):
    "This extracts the box coordinates of a word from the 'title' attribute of its hocr element"
    # a title attribute looks like: "bbox 103 215 194 247; x_wconf 93"
    bbox = attribute.split(";")[0]    # keep only the bbox section
    coords = bbox.split(" ")[1:]      # discard the 'bbox' keyword itself
    return [int(n) for n in coords]   # coordinates as integers: [x1, y1, x2, y2]


def filternone(word_raw):
    "This replaces missing OCR text with a short placeholder and strips punctuation"
    if word_raw is None:
        word = 'y'  # placeholder, so empty elements are kept rather than erased
    else:
        word = word_raw.strip(',".!:;()')
    return word


leastcommon_list = []
allwords = []
scanimg = sorted(glob.glob('images-tiff/*.tiff'))
hocr = sorted(glob.glob('hocr/*.html'))
maximum = 20 / len(scanimg)  # erase words proportionally to the number of pages scanned

# loop through every image in the scanimg folder
for x, i in enumerate(scanimg):
    limit = x * maximum  # the erasure threshold grows with every page
    iim = Image.open(i)  # iim is the input image
    oim = Image.new("RGB", iim.size, (255, 255, 255))  # oim is the output image

    # open the corresponding hocr file
    print('Reading scanned image, filtering least common words.')
    print()
    with open(hocr[x]) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # loop through every word in the hocr file to collect words and find the least common
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords)  # remove stopwords
    findleastcommon(clean_words)  # find the least common words and add them to the list
    print("The least common words until text", x + 1, "are:", leastcommon_list)
    print()
    print('Processing word coordinates and erasing least common words.')
    print()

    # loop through every word again to extract its coordinates,
    # then erase it or paste it into the output image
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        wim = iim.crop(c)  # wim is the word image
        if word.lower() in leastcommon_list and len(word) < limit:
            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))  # erase the word
        else:
            oim.paste(wim, (c[0], c[1], c[2], c[3]))  # keep the word

    #-------------------------------------------------------------------------------#
    # save the output image
    n = i.replace("images-tiff/", "output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}erase.jpg".format(n, x))

#-------------------------------------------------------------------------------#
# save the images into a PDF
outputs = sorted(glob.glob('output/erase-replace/*erase.jpg'))
print("Saving to PDF: output/erase-replace/Erase.pdf")


def makePdf(pdfFileName, listPages, dir=''):
    if dir:
        dir += "/"
    cover = Image.open(dir + str(listPages[0]))
    width, height = cover.size  # every PDF page takes the dimensions of the first image
    pdf = FPDF(unit="pt", format=[width, height])
    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)
    pdf.output(dir + pdfFileName + ".pdf", "F")


makePdf('output/erase-replace/Erase', outputs, dir='')

# clean up the intermediate jpg files
files = glob.glob('./output/erase-replace/*erase.jpg')
for f in files:
    os.remove(f)

# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
# 0. You just DO WHAT THE FUCK YOU WANT TO.
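#-------------------------------------------------------------------------------#
# A minimal usage sketch for makePdf on its own (the file names below are
# hypothetical). FPDF is created with the dimensions of the first image, so
# every page of the resulting PDF keeps that size:
#
#   makePdf('preview', ['page-0erase.jpg', 'page-1erase.jpg'], dir='output/erase-replace')
#
# which writes output/erase-replace/preview.pdf.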
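#-------------------------------------------------------------------------------#
# Assumed setup (not part of the script): the NLTK stopword corpus must be
# downloaded once before running, e.g.
#
#   python -c "import nltk; nltk.download('stopwords')"
#
# and the hocr/ folder is expected to hold Tesseract hOCR output for the
# scans in images-tiff/, produced with something like:
#
#   tesseract images-tiff/page-01.tiff hocr/page-01 hocr
#
# (newer Tesseract versions write a .hocr extension; rename the files to
# .html or adjust the glob pattern accordingly).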