"""Erase-and-replace OCR collage script.

For each scanned TIFF page in ``images-tiff/`` and its matching hOCR file in
``hocr/``, build cumulative most-/least-common word lists, then rebuild the
page: least-common words are replaced with random yellow-tinted crops of
most-common words of similar length; all other words are pasted through
unchanged.  The per-page JPEGs are finally bundled into
``output/erase-replace/Replace.pdf`` and intermediates are deleted.
"""

import glob
import os
import random
import shutil
import time                              # kept from original (currently unused)
from xml.etree import ElementTree as ET  # kept from original (currently unused)

import html5lib
from fpdf import FPDF
from nltk import FreqDist
from nltk.corpus import stopwords
from PIL import Image

# Working directories for word crops, bucketed by word length.
# Bug fix: the original makedirs calls crashed on re-run if the dirs existed.
for d in ('./temp/crops4', './temp/crops7', './temp/crops_more'):
    os.makedirs(d, exist_ok=True)

# Build the stopword set once (the original called stopwords.words() twice,
# discarding the first result).
sr = set(stopwords.words('english'))


def cleanstopwords(words):
    """Return a copy of *words* with English stopwords removed.

    (Parameter renamed from ``list``, which shadowed the builtin.)
    """
    return [w for w in words if w.lower() not in sr]


def findmostcommon(words, n):
    """Return the *n* most frequent words in *words*, lower-cased.

    (Parameters renamed from ``list``/``int``, which shadowed builtins.)
    """
    fdist = FreqDist(w.lower() for w in words)
    return [w for w, _count in fdist.most_common(n)]


def findleastcommon(words):
    """Append every word occurring at most once to the global leastcommon_list.

    NOTE: accumulates across pages by design — the list is never cleared.
    """
    fdist = FreqDist(w.lower() for w in words)
    for w, count in fdist.most_common():
        if count <= 1:
            leastcommon_list.append(w)
    return leastcommon_list


def coordinates(attribute):
    """Extract ``[x0, y0, x1, y1]`` from an hOCR span's ``title`` attribute.

    The attribute looks like ``"bbox 1 2 3 4; x_wconf 90"``: keep the bbox
    section, drop the leading keyword, and convert the rest to ints.
    """
    bbox_section = attribute.split(";")[0]
    return [int(v) for v in bbox_section.split(" ")[1:]]


def filternone(word_raw):
    """Return *word_raw* stripped of punctuation, or ``'y'`` when empty.

    Bug fix: the original body read the loop global ``element.text`` instead
    of its own argument; every call site passes ``element.text``, so behavior
    is unchanged but the function no longer depends on an ambient global.
    """
    if word_raw is None:
        return 'y'  # placeholder so downstream length checks still work
    return word_raw.strip(',".!:;()')


leastcommon_list = []
allwords = []
num = 0

scanimg = sorted(glob.glob('images-tiff/*.tiff'))
hocr = sorted(glob.glob('hocr/*.html'))
# Removal rate proportional to the number of pages scanned.
# (The original also computed an unused per-page `limit` from this; removed.)
maximum = 20 / len(scanimg)

# Bug fix: the original assumed this directory existed before saving into it.
os.makedirs('output/erase-replace', exist_ok=True)

for x, i in enumerate(scanimg):
    iim = Image.open(i)                                  # initial (input) image
    oim = Image.new("RGB", iim.size, (255, 255, 255))    # output image

    # Parse the matching hOCR file; the original leaked this file handle.
    with open(hocr[x]) as f:
        print('Reading scanned image and hocr file, filtering least common words.')
        print('')
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # Pass 1: gather every word so the common/uncommon lists stay cumulative.
    for element in t.findall(".//span[@class='ocrx_word']"):
        allwords.append(filternone(element.text))

    clean_words = cleanstopwords(allwords)
    findleastcommon(clean_words)
    mostcommon_list = findmostcommon(clean_words, 30)

    print('The most common words until text', x + 1, 'are:', mostcommon_list)
    print('The least common words until text', x + 1, 'are:', leastcommon_list)
    print('')

    # Pass 2: crop each word and either bank it (most common) or replace it.
    print('Processing word coordinates and replacing least common words with most common words.')
    print('')
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        num += 1
        wim = iim.crop(c)  # cropped word image

        # Bank crops of most-common words, bucketed by length, for reuse.
        # NOTE(review): only the first branch lower-cases `word`, so the other
        # two miss capitalized words — kept as-is to preserve behavior.
        if word.lower() in mostcommon_list and 1 < len(word) <= 5:
            wim.save("temp/crops4/wimreplace{}.png".format(num))
        elif word in mostcommon_list and len(word) <= 7:
            wim.save("temp/crops7/wimreplace{}.png".format(num))
        elif word in mostcommon_list and len(word) > 7:
            wim.save("temp/crops_more/wimreplace{}.png".format(num))

        if x > 0:
            # Pick a random banked crop per bucket and tint it yellow.
            # NOTE(review): random.choice raises IndexError if a bucket is
            # still empty after page 1 — unchanged from the original.
            wimreplace4 = Image.open(random.choice(glob.glob('temp/crops4/*.png')))
            wimreplace7 = Image.open(random.choice(glob.glob('temp/crops7/*.png')))
            wimreplace_more = Image.open(random.choice(glob.glob('temp/crops_more/*.png')))

            highlight = (250, 230, 0, 90)  # translucent yellow overlay
            out4 = Image.alpha_composite(
                wimreplace4.convert('RGBA'),
                Image.new('RGBA', wimreplace4.size, highlight))
            out7 = Image.alpha_composite(
                wimreplace7.convert('RGBA'),
                Image.new('RGBA', wimreplace7.size, highlight))
            out_more = Image.alpha_composite(
                wimreplace_more.convert('RGBA'),
                Image.new('RGBA', wimreplace_more.size, highlight))

            # Least-common words get replaced by a banked crop sized to their
            # length bucket; everything else is copied through unchanged.
            if word.lower() in leastcommon_list and len(word) <= 3:
                oim.paste(wim, (c[0], c[1], c[2], c[3]))
            elif word.lower() in leastcommon_list and len(word) < 8:
                oim.paste(out4, (c[0], c[1]))
            elif word.lower() in leastcommon_list and len(word) < 11:
                oim.paste(out7, (c[0], c[1]))
            elif word.lower() in leastcommon_list and len(word) > 8:
                oim.paste(out_more, (c[0], c[1]))
            else:
                oim.paste(wim, (c[0], c[1], c[2], c[3]))
        else:
            # First page: nothing is banked yet, so copy every word through.
            oim.paste(wim, (c[0], c[1], c[2], c[3]))

    # ------------------------------------------------------------------- #
    # Save the rebuilt page.
    n = i.replace("images-tiff/", "output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}replace.jpg".format(n, x))

# ----------------------------------------------------------------------- #
# Bundle the per-page JPEGs into a single PDF.
outputs = glob.glob('output/erase-replace/*replace.jpg')
print('')
print("Saving to PDF: output/erase-replace/Replace.pdf")


def makePdf(pdfFileName, listPages, dir=''):
    """Write *listPages* (image paths) into ``<dir><pdfFileName>.pdf``.

    Every PDF page is sized to the first image's pixel dimensions.
    (Parameter name ``dir`` shadows a builtin but is kept for interface
    compatibility.)
    """
    if dir:
        dir += "/"
    cover = Image.open(dir + str(listPages[0]))
    width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)
    pdf.output(dir + pdfFileName + ".pdf", "F")


makePdf('output/erase-replace/Replace', outputs, dir='')

# Clean up the intermediate JPEGs and the temp crop folders.
for fpath in glob.glob('./output/erase-replace/*replace.jpg'):
    os.remove(fpath)
shutil.rmtree('./temp/')

# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
# 0. You just DO WHAT THE FUCK YOU WANT TO.