diff --git a/src/erase_leastcommon.py b/src/erase_leastcommon.py
new file mode 100644
index 0000000..3e7b69b
--- /dev/null
+++ b/src/erase_leastcommon.py
@@ -0,0 +1,117 @@
+import html5lib
+from xml.etree import ElementTree as ET
+from PIL import Image
+from nltk import FreqDist
+from nltk.corpus import stopwords
+import glob
+import os
+from fpdf import FPDF
+
+sr = set(stopwords.words('english'))
+
+def cleanstopwords(words):
+    "This cleans stopwords from a list of words and returns a new list"
+    return [word for word in words if word.lower() not in sr]
+
+def findleastcommon(words):
+    "This adds the words seen at most `limit` times to leastcommon_list and returns that list"
+    fdist = FreqDist(word.lower() for word in words)
+    for word, count in fdist.most_common():
+        # allwords keeps growing page after page, so skip words that are already recorded
+        if count <= limit and word not in leastcommon_list:
+            leastcommon_list.append(word)
+    return leastcommon_list
+
+def coordinates(attribute):
+    "This extracts the box coordinates of a word from the 'title' attribute of an hocr element"
+    # only the first ';' section is used (expected to look like "bbox x0 y0 x1 y1"),
+    # however many other sections follow it
+    bbox = attribute.split(";")[0]
+    # discard the leading label and put the coordinates into a list as integers
+    return [int(value) for value in bbox.split()[1:]]
+
+def filternone(word_raw):
+    "This strips punctuation from a word, or returns the placeholder 'y' if there is no text"
+    if word_raw is None:
+        return 'y'
+    return word_raw.strip(',".!:;()')
+
+leastcommon_list = []
+allwords = []
+# sorted so that the n-th image is paired with the n-th hocr file (glob order is arbitrary)
+scanimg = sorted(glob.glob('images-tiff/*.tiff'))
+hocr = sorted(glob.glob('hocr/*.html'))
+
+if not scanimg or len(hocr) < len(scanimg):
+    raise SystemExit("Need at least one image in images-tiff/ and one hocr file per image in hocr/")
+
+maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
+
+# loop through every image in scanimg folder
+for x, i in enumerate(scanimg):
+    limit = x * maximum # this helps the script remove words in a way that is proportional to number of pages scanned
+    iim = Image.open(i) # iim is initial image
+    oim = Image.new("RGB", iim.size, (255, 255, 255)) # oim is output image
+
+    # open corresponding hocr file
+    print("Analysing", hocr[x])
+    with open(hocr[x]) as f:
+        print('Reading scanned image, filtering least common words.')
+        print("")
+        t = html5lib.parse(f, namespaceHTMLElements=False)
+
+    word_elements = t.findall(".//span[@class='ocrx_word']")
+
+    # loop through every word in hocr file to analyse words and find least common
+    for element in word_elements:
+        allwords.append(filternone(element.text))
+
+    clean_words = cleanstopwords(allwords) # clean stopwords
+    findleastcommon(clean_words) # find least common words and add them to list
+    print("The least common words until text", x+1, "are:", leastcommon_list)
+
+    # loop through every word in hocr file to extract coordinates, then remove or paste into output image
+    for element in word_elements:
+        word = filternone(element.text)
+        c = coordinates(element.attrib['title'])
+
+        if word.lower() in leastcommon_list and len(word) < limit:
+            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
+            print('Excluding:', word)
+        else:
+            wim = iim.crop(c) # wim is word image
+            oim.paste(wim, (c[0], c[1], c[2], c[3]))
+            print('Including:', word)
+
+    #-------------------------------------------------------------------------------#
+    # save the output image
+    n = i.replace("images-tiff/","output/erase-replace/").replace(".tiff", "")
+    oim.save("{}-{}erase.jpg".format(n, x))
+    iim.close()
+
+#-------------------------------------------------------------------------------#
+# save images into PDF
+outputs = sorted(glob.glob('output/erase-replace/*erase.jpg')) # sorted so the pages do not follow the arbitrary glob order
+print("Saving to PDF:", outputs)
+
+def makePdf(pdfFileName, listPages, dir = ''):
+    "This writes the given images into one PDF, one image per page, with the page size of the first image"
+    if (dir):
+        dir += "/"
+
+    with Image.open(dir + str(listPages[0])) as cover:
+        width, height = cover.size
+    pdf = FPDF(unit = "pt", format = [width, height])
+
+    for page in listPages:
+        pdf.add_page()
+        pdf.image(dir + str(page), 0, 0)
+
+    pdf.output(dir + pdfFileName + ".pdf", "F")
+
+makePdf('output/erase-replace/Erase', outputs, dir = '')
+
+# clean up previous jpg files
+files = glob.glob('./output/erase-replace/*erase.jpg')
+for f in files:
+    os.remove(f)
diff --git a/src/replace_leastcommon.py b/src/replace_leastcommon.py
new file mode 100644
index 0000000..b596d2a
--- /dev/null
+++ b/src/replace_leastcommon.py
@@ -0,0 +1,180 @@
+import html5lib
+from xml.etree import ElementTree as ET
+from PIL import Image
+from nltk import FreqDist
+from nltk.corpus import stopwords
+import random
+import glob
+import time
+from fpdf import FPDF
+import os
+
+sr = set(stopwords.words('english'))
+
+def cleanstopwords(words):
+    "This cleans stopwords from a list of words and returns a new list"
+    return [word for word in words if word.lower() not in sr]
+
+def findmostcommon(words, count):
+    "This finds the `count` most common words and returns them as a list"
+    fdist = FreqDist(word.lower() for word in words)
+    return [word for word, _ in fdist.most_common(count)]
+
+def findleastcommon(words):
+    "This adds the words seen at most `limit` times to leastcommon_list and returns that list"
+    fdist = FreqDist(word.lower() for word in words)
+    for word, count in fdist.most_common():
+        # allwords keeps growing page after page, so skip words that are already recorded
+        if count <= limit and word not in leastcommon_list:
+            leastcommon_list.append(word)
+    return leastcommon_list
+
+def coordinates(attribute):
+    "This extracts the box coordinates of a word from the 'title' attribute of an hocr element"
+    # only the first ';' section is used (expected to look like "bbox x0 y0 x1 y1"),
+    # however many other sections follow it
+    bbox = attribute.split(";")[0]
+    # discard the leading label and put the coordinates into a list as integers
+    return [int(value) for value in bbox.split()[1:]]
+
+def filternone(word_raw):
+    "This strips punctuation from a word, or returns the placeholder 'y' if there is no text"
+    if word_raw is None:
+        return 'y'
+    return word_raw.strip(',".!:;()')
+
+def randomhighlight(folder):
+    "This picks a random saved crop from a crops folder and tints it yellow, or returns None if the folder is empty"
+    crops = glob.glob('./output/erase-replace/{}/*.png'.format(folder))
+    if not crops:
+        return None
+    with Image.open(random.choice(crops)) as crop:
+        wimreplace = crop.convert('RGBA') # alpha_composite only accepts RGBA images
+    wimcolor = Image.new('RGBA', wimreplace.size, (250, 230, 0, 90))
+    return Image.alpha_composite(wimreplace, wimcolor)
+
+leastcommon_list = []
+allwords = []
+# sorted so that the n-th image is paired with the n-th hocr file (glob order is arbitrary)
+scanimg = sorted(glob.glob('images-tiff/*.tiff'))
+hocr = sorted(glob.glob('hocr/*.html'))
+num = 0
+
+if not scanimg or len(hocr) < len(scanimg):
+    raise SystemExit("Need at least one image in images-tiff/ and one hocr file per image in hocr/")
+
+maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
+
+# loop through every image in scanimg folder
+for x, i in enumerate(scanimg):
+    limit = 15 - (x * maximum) # this helps the script remove words in a way that is proportional to number of pages scanned
+    iim = Image.open(i) # iim is initial image
+    oim = Image.new("RGB", iim.size, (255, 255, 255)) # oim is output image
+
+    # open corresponding hocr file
+    print("Analysing", hocr[x])
+    with open(hocr[x]) as f:
+        print('Reading scanned image, filtering least common words.')
+        print("")
+        t = html5lib.parse(f, namespaceHTMLElements=False)
+
+    word_elements = t.findall(".//span[@class='ocrx_word']")
+
+    # loop through every word in hocr file to analyse words and find least common
+    for element in word_elements:
+        allwords.append(filternone(element.text))
+
+    clean_words = cleanstopwords(allwords) # clean stopwords
+    findleastcommon(clean_words) # find least common words and add them to list
+    mostcommon_list = findmostcommon(clean_words, 30) # find most common words and add them to list
+
+    print("The most common words until text", x+1, "are:", mostcommon_list)
+
+    # loop through every word in hocr file to extract coordinates, then remove or paste into output image
+    for element in word_elements:
+        word = filternone(element.text)
+        c = coordinates(element.attrib['title'])
+        num = num + 1
+
+        wim = iim.crop(c) # wim is word image
+
+        # save crops of the most common words, sorted into folders by word length
+        if word.lower() in mostcommon_list:
+            if 1 < len(word) <= 5:
+                wim.save("output/erase-replace/crops4/wimreplace{}.png".format(num))
+            elif len(word) <= 7:
+                wim.save("output/erase-replace/crops7/wimreplace{}.png".format(num))
+            else:
+                wim.save("output/erase-replace/crops_more/wimreplace{}.png".format(num))
+
+        israre = x > 0 and word.lower() in leastcommon_list
+
+        if israre and len(word) <= limit:
+            # short enough to stay: paste the original word (this case prints nothing)
+            oim.paste(wim, (c[0], c[1], c[2], c[3]))
+
+        elif israre:
+            # pick the crops folder by word length
+            if len(word) < 8:
+                out = randomhighlight('crops4')
+            elif len(word) < 11:
+                out = randomhighlight('crops7')
+            else:
+                out = randomhighlight('crops_more')
+
+            if out is None:
+                # no crop of that size has been saved yet, so the original word stays
+                oim.paste(wim, (c[0], c[1], c[2], c[3]))
+                print('Including:', word)
+            else:
+                oim.paste(out, (c[0], c[1]))
+                print('Excluding:', word)
+
+        else:
+            oim.paste(wim, (c[0], c[1], c[2], c[3]))
+            print('Including:', word)
+
+    #-------------------------------------------------------------------------------#
+    # save images
+    n = i.replace("images-tiff/","output/erase-replace/").replace(".tiff", "")
+    oim.save("{}-{}replace.jpg".format(n, x))
+    iim.close()
+
+#-------------------------------------------------------------------------------#
+# save images into PDF
+outputs = sorted(glob.glob('output/erase-replace/*replace.jpg')) # sorted so the pages do not follow the arbitrary glob order
+print("Saving to PDF:", outputs)
+
+def makePdf(pdfFileName, listPages, dir = ''):
+    "This writes the given images into one PDF, one image per page, with the page size of the first image"
+    if (dir):
+        dir += "/"
+
+    with Image.open(dir + str(listPages[0])) as cover:
+        width, height = cover.size
+    pdf = FPDF(unit = "pt", format = [width, height])
+
+    for page in listPages:
+        pdf.add_page()
+        pdf.image(dir + str(page), 0, 0)
+
+    pdf.output(dir + pdfFileName + ".pdf", "F")
+
+makePdf('output/erase-replace/Replace', outputs, dir = '')
+
+# clean up previous jpg and png files
+files = glob.glob('./output/erase-replace/*replace.jpg')
+for f in files:
+    os.remove(f)
+
+files = glob.glob('./output/erase-replace/crops4/*.png')
+for f in files:
+    os.remove(f)
+
+files = glob.glob('./output/erase-replace/crops7/*.png')
+for f in files:
+    os.remove(f)
+
+files = glob.glob('./output/erase-replace/crops_more/*.png')
+for f in files:
+    os.remove(f)