|
|
|
import glob
import os
from collections import Counter
from xml.etree import ElementTree as ET

import html5lib
from fpdf import FPDF
from nltk import FreqDist
from nltk.corpus import stopwords
from PIL import Image
|
|
|
|
|
|
|
|
# English stopword set used by cleanstopwords(); built once at startup.
# (The original also called stopwords.words('english') a first time and
# discarded the result — a redundant duplicate corpus load, removed here.)
sr = set(stopwords.words('english'))
|
|
|
|
|
|
|
|
def cleanstopwords(words, stop=None):
    """Return a copy of *words* with stopwords removed.

    Membership is tested case-insensitively. *stop* defaults to the
    module-level English stopword set ``sr``; passing an explicit set
    makes the function usable (and testable) without NLTK.

    The original iterated the list while calling ``remove`` on a copy
    (O(n^2)) and shadowed the ``list`` builtin; a comprehension gives the
    same result in one pass.
    """
    if stop is None:
        stop = sr  # module-level English stopword set
    return [word for word in words if word.lower() not in stop]
|
|
|
|
|
|
|
|
def findleastcommon(words, threshold=None, out=None):
    """Append the least common words in *words* to *out* and return it.

    A word (lowercased) is "least common" when its frequency is at most
    *threshold*. For backward compatibility *threshold* defaults to the
    module-level ``limit`` and *out* to the module-level
    ``leastcommon_list``, which the original version used implicitly.

    ``collections.Counter`` replaces ``nltk.FreqDist`` — FreqDist is a
    Counter subclass, so ``most_common()`` behaves identically.
    """
    if threshold is None:
        threshold = limit  # set per page by the main loop
    if out is None:
        out = leastcommon_list  # accumulates across pages
    fdist = Counter(word.lower() for word in words)
    for token, freq in fdist.most_common():
        if freq <= threshold:
            out.append(token)
    return out
|
|
|
|
|
|
|
|
def coordinates(attribute):
    """Extract the bounding box from an hOCR 'title' attribute.

    *attribute* looks like ``'bbox x0 y0 x1 y1; x_wconf NN'``; the return
    value is the list of four bbox integers ``[x0, y0, x1, y1]``.

    The original unpacked ``r, c = r.split(";")`` and so raised
    ``ValueError`` unless the title contained exactly one ';'. Taking the
    first ';'-separated field tolerates any number of extra properties.
    """
    bbox_field = attribute.split(";")[0]
    # drop the leading 'bbox' keyword, keep the four numbers
    values = bbox_field.split(" ")[1:]
    return [int(v) for v in values]
|
|
|
|
|
|
|
|
def filternone(word_raw):
    """Return *word_raw* stripped of surrounding punctuation.

    When OCR produced no text (``None``), return the placeholder ``'y'``
    so downstream ``word.lower()`` / ``len(word)`` checks still work.

    Bug fix: the original's else-branch read the *global* ``element.text``
    instead of the ``word_raw`` parameter (it only worked by accident
    because every caller passed ``element.text``). It also set an unused
    ``remove = None``.
    """
    if word_raw is None:
        return 'y'
    return word_raw.strip(',".!:;()')
|
|
|
|
|
|
|
|
x = -1                 # page counter, incremented at the top of the main loop
leastcommon_list = []  # least-common words accumulated across all pages
allwords = []          # every OCR'd word seen so far, across all pages

# Sort both listings so scan N pairs with hocr file N — glob returns files
# in arbitrary order, so the original could match a page with the wrong hocr.
scanimg = sorted(glob.glob('images-tiff/*.tiff'))
hocr = sorted(glob.glob('hocr/*.html'))

# Scales the word-removal threshold proportionally to the number of pages
# scanned; max(..., 1) avoids ZeroDivisionError when the folder is empty.
maximum = 20 / max(len(scanimg), 1)
|
|
|
|
|
|
|
|
# loop through every image in scanimg folder
for i in scanimg:
    x = x + 1
    # threshold grows with the page number so word removal stays
    # proportional to the number of pages scanned so far
    limit = x * maximum

    iim = Image.open(i)  # iim is initial (scanned) image
    oim = Image.new("RGB", iim.size, (255, 255, 255))  # oim is output image

    # open corresponding hocr file; 'with' closes the descriptor again
    # (the original leaked one open file per page)
    with open(hocr[x]) as f:
        print('Reading scanned image, filtering least common words.')
        print('')

        t = html5lib.parse(f, namespaceHTMLElements=False)

    # loop through every word in hocr file to analyse words and find least common
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords)  # clean stopwords
    findleastcommon(clean_words)  # find least common words and add them to list
    print("The least common words until text", x+1, "are:", leastcommon_list)
    print('')
    print('Processing word coordinates and erasing least common words.')
    print('')

    # loop through every word in hocr file to extract coordinates,
    # then remove or paste into output image
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])

        wim = iim.crop(c)  # wim is word image

        if word.lower() in leastcommon_list and len(word) < limit:
            # least common word: blank out its bounding box
            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
        else:
            # common word: copy the original word pixels across
            oim.paste(wim, (c[0], c[1], c[2], c[3]))

    #-------------------------------------------------------------------------------#
    # save the output image, then close the source scan to free its file handle
    n = i.replace("images-tiff/", "output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}erase.jpg".format(n, x))
    iim.close()
|
|
|
|
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------#
|
|
|
|
# save images into PDF
# sorted() so the PDF pages come out in page order — glob alone returns
# files in arbitrary (filesystem-dependent) order
outputs = sorted(glob.glob('output/erase-replace/*erase.jpg'))

print("Saving to PDF: output/erase-replace/Erase.pdf")
|
|
|
|
|
|
|
|
def makePdf(pdfFileName, listPages, dir=''):
    """Assemble the images in *listPages* into a single PDF.

    pdfFileName -- output path without the '.pdf' extension.
    listPages   -- image filenames; the first one sizes every PDF page.
    dir         -- optional directory prefix for all paths.
                   (Name kept for the keyword caller; it does shadow the
                   ``dir`` builtin.)

    Writes ``dir + pdfFileName + '.pdf'``.
    """
    if dir:
        dir += "/"
    # Size the PDF pages from the first image; the 'with' closes it again
    # (the original leaked the open image file handle).
    with Image.open(dir + str(listPages[0])) as cover:
        width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)
    pdf.output(dir + pdfFileName + ".pdf", "F")
|
|
|
|
|
|
|
|
makePdf('output/erase-replace/Erase', outputs, dir = '')  # writes output/erase-replace/Erase.pdf
|
|
|
|
|
|
|
|
# Remove the intermediate per-page JPGs now that the PDF has been written.
files = glob.glob('./output/erase-replace/*erase.jpg')
for leftover in files:
    os.remove(leftover)
|
|
|
|
|