added erase & replace scripts to src/
parent
d03d52a098
commit
adaad8b12e
@ -0,0 +1,123 @@
|
|||||||
|
import html5lib
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
from PIL import Image
|
||||||
|
from nltk import FreqDist
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
from fpdf import FPDF
|
||||||
|
|
||||||
|
# Build the English stopword set once; set membership tests are O(1).
# Fix: the original called stopwords.words('english') twice and discarded the
# first result — a single call is enough (it also triggers the lazy corpus load).
sr = set(stopwords.words('english'))
|
||||||
|
|
||||||
|
def cleanstopwords(list):
    """Return a copy of *list* with all English stopwords removed.

    Comparison is case-insensitive: a word is dropped when its lowercase
    form appears in the module-level stopword set ``sr``.
    """
    return [entry for entry in list if entry.lower() not in sr]
|
||||||
|
|
||||||
|
def findleastcommon(list):
    """Collect the least common words of *list* into the global list.

    Every lower-cased word whose frequency is at most the module-level
    ``limit`` is appended to the module-level ``leastcommon_list``, which
    is also returned (it accumulates across calls).
    """
    frequencies = FreqDist(entry.lower() for entry in list)
    for token, count in frequencies.most_common():
        if count <= limit:
            leastcommon_list.append(token)
    return leastcommon_list
|
||||||
|
|
||||||
|
def coordinates(attribute):
    """Extract a word's bounding box from an hOCR 'title' attribute.

    attribute -- the title string of an ocrx_word span, e.g.
                 "bbox 425 823 525 851; x_wconf 90"

    Returns the four box coordinates [x1, y1, x2, y2] as integers.

    Fix: the original ``r, c = r.split(";")`` raised ValueError whenever the
    title contained zero or more than one ';' (hOCR titles may carry several
    ';'-separated properties); only the leading "bbox ..." section matters.
    """
    bbox_section = attribute.split(";")[0]   # keep only "bbox x1 y1 x2 y2"
    parts = bbox_section.split(" ")[1:]      # drop the "bbox" keyword
    return [int(p) for p in parts]
|
||||||
|
|
||||||
|
def filternone(word_raw):
    """Normalise a raw OCR word.

    word_raw -- the text of an ocrx_word span, possibly None.

    Returns the placeholder 'y' when word_raw is None (so later ``len()`` /
    ``.lower()`` calls don't crash), otherwise word_raw with surrounding
    punctuation ,".!:;() stripped.

    Fix: the original else-branch read the *global* ``element.text`` instead
    of the parameter — it only worked because every caller happened to pass
    element.text.  The dead ``remove = None`` assignment is gone too.
    """
    if word_raw is None:
        return 'y'
    return word_raw.strip(',".!:;()')
|
||||||
|
|
||||||
|
# --- script state -----------------------------------------------------------
x = -1                      # page counter; starts at -1 so the first iteration is page 0
leastcommon_list = []       # filled by findleastcommon(); accumulates across pages
allwords = []               # every OCR'd word seen so far; accumulates across pages
scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html')
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned

# loop through every image in scanimg folder
# NOTE(review): pairs scanimg[x] with hocr[x] by glob position — glob order is
# filesystem-dependent; confirm both lists sort identically (or sort them).
for i in scanimg:
    x = x + 1
    limit = x * maximum # this helps the script remove words in a way that is proportional to number of pages scanned
    # NOTE(review): limit is 0 for the first page, so page 0 excludes nothing —
    # presumably intentional (erasure grows page by page); confirm.
    iim = Image.open(i) # iim is initial image
    oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image

    # open corresponding hocr file
    print ("Analysing", hocr[x])
    f = open(hocr[x])   # NOTE(review): never closed; handle leaks until GC
    print ('Reading scanned image, filtering least common words.')
    print ("")

    t = html5lib.parse(f, namespaceHTMLElements=False)

    # loop through every word in hocr file to analyse words and find least common
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords) #clean stopwords
    findleastcommon(clean_words) #find least common words and add them to list
    print ("The least common words until text", x+1, "are:", leastcommon_list)

    # loop through every word in hocr file to extract coordinates, then remove or paste into output image
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])

        wim = iim.crop(c) # wim is word image

        if word.lower() in leastcommon_list and len(word) < limit:
            # erase: cover the word's bounding box with white
            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
            print ('Excluding:', word)

        else:
            # keep: paste the cropped word image back at its original position
            oim.paste(wim, (c[0], c[1], c[2], c[3]))
            print ('Including:', word)

    #-------------------------------------------------------------------------------#
    # save and show images
    n = i.replace("images-tiff/","output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}erase.jpg".format(n, x))


#-------------------------------------------------------------------------------#
# save images into PDF
outputs = glob.glob('output/erase-replace/*erase.jpg')
print ("Saving to PDF:", outputs)
|
||||||
|
|
||||||
|
def makePdf(pdfFileName, listPages, dir = ''):
    """Combine the image files in listPages into a single PDF.

    pdfFileName -- output file name without the ".pdf" extension
    listPages   -- iterable of image paths, one image per PDF page
    dir         -- optional directory prefix applied to both inputs and output

    Fix: the original left the cover image's file handle open; a context
    manager closes it as soon as the page size has been read.
    """
    if (dir):
        dir += "/"

    # The first image's pixel size defines the PDF page format (1 pt per px).
    with Image.open(dir + str(listPages[0])) as cover:
        width, height = cover.size
    pdf = FPDF(unit = "pt", format = [width, height])

    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)

    pdf.output(dir + pdfFileName + ".pdf", "F")
|
||||||
|
|
||||||
|
makePdf('output/erase-replace/Erase', outputs, dir = '')

# Remove the intermediate per-page jpgs now that the PDF contains them.
files = glob.glob('./output/erase-replace/*erase.jpg')
for leftover in files:
    os.remove(leftover)
|
||||||
|
|
@ -0,0 +1,192 @@
|
|||||||
|
import html5lib
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
from PIL import Image
|
||||||
|
from nltk import FreqDist
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
import random
|
||||||
|
import glob
|
||||||
|
import time
|
||||||
|
from fpdf import FPDF
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Build the English stopword set once; set membership tests are O(1).
# Fix: the original called stopwords.words('english') twice and discarded the
# first result — a single call is enough (it also triggers the lazy corpus load).
sr = set(stopwords.words('english'))
|
||||||
|
|
||||||
|
def cleanstopwords(list):
    """Return a copy of *list* without English stopwords.

    A word is discarded when its lowercase form is found in the
    module-level stopword set ``sr``; everything else keeps its order.
    """
    return [entry for entry in list if entry.lower() not in sr]
|
||||||
|
|
||||||
|
def findmostcommon(list, int):
    """Return the *int* most frequent words of *list*, lower-cased.

    Frequencies are counted case-insensitively; the result is ordered
    from most to least common.
    """
    frequencies = FreqDist(entry.lower() for entry in list)
    return [token for token, _count in frequencies.most_common(int)]
|
||||||
|
|
||||||
|
def findleastcommon(list):
    """Collect the least common words of *list* into the global list.

    Appends every lower-cased word whose count is at most the module-level
    ``limit`` to the module-level ``leastcommon_list`` and returns it
    (the list accumulates across calls).
    """
    frequencies = FreqDist(entry.lower() for entry in list)
    for token, count in frequencies.most_common():
        if count <= limit:
            leastcommon_list.append(token)
    return leastcommon_list
|
||||||
|
|
||||||
|
def coordinates(attribute):
    """Extract a word's bounding box from an hOCR 'title' attribute.

    attribute -- the title string of an ocrx_word span, e.g.
                 "bbox 425 823 525 851; x_wconf 90"

    Returns the four box coordinates [x1, y1, x2, y2] as integers.

    Fix: the original ``c, r = c.split(";")`` raised ValueError whenever the
    title contained zero or more than one ';' (hOCR titles may carry several
    ';'-separated properties); only the leading "bbox ..." section matters.
    """
    bbox_section = attribute.split(";")[0]   # keep only "bbox x1 y1 x2 y2"
    parts = bbox_section.split(" ")[1:]      # drop the "bbox" keyword
    return [int(p) for p in parts]
|
||||||
|
|
||||||
|
def filternone(word_raw):
    """Normalise a raw OCR word.

    word_raw -- the text of an ocrx_word span, possibly None.

    Returns the placeholder 'y' when word_raw is None (so later ``len()`` /
    ``.lower()`` calls don't crash), otherwise word_raw with surrounding
    punctuation ,".!:;() stripped.

    Fix: the original else-branch read the *global* ``element.text`` instead
    of the parameter — it only worked because every caller happened to pass
    element.text.  The dead ``remove = None`` assignment is gone too.
    """
    if word_raw is None:
        return 'y'
    return word_raw.strip(',".!:;()')
|
||||||
|
|
||||||
|
# --- script state -----------------------------------------------------------
x = -1                      # page counter; starts at -1 so the first iteration is page 0
leastcommon_list = []       # filled by findleastcommon(); accumulates across pages
allwords = []               # every OCR'd word seen so far; accumulates across pages
scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html')
num = 0                     # global crop counter; gives every saved word-crop a unique name

maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned


# loop through every image in scanimg folder
# NOTE(review): pairs scanimg[x] with hocr[x] by glob position — glob order is
# filesystem-dependent; confirm both lists sort identically (or sort them).
for i in scanimg:
    x = x + 1
    limit = 15 - (x * maximum) # this helps the script remove words in a way that is proportional to number of pages scanned
    # NOTE(review): unlike the erase script, this threshold *shrinks* per page — confirm intended.
    iim = Image.open(i) # iim is initial image
    oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image

    # open corresponding hocr file
    print ("Analysing", hocr[x])
    f = open(hocr[x])   # NOTE(review): never closed; handle leaks until GC
    print ('Reading scanned image, filtering least common words.')
    print ("")

    t = html5lib.parse(f, namespaceHTMLElements=False)

    # loop through every word in hocr file to analyse words and find least common
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords) #clean stopwords
    findleastcommon(clean_words) #find least common words and add them to list
    mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list

    print ("The most common words until text", x+1, "are:", mostcommon_list)

    # loop through every word in hocr file to extract coordinates, then remove or paste into output image

    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        num = num + 1

        wim = iim.crop(c) # wim is word image

        #extract coordinates
        # Save crops of common words, bucketed by word length, as replacement material.
        # NOTE(review): the first branch lower-cases the word but the next two do not — confirm intended.
        if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
            wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num))
        elif word in mostcommon_list and len(word) <= 7 :
            wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
        elif word in mostcommon_list and len(word) > 7 :
            wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))

        if x > 0:
            # From page 1 onward the crop folders are populated, so least-common
            # words can be replaced by a random crop of a common word.
            # use PIL to crop out every box, then paste it according to if rule
            randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
            randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
            randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))

            wimreplace4 = Image.open(randomimg4)
            wimreplace7 = Image.open(randomimg7)
            wimreplace_more = Image.open(randomimg_more)

            # Tint each replacement crop with translucent yellow so swapped words are visible.
            wimcolor4 = Image.new('RGBA', wimreplace4.size, (250, 230, 0, 90))
            wimcolor7 = Image.new('RGBA', wimreplace7.size, (250, 230, 0, 90))
            wimcolor_more = Image.new('RGBA', wimreplace_more.size, (250, 230, 0, 90))

            out4 = Image.alpha_composite(wimreplace4, wimcolor4)
            out7 = Image.alpha_composite(wimreplace7, wimcolor7)
            out_more = Image.alpha_composite(wimreplace_more, wimcolor_more)

            # NOTE(review): branch conditions overlap (e.g. len 9-10 matches both
            # "< 11" and "> 8"); first match wins, so "> 8" only catches len >= 11
            # after "<= limit" — confirm the intended length buckets.
            if word.lower() in leastcommon_list and len(word) <= limit:
                oim.paste(wim, (c[0], c[1], c[2], c[3]))

            elif word.lower() in leastcommon_list and len(word) < 8:
                oim.paste(out4, (c[0], c[1]))
                print ('Excluding:', word)

            elif word.lower() in leastcommon_list and len(word) < 11:
                oim.paste(out7, (c[0], c[1]))
                print ('Excluding:', word)

            elif word.lower() in leastcommon_list and len(word) > 8:
                oim.paste(out_more, (c[0], c[1]))
                print ('Excluding:', word)

            else:
                # keep: paste the original word crop back in place
                oim.paste(wim, (c[0], c[1], c[2], c[3]))
                print ('Including:', word)

        else:
            # Page 0: no crops exist yet, so every word is kept as-is.
            oim.paste(wim, (c[0], c[1], c[2], c[3]))
            print ('Including:', word)


    #-------------------------------------------------------------------------------#
    # save images
    n = i.replace("images-tiff/","output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}replace.jpg".format(n, x))

#-------------------------------------------------------------------------------#
# save images into PDF
outputs = glob.glob('output/erase-replace/*replace.jpg')
print ("Saving to PDF:", outputs)
||||||
|
|
||||||
|
def makePdf(pdfFileName, listPages, dir = ''):
    """Combine the image files in listPages into a single PDF.

    pdfFileName -- output file name without the ".pdf" extension
    listPages   -- iterable of image paths, one image per PDF page
    dir         -- optional directory prefix applied to both inputs and output

    Fix: the original left the cover image's file handle open; a context
    manager closes it as soon as the page size has been read.
    """
    if (dir):
        dir += "/"

    # The first image's pixel size defines the PDF page format (1 pt per px).
    with Image.open(dir + str(listPages[0])) as cover:
        width, height = cover.size
    pdf = FPDF(unit = "pt", format = [width, height])

    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)

    pdf.output(dir + pdfFileName + ".pdf", "F")
|
||||||
|
|
||||||
|
makePdf('output/erase-replace/Replace', outputs, dir = '')

# Delete the intermediate per-page jpgs and the cropped-word pngs, pattern by
# pattern, in the same order the original script removed them.
cleanup_patterns = (
    './output/erase-replace/*replace.jpg',
    './output/erase-replace/crops4/*.png',
    './output/erase-replace/crops7/*.png',
    './output/erase-replace/crops_more/*.png',
)
for pattern in cleanup_patterns:
    files = glob.glob(pattern)
    for f in files:
        os.remove(f)
|
||||||
|
|
Loading…
Reference in New Issue