# Reconstructed from git patch "added erase & replace scripts to src/"
# (Natasha Berting, Fri 23 Mar 2018); the original patch was
# whitespace-mangled and no longer applies with `git apply`.
#
# --------------------------------------------------------------------------- #
# File: src/erase_leastcommon.py
#
# Reads every TIFF in images-tiff/ together with its corresponding hOCR
# file, finds the least common words across all pages seen so far, and
# saves a copy of each page with those words erased (painted white).
# The processed pages are later bundled into a single PDF.
# --------------------------------------------------------------------------- #

import glob
import os

import html5lib
from fpdf import FPDF
from nltk import FreqDist
from nltk.corpus import stopwords
from PIL import Image

# English stopwords, built once (the original called stopwords.words twice).
SR = set(stopwords.words('english'))


def cleanstopwords(words):
    """Return a copy of *words* with English stopwords removed.

    BUG FIX: the parameter no longer shadows the builtin ``list``; the
    remove-while-iterating pattern is replaced by a comprehension.
    """
    return [word for word in words if word.lower() not in SR]


def findleastcommon(words, limit):
    """Return the distinct lowercased words occurring at most *limit* times.

    BUG FIX: the original appended into a module-global list on every call,
    accumulating duplicates across pages, and read ``limit`` from a global.
    It is now a pure function.
    """
    fdist = FreqDist(word.lower() for word in words)
    return [word for word, count in fdist.most_common() if count <= limit]


def coordinates(attribute):
    """Extract the word's box coordinates from an hOCR 'title' attribute.

    The attribute looks like ``"bbox x0 y0 x1 y1; x_wconf NN"``; only the
    bbox section is kept and returned as a list of four ints.
    """
    bbox = attribute.split(";")[0]                 # drop the confidence part
    return [int(v) for v in bbox.split(" ")[1:]]   # drop the "bbox" keyword


def filternone(word_raw):
    """Strip punctuation from an OCR word; map missing text to 'y'.

    BUG FIX: the original ignored *word_raw* and read the loop-global
    ``element.text`` (same value only by accident) and bound a dead
    ``remove`` variable.  'y' is kept as the original placeholder for
    words the OCR could not read.
    """
    if word_raw is None:
        return 'y'
    return word_raw.strip(',".!:;()')


leastcommon_list = []
allwords = []
# BUG FIX: glob order is filesystem-dependent; sorting keeps scanimg[x]
# paired with hocr[x] deterministically.
scanimg = sorted(glob.glob('images-tiff/*.tiff'))
hocr = sorted(glob.glob('hocr/*.html'))

# This helps the script remove words proportionally to the number of pages.
maximum = 20 / len(scanimg)

# Loop through every image in the images-tiff folder.
for x, i in enumerate(scanimg):
    limit = x * maximum        # removal threshold grows page by page
    iim = Image.open(i)        # iim: initial (scanned) image
    oim = Image.new("RGB", iim.size, (255, 255, 255))  # oim: output image

    # Open the corresponding hOCR file.
    print("Analysing", hocr[x])
    print('Reading scanned image, filtering least common words.')
    print("")

    # BUG FIX: the hOCR file handle is now closed after parsing.
    with open(hocr[x]) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # First pass: collect every word of this page into the running list.
    for element in t.findall(".//span[@class='ocrx_word']"):
        allwords.append(filternone(element.text))

    clean_words = cleanstopwords(allwords)
    # Rebuilt from scratch each page (see findleastcommon bug fix).
    leastcommon_list = findleastcommon(clean_words, limit)
    print("The least common words until text", x + 1, "are:", leastcommon_list)

    # Second pass: copy every word box, blanking the least common ones.
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        wim = iim.crop(c)      # wim: word image

        if word.lower() in leastcommon_list and len(word) < limit:
            # Erase: paint the word's box white in the output page.
            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
            print('Excluding:', word)
        else:
            oim.paste(wim, (c[0], c[1], c[2], c[3]))
            print('Including:', word)

    # ---------------------------------------------------------------------- #
    # Save the processed page alongside the other outputs.
    n = i.replace("images-tiff/", "output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}erase.jpg".format(n, x))


# --------------------------------------------------------------------------- #
# Collect the generated pages for the PDF step.
outputs = glob.glob('output/erase-replace/*erase.jpg')
print("Saving to PDF:", outputs)


def makePdf(pdfFileName, listPages, dir=''):
    """Bundle the images in *listPages* into one PDF sized to the first page.

    ``dir``, when given, is prepended (with a slash) to every page path and
    to the output path.  The parameter name shadows the builtin but is kept
    for call compatibility.
    """
    if dir:
        dir += "/"
    # The first page fixes the PDF page size (points).
    cover = Image.open(dir + str(listPages[0]))
    width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)
    pdf.output(dir + pdfFileName + ".pdf", "F")


makePdf('output/erase-replace/Erase', outputs, dir='')

# Clean up the intermediate jpg pages now that the PDF exists.
for f in glob.glob('./output/erase-replace/*erase.jpg'):
    os.remove(f)


# --------------------------------------------------------------------------- #
# File: src/replace_leastcommon.py
#
# Reads every TIFF in images-tiff/ with its hOCR file and replaces the
# least common words with randomly chosen crops of the most common words,
# tinted yellow; the processed pages are then bundled into a PDF.
# --------------------------------------------------------------------------- #

import glob
import os
import random

import html5lib
from fpdf import FPDF
from nltk import FreqDist
from nltk.corpus import stopwords
from PIL import Image

SR = set(stopwords.words('english'))


def cleanstopwords(words):
    """Return a copy of *words* with English stopwords removed.

    BUG FIX: no longer shadows the builtin ``list``; the
    remove-while-iterating pattern is replaced by a comprehension.
    """
    return [word for word in words if word.lower() not in SR]


def findmostcommon(words, count):
    """Return the *count* most common lowercased words in *words*.

    BUG FIX: parameters no longer shadow the builtins ``list``/``int``.
    """
    fdist = FreqDist(word.lower() for word in words)
    return [word for word, _ in fdist.most_common(count)]


def findleastcommon(words, limit):
    """Return the distinct lowercased words occurring at most *limit* times.

    BUG FIX: the original appended into a module-global list on every call
    (accumulating duplicates across pages) and read ``limit`` from a global.
    """
    fdist = FreqDist(word.lower() for word in words)
    return [word for word, freq in fdist.most_common() if freq <= limit]


def coordinates(attribute):
    """Extract the word's box coordinates from an hOCR 'title' attribute.

    The attribute looks like ``"bbox x0 y0 x1 y1; x_wconf NN"``; only the
    bbox section is kept and returned as a list of four ints.
    """
    bbox = attribute.split(";")[0]                 # drop the confidence part
    return [int(v) for v in bbox.split(" ")[1:]]   # drop the "bbox" keyword


def filternone(word_raw):
    """Strip punctuation from an OCR word; map missing text to 'y'.

    BUG FIX: the original ignored *word_raw* and read the loop-global
    ``element.text``; 'y' is kept as the original placeholder.
    """
    if word_raw is None:
        return 'y'
    return word_raw.strip(',".!:;()')


leastcommon_list = []
allwords = []
# BUG FIX: sorted so scanimg[x] stays paired with hocr[x] deterministically.
scanimg = sorted(glob.glob('images-tiff/*.tiff'))
hocr = sorted(glob.glob('hocr/*.html'))
num = 0  # running crop counter, used to name the saved word crops

# This helps the script remove words proportionally to the number of pages.
maximum = 20 / len(scanimg)

# Loop through every image in the images-tiff folder.
for x, i in enumerate(scanimg):
    limit = 15 - (x * maximum)  # replacement threshold shrinks page by page
    iim = Image.open(i)         # iim: initial (scanned) image
    oim = Image.new("RGB", iim.size, (255, 255, 255))  # oim: output image

    # Open the corresponding hOCR file.
    print("Analysing", hocr[x])
    print('Reading scanned image, filtering least common words.')
    print("")

    # BUG FIX: the hOCR file handle is now closed after parsing.
    with open(hocr[x]) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # First pass: collect every word of this page into the running list.
    for element in t.findall(".//span[@class='ocrx_word']"):
        allwords.append(filternone(element.text))

    clean_words = cleanstopwords(allwords)
    leastcommon_list = findleastcommon(clean_words, limit)
    mostcommon_list = findmostcommon(clean_words, 30)
    print("The most common words until text", x + 1, "are:", mostcommon_list)

    # Second pass: extract coordinates, then replace or paste each word.
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        num += 1
        wim = iim.crop(c)       # wim: word image

        # Save crops of the most common words, bucketed by word length, to
        # be pasted over least common words on later pages.
        # BUG FIX: all three tests now use word.lower() — the list holds
        # lowercased words, so the bare `word in ...` tests silently missed
        # capitalized words.
        if word.lower() in mostcommon_list and 1 < len(word) <= 5:
            wim.save("output/erase-replace/crops4/wimreplace{}.png".format(num))
        elif word.lower() in mostcommon_list and len(word) <= 7:
            wim.save("output/erase-replace/crops7/wimreplace{}.png".format(num))
        elif word.lower() in mostcommon_list and len(word) > 7:
            wim.save("output/erase-replace/crops_more/wimreplace{}.png".format(num))

        if x > 0:
            # Pick a fresh random crop from each length bucket for this word.
            randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
            randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
            randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))

            # BUG FIX: Image.alpha_composite requires RGBA on both sides;
            # the crops were saved from an RGB page, so convert on load.
            wimreplace4 = Image.open(randomimg4).convert('RGBA')
            wimreplace7 = Image.open(randomimg7).convert('RGBA')
            wimreplace_more = Image.open(randomimg_more).convert('RGBA')

            # Translucent yellow overlays matching each crop's size.
            wimcolor4 = Image.new('RGBA', wimreplace4.size, (250, 230, 0, 90))
            wimcolor7 = Image.new('RGBA', wimreplace7.size, (250, 230, 0, 90))
            wimcolor_more = Image.new('RGBA', wimreplace_more.size, (250, 230, 0, 90))

            out4 = Image.alpha_composite(wimreplace4, wimcolor4)
            out7 = Image.alpha_composite(wimreplace7, wimcolor7)
            out_more = Image.alpha_composite(wimreplace_more, wimcolor_more)

            if word.lower() in leastcommon_list and len(word) <= limit:
                # Short enough to keep as-is even though it is uncommon.
                oim.paste(wim, (c[0], c[1], c[2], c[3]))
            elif word.lower() in leastcommon_list and len(word) < 8:
                oim.paste(out4, (c[0], c[1]))
                print('Excluding:', word)
            elif word.lower() in leastcommon_list and len(word) < 11:
                oim.paste(out7, (c[0], c[1]))
                print('Excluding:', word)
            elif word.lower() in leastcommon_list and len(word) > 8:
                # NOTE(review): reached only for len >= 11 because the
                # preceding `< 11` branch wins for lengths 9-10; the `> 8`
                # test is kept as in the original to preserve behavior.
                oim.paste(out_more, (c[0], c[1]))
                print('Excluding:', word)
            else:
                oim.paste(wim, (c[0], c[1], c[2], c[3]))
                print('Including:', word)
        else:
            # First page: no crops collected yet, copy everything verbatim.
            oim.paste(wim, (c[0], c[1], c[2], c[3]))
            print('Including:', word)

    # ---------------------------------------------------------------------- #
    # Save the processed page.
    n = i.replace("images-tiff/", "output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}replace.jpg".format(n, x))

# --------------------------------------------------------------------------- #
# Bundle the processed pages into a PDF.
outputs = glob.glob('output/erase-replace/*replace.jpg')
print("Saving to PDF:", outputs)


def makePdf(pdfFileName, listPages, dir=''):
    """Bundle the images in *listPages* into one PDF sized to the first page.

    Duplicated from erase_leastcommon.py so each script stays standalone.
    """
    if dir:
        dir += "/"
    cover = Image.open(dir + str(listPages[0]))
    width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)
    pdf.output(dir + pdfFileName + ".pdf", "F")


makePdf('output/erase-replace/Replace', outputs, dir='')

# Clean up the intermediate jpg pages and crop pngs now that the PDF exists.
for pattern in ('./output/erase-replace/*replace.jpg',
                './output/erase-replace/crops4/*.png',
                './output/erase-replace/crops7/*.png',
                './output/erase-replace/crops_more/*.png'):
    for f in glob.glob(pattern):
        os.remove(f)