updated erase & replace rule in makefile

7 years ago · 0587b5cdaf
parent ab85a34326
commit 0587b5cdaf
3 changed files with 33 additions and 47 deletions
--- a/3
+++ b/3
@@ -100,9 +100,8 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
 erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
 	python3 src/erase_leastcommon.py
 	@echo 'erase rule output: output/erase-replace/Erase.pdf'
-replace: ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
+replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
 	python3 src/replace_leastcommon.py
 visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
--- a/src/erase_leastcommon.py
+++ b/src/erase_leastcommon.py
@@ -57,11 +57,10 @@ for i in scanimg:
 	iim = Image.open(i) # iim is initial image
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
-	# open corresponding hocr file
+	# open corresponding hocr file 
 	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
 	print ('Reading scanned image, filtering least common words.')
-	print ("") 
+	print ('') 
 	t = html5lib.parse(f, namespaceHTMLElements=False)
@@ -73,7 +72,9 @@ for i in scanimg:
 	clean_words = cleanstopwords(allwords) #clean stopwords
 	findleastcommon(clean_words) #find least common words and add them to list
 	print ("The least common words until text", x+1, "are:", leastcommon_list)
-	
+	print ('') 
 	print ('Processing word coordinates and erasing least common words.')
 	print ('') 
 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image
 	for element in t.findall(".//span[@class='ocrx_word']"): 
 		word = filternone(element.text)
@@ -83,11 +84,9 @@ for i in scanimg:
 		if word.lower() in leastcommon_list and len(word) < limit:
 			oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
 			print ('Excluding:', word)
 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
 			print ('Including:', word)
 	#-------------------------------------------------------------------------------#
 	# save and show images
@@ -98,20 +97,17 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*erase.jpg')
-print ("Saving to PDF:", outputs)
+print ("Saving to PDF: output/erase-replace/Erase.pdf")
 # Assemble the image files in listPages into a single PDF, one image per page.
 #   pdfFileName: output path without the ".pdf" extension (appended below).
 #   listPages:   image file paths; each is prefixed with dir before use.
 #   dir:         optional directory prefix; "/" is appended when it is non-empty,
 #                and it is prefixed to the output path as well.
 # NOTE(review): listPages[0] is read unconditionally, so an empty list fails there.
 def makePdf(pdfFileName, listPages, dir = ''):
 	if (dir):
 		dir += "/"
 	# the first image supplies the page format used for the whole document
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	# NOTE(review): the image size is passed as-is as a "pt" page format — confirm this is the intended scale
 	pdf = FPDF(unit = "pt", format = [width, height])
 	for page in listPages:
 		# one new page per image, drawn at the page origin (0, 0)
 		pdf.add_page()
 		pdf.image(dir + str(page), 0, 0)
 	# "F" destination: presumably saves to a local file — confirm against the FPDF output() docs
 	pdf.output(dir + pdfFileName + ".pdf", "F")
 makePdf('output/erase-replace/Erase', outputs, dir = '')
--- a/src/replace_leastcommon.py
+++ b/src/replace_leastcommon.py
@@ -8,6 +8,14 @@ import glob
 import time
 from fpdf import FPDF
 import os
 import shutil
 path1 = './temp'
 if not os.path.isdir(path1):
   os.makedirs(path1)
   os.makedirs('./temp/crops4')
   os.makedirs('./temp/crops7')
   os.makedirs('./temp/crops_more')
 stopwords.words('english')
 sr = set(stopwords.words('english'))
@@ -58,10 +66,8 @@ allwords = []
 scanimg = glob.glob('images-tiff/*.tiff')
 hocr = glob.glob('hocr/*.html')
 num = 0
 maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned 
 # loop through every image in scanimg folder
 for i in scanimg:
 	x = x + 1
@@ -70,10 +76,9 @@ for i in scanimg:
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
 	# open corresponding hocr file
 	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
-	print ('Reading scanned image, filtering least common words.')
+	print ('Reading scanned image and hocr file, filtering least common words.')
-	print ("") 
+	print ('') 
 	t = html5lib.parse(f, namespaceHTMLElements=False)
@@ -86,11 +91,15 @@ for i in scanimg:
 	findleastcommon(clean_words) #find least common words and add them to list
 	mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
-	print ("The most common words until text", x+1, "are:", mostcommon_list)
+	print ('The most common words until text', x+1, 'are:', mostcommon_list)
-	
+	print ('') 
 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image
-	for element in t.findall(".//span[@class='ocrx_word']"): 
+	print ('Processing word coordinates and replacing least common words with most common words.')
 	print ('') 
 	for element in t.findall(".//span[@class='ocrx_word']"):
 		word = filternone(element.text)
 		c = coordinates(element.attrib['title']) 
 		num = num + 1
@@ -99,17 +108,17 @@ for i in scanimg:
 		#extract coordinates
 		if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
-			wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num))
+			wim.save ("temp/crops4/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) <= 7 :
-			wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
+			wim.save ("temp/crops7/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) > 7 :
-			wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))
+			wim.save ("temp/crops_more/wimreplace{}.png".format(num))
 		if x > 0:
 			# use PIL to crop out every box, then paste it according to if rule
-			randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
+			randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
-			randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
+			randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
-			randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))
+			randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
 			wimreplace4 = Image.open(randomimg4)
 			wimreplace7 = Image.open(randomimg7)
@@ -128,23 +137,18 @@ for i in scanimg:
 			elif word.lower() in leastcommon_list and len(word) < 8:
 				oim.paste(out4, (c[0], c[1]))
 				print ('Excluding:', word)
 			elif word.lower() in leastcommon_list and len(word) < 11:
 				oim.paste(out7, (c[0], c[1]))
 				print ('Excluding:', word)	
 			elif word.lower() in leastcommon_list and len(word) > 8:
 				oim.paste(out_more, (c[0], c[1]))
 				print ('Excluding:', word)	
 			else:
 				oim.paste(wim, (c[0], c[1], c[2], c[3]))
 				print ('Including:', word)
 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
 			print ('Including:', word)
 	#-------------------------------------------------------------------------------#
@@ -155,20 +159,18 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*replace.jpg')
-print ("Saving to PDF:", outputs)
+print ('') 
 print ("Saving to PDF: output/erase-replace/Replace.pdf")
 # Combine the image files in listPages into one PDF with one image on each page.
 #   pdfFileName: output path without the ".pdf" extension (appended below).
 #   listPages:   image file paths; each is prefixed with dir before use.
 #   dir:         optional directory prefix; "/" is appended when it is non-empty,
 #                and it is prefixed to the output path as well.
 # NOTE(review): listPages[0] is read unconditionally, so an empty list fails there.
 def makePdf(pdfFileName, listPages, dir = ''):
 	if (dir):
 		dir += "/"
 	# the first image supplies the page format used for the whole document
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	# NOTE(review): the image size is passed as-is as a "pt" page format — confirm this is the intended scale
 	pdf = FPDF(unit = "pt", format = [width, height])
 	for page in listPages:
 		# one new page per image, drawn at the page origin (0, 0)
 		pdf.add_page()
 		pdf.image(dir + str(page), 0, 0)
 	# "F" destination: presumably saves to a local file — confirm against the FPDF output() docs
 	pdf.output(dir + pdfFileName + ".pdf", "F")
 makePdf('output/erase-replace/Replace', outputs, dir = '')
@@ -177,16 +179,5 @@ makePdf('output/erase-replace/Replace', outputs, dir = '')
 files = glob.glob('./output/erase-replace/*replace.jpg')
 for f in files:
    os.remove(f)
 files = glob.glob('./output/erase-replace/crops4/*.png')
 for f in files:
    os.remove(f)
 files = glob.glob('./output/erase-replace/crops7/*.png')
 for f in files:
    os.remove(f)
 files = glob.glob('./output/erase-replace/crops_more/*.png')
 for f in files:
    os.remove(f)
 shutil.rmtree('./temp/')