updated erase & replace rule in makefile

7 years ago · 0587b5cdaf
parent ab85a34326
commit 0587b5cdaf
3 changed files with 33 additions and 47 deletions
--- a/3
+++ b/3
@ -100,9 +100,8 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
 erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
 	python3 src/erase_leastcommon.py
 	@echo 'erase rule output: output/erase-replace/Erase.pdf'
-replace: ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
+replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
 	python3 src/replace_leastcommon.py
 visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
--- a/src/erase_leastcommon.py
+++ b/src/erase_leastcommon.py
@ -58,10 +58,9 @@ for i in scanimg:
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
 	# open corresponding hocr file 
 	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
 	print ('Reading scanned image, filtering least common words.')
-	print ("") 
+	print ('') 
 	t = html5lib.parse(f, namespaceHTMLElements=False)
@ -73,7 +72,9 @@ for i in scanimg:
 	clean_words = cleanstopwords(allwords) #clean stopwords
 	findleastcommon(clean_words) #find least common words and add them to list
 	print ("The least common words until text", x+1, "are:", leastcommon_list)
-	
+	print ('') 
 	print ('Processing word coordinates and erasing least common words.')
 	print ('') 
 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image
 	for element in t.findall(".//span[@class='ocrx_word']"): 
 		word = filternone(element.text)
@ -83,11 +84,9 @@ for i in scanimg:
 		if word.lower() in leastcommon_list and len(word) < limit:
 			oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
 			print ('Excluding:', word)
 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
 			print ('Including:', word)
 	#-------------------------------------------------------------------------------#
 	# save and show images
@ -98,20 +97,17 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*erase.jpg')
-print ("Saving to PDF:", outputs)
+print ("Saving to PDF: output/erase-replace/Erase.pdf")
 # Assemble the given page images into a single PDF, one image per page.
 #   pdfFileName -- output path without the ".pdf" extension (it is appended below)
 #   listPages   -- image paths; pages are added in list order
 #   dir         -- optional directory prefixed to every input path and to the output path
 # NOTE(review): the parameter name `dir` shadows the builtin of the same name inside this function.
 def makePdf(pdfFileName, listPages, dir = ''):
 	# A non-empty directory is turned into a path prefix by appending a slash.
 	if (dir):
 		dir += "/"
 	# The first image alone determines the page size used for the whole document.
 	# NOTE(review): an empty listPages raises IndexError here (e.g. when the glob above matches nothing),
 	# and `cover` is never explicitly closed.
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	# The cover's width/height are passed as the page format, with the unit set to points.
 	pdf = FPDF(unit = "pt", format = [width, height])
 	for page in listPages:
 		pdf.add_page()
 		# Place the image at the page origin (0, 0).
 		pdf.image(dir + str(page), 0, 0)
 	# "F" presumably selects FPDF's save-to-local-file destination -- confirm against the fpdf version in use.
 	pdf.output(dir + pdfFileName + ".pdf", "F")
 makePdf('output/erase-replace/Erase', outputs, dir = '')
--- a/src/replace_leastcommon.py
+++ b/src/replace_leastcommon.py
@ -8,6 +8,14 @@ import glob
 import time
 from fpdf import FPDF
 import os
 import shutil
 # Scratch area for the cropped word images saved further down
 # (temp/crops4, temp/crops7, temp/crops_more); the whole tree is deleted
 # with shutil.rmtree('./temp/') at the end of the script.
 path1 = './temp'
 # NOTE(review): the three sub-directories are created only when ./temp itself is missing.
 # If ./temp is left behind without them (e.g. after an interrupted run), nothing is created here
 # and the later saves into temp/crops* would presumably fail -- verify.
 if not os.path.isdir(path1):
   os.makedirs(path1)
   os.makedirs('./temp/crops4')
   os.makedirs('./temp/crops7')
   os.makedirs('./temp/crops_more')
 stopwords.words('english')
 sr = set(stopwords.words('english'))
@ -58,10 +66,8 @@ allwords = []
 scanimg = glob.glob('images-tiff/*.tiff')
 hocr = glob.glob('hocr/*.html')
 num = 0
 maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned 
 # loop through every image in scanimg folder
 for i in scanimg:
 	x = x + 1
@ -70,10 +76,9 @@ for i in scanimg:
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
 	# open corresponding hocr file
 	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
-	print ('Reading scanned image, filtering least common words.')
+	print ('Reading scanned image and hocr file, filtering least common words.')
-	print ("") 
+	print ('') 
 	t = html5lib.parse(f, namespaceHTMLElements=False)
@ -86,10 +91,14 @@ for i in scanimg:
 	findleastcommon(clean_words) #find least common words and add them to list
 	mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
-	print ("The most common words until text", x+1, "are:", mostcommon_list)
+	print ('The most common words until text', x+1, 'are:', mostcommon_list)
 	print ('') 
 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image
 	print ('Processing word coordinates and replacing least common words with most common words.')
 	print ('') 
 	for element in t.findall(".//span[@class='ocrx_word']"):
 		word = filternone(element.text)
 		c = coordinates(element.attrib['title']) 
@ -99,17 +108,17 @@ for i in scanimg:
 		#extract coordinates
 		if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
-			wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num))
+			wim.save ("temp/crops4/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) <= 7 :
-			wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
+			wim.save ("temp/crops7/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) > 7 :
-			wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))
+			wim.save ("temp/crops_more/wimreplace{}.png".format(num))
 		if x > 0:
 			# use PIL to crop out every box, then paste it according to if rule
-			randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
+			randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
-			randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
+			randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
-			randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))
+			randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
 			wimreplace4 = Image.open(randomimg4)
 			wimreplace7 = Image.open(randomimg7)
@ -128,23 +137,18 @@ for i in scanimg:
 			elif word.lower() in leastcommon_list and len(word) < 8:
 				oim.paste(out4, (c[0], c[1]))
 				print ('Excluding:', word)
 			elif word.lower() in leastcommon_list and len(word) < 11:
 				oim.paste(out7, (c[0], c[1]))
 				print ('Excluding:', word)	
 			elif word.lower() in leastcommon_list and len(word) > 8:
 				oim.paste(out_more, (c[0], c[1]))
 				print ('Excluding:', word)	
 			else:
 				oim.paste(wim, (c[0], c[1], c[2], c[3]))
 				print ('Including:', word)
 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
 			print ('Including:', word)
 	#-------------------------------------------------------------------------------#
@ -155,20 +159,18 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*replace.jpg')
-print ("Saving to PDF:", outputs)
+print ('') 
 print ("Saving to PDF: output/erase-replace/Replace.pdf")
 # Assemble the given page images into a single PDF, one image per page.
 #   pdfFileName -- output path without the ".pdf" extension (it is appended below)
 #   listPages   -- image paths; pages are added in list order
 #   dir         -- optional directory prefixed to every input path and to the output path
 # NOTE(review): the parameter name `dir` shadows the builtin of the same name inside this function.
 def makePdf(pdfFileName, listPages, dir = ''):
 	# A non-empty directory is turned into a path prefix by appending a slash.
 	if (dir):
 		dir += "/"
 	# The first image alone determines the page size used for the whole document.
 	# NOTE(review): an empty listPages raises IndexError here (e.g. when the glob above matches nothing),
 	# and `cover` is never explicitly closed.
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	# The cover's width/height are passed as the page format, with the unit set to points.
 	pdf = FPDF(unit = "pt", format = [width, height])
 	for page in listPages:
 		pdf.add_page()
 		# Place the image at the page origin (0, 0).
 		pdf.image(dir + str(page), 0, 0)
 	# "F" presumably selects FPDF's save-to-local-file destination -- confirm against the fpdf version in use.
 	pdf.output(dir + pdfFileName + ".pdf", "F")
 makePdf('output/erase-replace/Replace', outputs, dir = '')
@ -178,15 +180,4 @@ files = glob.glob('./output/erase-replace/*replace.jpg')
 for f in files:
    os.remove(f)
-files = glob.glob('./output/erase-replace/crops4/*.png')
+shutil.rmtree('./temp/')
-for f in files:
-   os.remove(f)
-files = glob.glob('./output/erase-replace/crops7/*.png')
-for f in files:
-   os.remove(f)
-files = glob.glob('./output/erase-replace/crops_more/*.png')
-for f in files:
-   os.remove(f)