Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNuPo-make

7 years ago · 430fd0fca2
parent 57c68a8743 c9820c1a17
commit 430fd0fca2
4 changed files with 69 additions and 58 deletions
--- a/10
+++ b/10
@ -1,5 +1,6 @@
 images=$(sort $(wildcard images/*.jpg))
 # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
+images-tiff=$(sort $(wildcard images-tiff/*.tiff))
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@ -34,9 +35,10 @@ dirs: ## create the dirs in working dir
 	@-mkdir -p images/
 	@-mkdir -p images-tiff/
 	@-mkdir -p output/
+	@-mkdir -p output/erase-replace/
 	@-mkdir -p ocr/
 	@-mkdir -p hocr/
-	@echo $(color_r)'Directories made': images/ output/
+	@echo $(color_r)'Directories made': ocr/ hocr/ images/ images-tiff/ output/ 


 testif:
@ -91,10 +93,16 @@ carlandre: ocr/output.txt ## Alice: Creates visual poetry out of a text. Depende
 .PHONY: carlandre
 # cat $(@) > /dev/usb/lp0

+
 overunder: ocr/output.txt ## Alice: An interpreted language that translate simple weaving instructions and creates a weaving pattern on text.
 	@python3 src/overunder.py
 .PHONY: overunder

+erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
+	python3 src/erase_leastcommon.py
+
+replace:hocrs ## Natasha: Analyzes pages in order, replaces least common words with most common words. Dependencies: PIL, html5lib, FPDF
+	python3 src/replace_leastcommon.py

 visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
 	@echo $(tmpfile)
--- a/36
+++ b/36
@ -1,16 +1,33 @@
 # OuNuPo Make
-Software experiments for the OuNuPo bookscanner. Part of Special Issue #5
+Software experiments for the OuNuPo bookscanner, part of Special Issue 5
+
+https://issue.xpub.nl/05/
+
+https://xpub.nl/
+

 ## License

 ## Authors
-#Angeliki Diakrousi
+Natasha Berting, Angeliki Diakrousi, Joca van der Horst, Alexander Roidl, Alice Strete and Zalán Szakács.


-# Install 
+## Clone Repository
 `git clone https://git.xpub.nl/repos/OuNuPo-make.git`


+## General dependencies
+* Python3
+* GNU make
+* Python3 NLTK  `pip3 install nltk`
+* NLTK English Corpus:
+    * run NLTK downloader `python -m nltk.downloader`
+    * select menu "Corpora"
+    * select "stopwords"
+    * "Download"
+
+
+
 # Make commands

 ## N+7 (example) Author
@ -18,25 +35,24 @@ Description: Replaces every word with the 7th next word in a dictionary.

 run: `make N+7`

-dependencies:
+Specific Dependencies:
 * a
 * b
 * c


-## ttssr-human-only Angeliki
+## Sitting inside a pocket(sphinx): Angeliki
 Description: Speech recognition feedback loops using the first sentence of a scanned text as input

 run: `make ttssr-human-only`

-dependencies:
-*python 3.3
-*pocketsphinx
+Specific Dependencies:
+* [pocketsphinx](https://github.com/bambocher/pocketsphinx-python) `sudo pip3 install pocketsphinx`  ---> FOLLOW THIS EXAMPLE
 * SpeechRecognition 3.8.1 
 * PyAudio 

-install: 
-`https://pzwiki.wdka.nl/mediadesign/Speech_recognition`
+
+



--- a/src/erase_leastcommon.py
+++ b/src/erase_leastcommon.py
@ -58,10 +58,9 @@ for i in scanimg:
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image

 	# open corresponding hocr file 
-	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
 	print ('Reading scanned image, filtering least common words.')
-	print ("") 
+	print ('') 

 	t = html5lib.parse(f, namespaceHTMLElements=False)
 	
@ -73,7 +72,9 @@ for i in scanimg:
 	clean_words = cleanstopwords(allwords) #clean stopwords
 	findleastcommon(clean_words) #find least common words and add them to list
 	print ("The least common words until text", x+1, "are:", leastcommon_list)
-	
+	print ('') 
+	print ('Processing word coordinates and erasing least common words.')
+	print ('') 
 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image
 	for element in t.findall(".//span[@class='ocrx_word']"): 
 		word = filternone(element.text)
@ -83,11 +84,9 @@ for i in scanimg:

 		if word.lower() in leastcommon_list and len(word) < limit:
 			oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
-			print ('Excluding:', word)

 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
-			print ('Including:', word)
 	
 	#-------------------------------------------------------------------------------#
 	# save and show images
@ -98,20 +97,17 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*erase.jpg')
-print ("Saving to PDF:", outputs)
+print ("Saving to PDF: output/erase-replace/Erase.pdf")

 def makePdf(pdfFileName, listPages, dir = ''):
 	if (dir):
 		dir += "/"
-
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	pdf = FPDF(unit = "pt", format = [width, height])
-
 	for page in listPages:
 		pdf.add_page()
 		pdf.image(dir + str(page), 0, 0)
-
 	pdf.output(dir + pdfFileName + ".pdf", "F")

 makePdf('output/erase-replace/Erase', outputs, dir = '')
--- a/src/replace_leastcommon.py
+++ b/src/replace_leastcommon.py
@ -8,6 +8,14 @@ import glob
 import time
 from fpdf import FPDF
 import os
+import shutil
+
+path1 = './temp'
+if not os.path.isdir(path1):
+   os.makedirs(path1)
+   os.makedirs('./temp/crops4')
+   os.makedirs('./temp/crops7')
+   os.makedirs('./temp/crops_more')

 stopwords.words('english')
 sr = set(stopwords.words('english'))
@ -58,10 +66,8 @@ allwords = []
 scanimg = glob.glob('images-tiff/*.tiff')
 hocr = glob.glob('hocr/*.html')
 num = 0
-
 maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned 

-
 # loop through every image in scanimg folder
 for i in scanimg:
 	x = x + 1
@ -70,10 +76,9 @@ for i in scanimg:
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image

 	# open corresponding hocr file
-	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
-	print ('Reading scanned image, filtering least common words.')
-	print ("") 
+	print ('Reading scanned image and hocr file, filtering least common words.')
+	print ('') 

 	t = html5lib.parse(f, namespaceHTMLElements=False)
 	
@ -86,10 +91,14 @@ for i in scanimg:
 	findleastcommon(clean_words) #find least common words and add them to list
 	mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list

-	print ("The most common words until text", x+1, "are:", mostcommon_list)
+	print ('The most common words until text', x+1, 'are:', mostcommon_list)
+	print ('') 

 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image

+	print ('Processing word coordinates and replacing least common words with most common words.')
+	print ('') 
+
 	for element in t.findall(".//span[@class='ocrx_word']"):
 		word = filternone(element.text)
 		c = coordinates(element.attrib['title']) 
@ -99,17 +108,17 @@ for i in scanimg:

 		#extract coordinates
 		if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
-			wim.save  ("output/erase-replace/crops4/wimreplace{}.png".format(num))
+			wim.save ("temp/crops4/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) <= 7 :
-			wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
+			wim.save ("temp/crops7/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) > 7 :
-			wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))
+			wim.save ("temp/crops_more/wimreplace{}.png".format(num))

 		if x > 0:
 			# use PIL to crop out every box, then paste it according to if rule
-			randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
-			randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
-			randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))
+			randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
+			randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
+			randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))

 			wimreplace4 = Image.open(randomimg4)
 			wimreplace7 = Image.open(randomimg7)
@ -128,23 +137,18 @@ for i in scanimg:

 			elif word.lower() in leastcommon_list and len(word) < 8:
 				oim.paste(out4, (c[0], c[1]))
-				print ('Excluding:', word)

 			elif word.lower() in leastcommon_list and len(word) < 11:
 				oim.paste(out7, (c[0], c[1]))
-				print ('Excluding:', word)	

 			elif word.lower() in leastcommon_list and len(word) > 8:
 				oim.paste(out_more, (c[0], c[1]))
-				print ('Excluding:', word)	
 				
 			else:
 				oim.paste(wim, (c[0], c[1], c[2], c[3]))
-				print ('Including:', word)

 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
-			print ('Including:', word)


 	#-------------------------------------------------------------------------------#
@ -155,20 +159,18 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*replace.jpg')
-print ("Saving to PDF:", outputs)
+print ('') 
+print ("Saving to PDF: output/erase-replace/Replace.pdf")

 def makePdf(pdfFileName, listPages, dir = ''):
 	if (dir):
 		dir += "/"
-
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	pdf = FPDF(unit = "pt", format = [width, height])
-
 	for page in listPages:
 		pdf.add_page()
 		pdf.image(dir + str(page), 0, 0)
-
 	pdf.output(dir + pdfFileName + ".pdf", "F")

 makePdf('output/erase-replace/Replace', outputs, dir = '')
@ -178,15 +180,4 @@ files = glob.glob('./output/erase-replace/*replace.jpg')
 for f in files:
    os.remove(f)

-files = glob.glob('./output/erase-replace/crops4/*.png')
-for f in files:
-    os.remove(f)
-
-files = glob.glob('./output/erase-replace/crops7/*.png')
-for f in files:
-    os.remove(f)
-
-files = glob.glob('./output/erase-replace/crops_more/*.png')
-for f in files:
-    os.remove(f)
-
+shutil.rmtree('./temp/')