Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNuPo-make

7 years ago · 430fd0fca2
parent 57c68a8743 c9820c1a17
commit 430fd0fca2
4 changed files with 69 additions and 58 deletions
--- a/10
+++ b/10
@ -1,5 +1,6 @@
 images=$(sort $(wildcard images/*.jpg))
 # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
 images-tiff=$(sort $(wildcard images-tiff/*.tiff))
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@ -34,9 +35,10 @@ dirs: ## create the dirs in working dir
 	@-mkdir -p images/
 	@-mkdir -p images-tiff/
 	@-mkdir -p output/
 	@-mkdir -p output/erase-replace/
 	@-mkdir -p ocr/
 	@-mkdir -p hocr/
-	@echo $(color_r)'Directories made': images/ output/
+	@echo $(color_r)'Directories made': ocr/ hocr/ images/ images-tiff/ output/ 
 testif:
@ -91,10 +93,16 @@ carlandre: ocr/output.txt ## Alice: Creates visual poetry out of a text. Depende
 .PHONY: carlandre
 # cat $(@) > /dev/usb/lp0
 overunder: ocr/output.txt ## Alice: An interpreted language that translates simple weaving instructions and creates a weaving pattern on text.
 	@python3 src/overunder.py
 .PHONY: overunder
 erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
 	python3 src/erase_leastcommon.py
 replace:hocrs ## Natasha: Analyzes pages in order, replaces least common words with most common words. Dependencies: PIL, html5lib, FPDF
 	python3 src/replace_leastcommon.py
 visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
 	@echo $(tmpfile)
--- a/40
+++ b/40
@ -1,16 +1,33 @@
 # OuNuPo Make
-Software experiments for the OuNuPo bookscanner. Part of Special Issue #5
+Software experiments for the OuNuPo bookscanner, part of Special Issue 5
 https://issue.xpub.nl/05/
 https://xpub.nl/
 ## License
 ## Authors
-#Angeliki Diakrousi
+Natasha Berting, Angeliki Diakrousi, Joca van der Horst, Alexander Roidl, Alice Strete and Zalán Szakács.
-# Install 
+## Clone Repository
 `git clone https://git.xpub.nl/repos/OuNuPo-make.git`
 ## General dependencies
 * Python3
 * GNU make
 * Python3 NLTK  `pip3 install nltk`
 * NLTK English Corpus:
    * run NLTK downloader `python -m nltk.downloader`
    * select menu "Corpora"
    * select "stopwords"
    * "Download"
 # Make commands
 ## N+7 (example) Author
@ -18,25 +35,24 @@ Description: Replaces every word with the 7th next word in a dictionary.
 run: `make N+7`
-dependencies:
+Specific Dependencies:
 * a
 * b
 * c
-## ttssr-human-only Angeliki
+## Sitting inside a pocket(sphinx): Angeliki
 Description: Speech recognition feedback loops using the first sentence of a scanned text as input
 run: `make ttssr-human-only`
-dependencies:
+Specific Dependencies:
-*python 3.3
+* [pocketsphinx](https://github.com/bambocher/pocketsphinx-python) `sudo pip3 install pocketsphinx`  ---> FOLLOW THIS EXAMPLE
-*pocketsphinx
+* SpeechRecognition 3.8.1 
-*SpeechRecognition 3.8.1 
+* PyAudio 
-*PyAudio
+
 install: 
 `https://pzwiki.wdka.nl/mediadesign/Speech_recognition`
--- a/src/erase_leastcommon.py
+++ b/src/erase_leastcommon.py
@ -57,11 +57,10 @@ for i in scanimg:
 	iim = Image.open(i) # iim is initial image
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
-	# open corresponding hocr file
+	# open corresponding hocr file 
 	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
 	print ('Reading scanned image, filtering least common words.')
-	print ("") 
+	print ('') 
 	t = html5lib.parse(f, namespaceHTMLElements=False)
@ -73,7 +72,9 @@ for i in scanimg:
 	clean_words = cleanstopwords(allwords) #clean stopwords
 	findleastcommon(clean_words) #find least common words and add them to list
 	print ("The least common words until text", x+1, "are:", leastcommon_list)
-	
+	print ('') 
 	print ('Processing word coordinates and erasing least common words.')
 	print ('') 
 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image
 	for element in t.findall(".//span[@class='ocrx_word']"): 
 		word = filternone(element.text)
@ -83,11 +84,9 @@ for i in scanimg:
 		if word.lower() in leastcommon_list and len(word) < limit:
 			oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
 			print ('Excluding:', word)
 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
 			print ('Including:', word)
 	#-------------------------------------------------------------------------------#
 	# save and show images
@ -98,20 +97,17 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*erase.jpg')
-print ("Saving to PDF:", outputs)
+print ("Saving to PDF: output/erase-replace/Erase.pdf")
 def makePdf(pdfFileName, listPages, dir = ''):
 	if (dir):
 		dir += "/"
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	pdf = FPDF(unit = "pt", format = [width, height])
 	for page in listPages:
 		pdf.add_page()
 		pdf.image(dir + str(page), 0, 0)
 	pdf.output(dir + pdfFileName + ".pdf", "F")
 makePdf('output/erase-replace/Erase', outputs, dir = '')
--- a/src/replace_leastcommon.py
+++ b/src/replace_leastcommon.py
@ -8,6 +8,14 @@ import glob
 import time
 from fpdf import FPDF
 import os
 import shutil
 path1 = './temp'
 if not os.path.isdir(path1):
   os.makedirs(path1)
   os.makedirs('./temp/crops4')
   os.makedirs('./temp/crops7')
   os.makedirs('./temp/crops_more')
 stopwords.words('english')
 sr = set(stopwords.words('english'))
@ -58,10 +66,8 @@ allwords = []
 scanimg = glob.glob('images-tiff/*.tiff')
 hocr = glob.glob('hocr/*.html')
 num = 0
 maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned 
 # loop through every image in scanimg folder
 for i in scanimg:
 	x = x + 1
@ -70,10 +76,9 @@ for i in scanimg:
 	oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
 	# open corresponding hocr file
 	print ("Analysing", hocr[x]) 
 	f = open(hocr[x])
-	print ('Reading scanned image, filtering least common words.')
+	print ('Reading scanned image and hocr file, filtering least common words.')
-	print ("") 
+	print ('') 
 	t = html5lib.parse(f, namespaceHTMLElements=False)
@ -86,11 +91,15 @@ for i in scanimg:
 	findleastcommon(clean_words) #find least common words and add them to list
 	mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
-	print ("The most common words until text", x+1, "are:", mostcommon_list)
+	print ('The most common words until text', x+1, 'are:', mostcommon_list)
-	
+	print ('') 
 	# loop through every word in hocr file to extract coordinates, then remove or paste into output image
-	for element in t.findall(".//span[@class='ocrx_word']"): 
+	print ('Processing word coordinates and replacing least common words with most common words.')
 	print ('') 
 	for element in t.findall(".//span[@class='ocrx_word']"):
 		word = filternone(element.text)
 		c = coordinates(element.attrib['title']) 
 		num = num + 1
@ -99,17 +108,17 @@ for i in scanimg:
 		#extract coordinates
 		if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
-			wim.save  ("output/erase-replace/crops4/wimreplace{}.png".format(num))
+			wim.save ("temp/crops4/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) <= 7 :
-			wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
+			wim.save ("temp/crops7/wimreplace{}.png".format(num))
 		elif word in mostcommon_list and len(word) > 7 :
-			wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))
+			wim.save ("temp/crops_more/wimreplace{}.png".format(num))
 		if x > 0:
 			# use PIL to crop out every box, then paste it according to if rule
-			randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
+			randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
-			randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
+			randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
-			randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))
+			randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
 			wimreplace4 = Image.open(randomimg4)
 			wimreplace7 = Image.open(randomimg7)
@ -128,23 +137,18 @@ for i in scanimg:
 			elif word.lower() in leastcommon_list and len(word) < 8:
 				oim.paste(out4, (c[0], c[1]))
 				print ('Excluding:', word)
 			elif word.lower() in leastcommon_list and len(word) < 11:
 				oim.paste(out7, (c[0], c[1]))
 				print ('Excluding:', word)	
 			elif word.lower() in leastcommon_list and len(word) > 8:
 				oim.paste(out_more, (c[0], c[1]))
 				print ('Excluding:', word)	
 			else:
 				oim.paste(wim, (c[0], c[1], c[2], c[3]))
 				print ('Including:', word)
 		else:
 			oim.paste(wim, (c[0], c[1], c[2], c[3]))
 			print ('Including:', word)
 	#-------------------------------------------------------------------------------#
@ -155,20 +159,18 @@ for i in scanimg:
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*replace.jpg')
-print ("Saving to PDF:", outputs)
+print ('') 
 print ("Saving to PDF: output/erase-replace/Replace.pdf")
 def makePdf(pdfFileName, listPages, dir = ''):
 	if (dir):
 		dir += "/"
 	cover = Image.open(dir + str(listPages[0]))
 	width, height = cover.size
 	pdf = FPDF(unit = "pt", format = [width, height])
 	for page in listPages:
 		pdf.add_page()
 		pdf.image(dir + str(page), 0, 0)
 	pdf.output(dir + pdfFileName + ".pdf", "F")
 makePdf('output/erase-replace/Replace', outputs, dir = '')
@ -177,16 +179,5 @@ makePdf('output/erase-replace/Replace', outputs, dir = '')
 files = glob.glob('./output/erase-replace/*replace.jpg')
 for f in files:
    os.remove(f)
 files = glob.glob('./output/erase-replace/crops4/*.png')
 for f in files:
    os.remove(f)
 files = glob.glob('./output/erase-replace/crops7/*.png')
 for f in files:
    os.remove(f)
 files = glob.glob('./output/erase-replace/crops_more/*.png')
 for f in files:
    os.remove(f)
 shutil.rmtree('./temp/')