diff --git a/Makefile b/Makefile
index c0c31d5..235edad 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 images=$(sort $(wildcard images/*.jpg)) # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
+images-tiff=$(sort $(wildcard images-tiff/*.tiff))
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@@ -34,9 +35,10 @@ dirs: ## create the dirs in working dir
 	@-mkdir -p images/
 	@-mkdir -p images-tiff/
 	@-mkdir -p output/
+	@-mkdir -p output/erase-replace/
 	@-mkdir -p ocr/
 	@-mkdir -p hocr/
-	@echo $(color_r)'Directories made': images/ output/
+	@echo $(color_r)'Directories made': ocr/ hocr/ images/ images-tiff/ output/
 
 testif:
@@ -91,10 +93,16 @@ carlandre: ocr/output.txt ## Alice: Creates visual poetry out of a text. Depende
 .PHONY: carlandre
 # cat $(@) > /dev/usb/lp0
+
 overunder: ocr/output.txt ## Alice: An interpreted language that translates simple weaving instructions and creates a weaving pattern on text.
 	@python3 src/overunder.py
 .PHONY: overunder
 
+erase: hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
+	python3 src/erase_leastcommon.py
+
+replace: hocrs ## Natasha: Analyzes pages in order, replaces least common words with most common words. Dependencies: PIL, html5lib, FPDF
+	python3 src/replace_leastcommon.py
 
 visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
 	@echo $(tmpfile)
diff --git a/README b/README
index 1bd6563..bea5d99 100644
--- a/README
+++ b/README
@@ -1,16 +1,33 @@
 # OuNuPo Make
-Software experiments for the OuNuPo bookscanner. Part of Special Issue #5
+Software experiments for the OuNuPo bookscanner, part of Special Issue 5
+
+https://issue.xpub.nl/05/
+
+https://xpub.nl/
+
 
 ## License
 
 ## Authors
-#Angeliki Diakrousi
+Natasha Berting, Angeliki Diakrousi, Joca van der Horst, Alexander Roidl, Alice Strete and Zalán Szakács.
 
-# Install
+## Clone Repository
 `git clone https://git.xpub.nl/repos/OuNuPo-make.git`
 
+## General dependencies
+* Python3
+* GNU make
+* Python3 NLTK `pip3 install nltk`
+* NLTK English Corpus:
+  * run the NLTK downloader `python -m nltk.downloader`
+  * select menu "Corpora"
+  * select "stopwords"
+  * "Download"
+
+
 # Make commands
 
 ## N+7 (example) Author
@@ -18,25 +35,24 @@ Description: Replaces every word with the 7th next word in a dictionary.
 run: `make N+7`
 
-dependencies:
+Specific Dependencies:
 * a
 * b
 * c
 
-## ttssr-human-only Angeliki
+## Sitting inside a pocket(sphinx): Angeliki
 Description: Speech recognition feedback loops using the first sentence of a scanned text as input
 
 run: `make ttssr-human-only`
 
-dependencies:
-*python 3.3
-*pocketsphinx
-*SpeechRecognition 3.8.1
-*PyAudio
+Specific Dependencies:
+* [pocketsphinx](https://github.com/bambocher/pocketsphinx-python) `sudo pip3 install pocketsphinx` ---> FOLLOW THIS EXAMPLE
+* SpeechRecognition 3.8.1
+* PyAudio
+
+
-install:
-`https://pzwiki.wdka.nl/mediadesign/Speech_recognition`
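The corpus step in the new "General dependencies" section can also be done non-interactively. A minimal sketch, assuming Python3 and NLTK are already installed, that fetches only the corpus these scripts need:

```python
# Non-interactive alternative to the NLTK downloader menu: fetch only the
# "stopwords" corpus that the OuNuPo scripts use.
import nltk

nltk.download('stopwords')

# sanity check that the corpus is now available
from nltk.corpus import stopwords
print(stopwords.words('english')[:5])
```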
diff --git a/src/erase_leastcommon.py b/src/erase_leastcommon.py
index 3e7b69b..825bfda 100644
--- a/src/erase_leastcommon.py
+++ b/src/erase_leastcommon.py
@@ -57,11 +57,10 @@ for i in scanimg:
     iim = Image.open(i) # iim is initial image
     oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
 
-    # open corresponding hocr file
-    print ("Analysing", hocr[x])
+    # open corresponding hocr file
     f = open(hocr[x])
     print ('Reading scanned image, filtering least common words.')
-    print ("")
+    print ('')
 
     t = html5lib.parse(f, namespaceHTMLElements=False)
 
@@ -73,7 +72,9 @@ for i in scanimg:
     clean_words = cleanstopwords(allwords) #clean stopwords
     findleastcommon(clean_words) #find least common words and add them to list
     print ("The least common words until text", x+1, "are:", leastcommon_list)
-
+    print ('')
+    print ('Processing word coordinates and erasing least common words.')
+    print ('')
     # loop through every word in hocr file to extract coordinates, then remove or paste into output image
     for element in t.findall(".//span[@class='ocrx_word']"):
         word = filternone(element.text)
@@ -83,11 +84,9 @@ for i in scanimg:
 
         if word.lower() in leastcommon_list and len(word) < limit:
             oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
-            print ('Excluding:', word)
 
         else:
             oim.paste(wim, (c[0], c[1], c[2], c[3]))
-            print ('Including:', word)
 
 #-------------------------------------------------------------------------------#
 # save and show images
@@ -98,20 +97,17 @@
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*erase.jpg')
-print ("Saving to PDF:", outputs)
+print ("Saving to PDF: output/erase-replace/Erase.pdf")
 
 def makePdf(pdfFileName, listPages, dir = ''):
     if (dir):
         dir += "/"
-
     cover = Image.open(dir + str(listPages[0]))
     width, height = cover.size
     pdf = FPDF(unit = "pt", format = [width, height])
-
     for page in listPages:
         pdf.add_page()
         pdf.image(dir + str(page), 0, 0)
-
     pdf.output(dir + pdfFileName + ".pdf", "F")
 
 makePdf('output/erase-replace/Erase', outputs, dir = '')
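For orientation, the erase technique above boils down to reading each word's bounding box from the hOCR `title` attribute and pasting the word's crop onto a blank page only when the word is kept. A minimal self-contained sketch of that idea (file paths and the `words_to_erase` set are hypothetical; the real script derives the list from NLTK frequency counts):

```python
# Sketch of the hOCR-driven erase step: read word bounding boxes from an
# hOCR file, paste kept words onto a blank page, skip erased ones.
# The input paths and words_to_erase are hypothetical examples.
import html5lib
from PIL import Image

page = Image.open('images-tiff/page1.tiff').convert('RGB')
out = Image.new('RGB', page.size, (255, 255, 255))
tree = html5lib.parse(open('hocr/page1.html'), namespaceHTMLElements=False)
words_to_erase = {'example', 'words'}

for span in tree.findall(".//span[@class='ocrx_word']"):
    if span.text is None:
        continue
    # an hOCR title attribute looks like: "bbox 393 532 584 574; x_wconf 95"
    x0, y0, x1, y1 = (int(v) for v in span.attrib['title'].split(';')[0].split()[1:5])
    if span.text.lower() not in words_to_erase:
        out.paste(page.crop((x0, y0, x1, y1)), (x0, y0, x1, y1))

out.save('output/erase-replace/page1_erase.jpg')
```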
diff --git a/src/replace_leastcommon.py b/src/replace_leastcommon.py
index b596d2a..9cc61b8 100644
--- a/src/replace_leastcommon.py
+++ b/src/replace_leastcommon.py
@@ -8,6 +8,14 @@ import glob
 import time
 from fpdf import FPDF
 import os
+import shutil
+
+path1 = './temp'
+if not os.path.isdir(path1):
+    os.makedirs(path1)
+    os.makedirs('./temp/crops4')
+    os.makedirs('./temp/crops7')
+    os.makedirs('./temp/crops_more')
 
 stopwords.words('english')
 sr = set(stopwords.words('english'))
@@ -58,10 +66,8 @@ allwords = []
 scanimg = glob.glob('images-tiff/*.tiff')
 hocr = glob.glob('hocr/*.html')
 num = 0
-
 maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
-
 # loop through every image in scanimg folder
 for i in scanimg:
     x = x + 1
@@ -70,10 +76,9 @@ for i in scanimg:
     oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
 
     # open corresponding hocr file
-    print ("Analysing", hocr[x])
     f = open(hocr[x])
-    print ('Reading scanned image, filtering least common words.')
-    print ("")
+    print ('Reading scanned image and hocr file, filtering least common words.')
+    print ('')
 
     t = html5lib.parse(f, namespaceHTMLElements=False)
@@ -86,11 +91,15 @@ for i in scanimg:
     findleastcommon(clean_words) #find least common words and add them to list
     mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
 
-    print ("The most common words until text", x+1, "are:", mostcommon_list)
-
+    print ('The most common words until text', x+1, 'are:', mostcommon_list)
+    print ('')
+
     # loop through every word in hocr file to extract coordinates, then remove or paste into output image
-    for element in t.findall(".//span[@class='ocrx_word']"):
+    print ('Processing word coordinates and replacing least common words with most common words.')
+    print ('')
+
+    for element in t.findall(".//span[@class='ocrx_word']"):
         word = filternone(element.text)
         c = coordinates(element.attrib['title'])
         num = num + 1
@@ -99,17 +108,17 @@
 
         #extract coordinates
         if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
-            wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num))
+            wim.save ("temp/crops4/wimreplace{}.png".format(num))
         elif word in mostcommon_list and len(word) <= 7 :
-            wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
+            wim.save ("temp/crops7/wimreplace{}.png".format(num))
         elif word in mostcommon_list and len(word) > 7 :
-            wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))
+            wim.save ("temp/crops_more/wimreplace{}.png".format(num))
 
         if x > 0: # use PIL to crop out every box, then paste it according to if rule
-            randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
-            randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
-            randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))
+            randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
+            randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
+            randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
 
             wimreplace4 = Image.open(randomimg4)
             wimreplace7 = Image.open(randomimg7)
@@ -128,23 +137,18 @@ for i in scanimg:
 
         elif word.lower() in leastcommon_list and len(word) < 8:
             oim.paste(out4, (c[0], c[1]))
-            print ('Excluding:', word)
 
         elif word.lower() in leastcommon_list and len(word) < 11:
             oim.paste(out7, (c[0], c[1]))
-            print ('Excluding:', word)
 
         elif word.lower() in leastcommon_list and len(word) > 8:
             oim.paste(out_more, (c[0], c[1]))
-            print ('Excluding:', word)
 
         else:
             oim.paste(wim, (c[0], c[1], c[2], c[3]))
-            print ('Including:', word)
 
     else:
         oim.paste(wim, (c[0], c[1], c[2], c[3]))
-        print ('Including:', word)
 
 
 #-------------------------------------------------------------------------------#
@@ -155,20 +159,18 @@ for i in scanimg:
 
 #-------------------------------------------------------------------------------#
 # save images into PDF
 outputs = glob.glob('output/erase-replace/*replace.jpg')
-print ("Saving to PDF:", outputs)
+print ('')
+print ("Saving to PDF: output/erase-replace/Replace.pdf")
 
 def makePdf(pdfFileName, listPages, dir = ''):
     if (dir):
         dir += "/"
-
     cover = Image.open(dir + str(listPages[0]))
     width, height = cover.size
     pdf = FPDF(unit = "pt", format = [width, height])
-
     for page in listPages:
         pdf.add_page()
         pdf.image(dir + str(page), 0, 0)
-
     pdf.output(dir + pdfFileName + ".pdf", "F")
 
 makePdf('output/erase-replace/Replace', outputs, dir = '')
@@ -177,16 +179,5 @@ makePdf('output/erase-replace/Replace', outputs, dir = '')
 
 files = glob.glob('./output/erase-replace/*replace.jpg')
 for f in files:
     os.remove(f)
-
-files = glob.glob('./output/erase-replace/crops4/*.png')
-for f in files:
-    os.remove(f)
-
-files = glob.glob('./output/erase-replace/crops7/*.png')
-for f in files:
-    os.remove(f)
-
-files = glob.glob('./output/erase-replace/crops_more/*.png')
-for f in files:
-    os.remove(f)
+shutil.rmtree('./temp/')
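The replace script reuses the same bounding-box logic as the erase script, but instead of leaving a gap it pastes a randomly chosen crop of a most common word, scaled to the target box. A sketch of just that paste, assuming crops were already saved to temp/crops4 as the diff above does (the page size and bbox here are hypothetical):

```python
# Sketch of the replacement paste: scale a random crop of a common word
# into the bounding box of the word being replaced.
import glob
import random
from PIL import Image

oim = Image.new('RGB', (1240, 1754), (255, 255, 255))  # hypothetical output page
candidates = glob.glob('temp/crops4/*.png')            # crops saved earlier in the run
crop = Image.open(random.choice(candidates))

x0, y0, x1, y1 = 393, 532, 584, 574                    # hypothetical target bbox
resized = crop.resize((x1 - x0, y1 - y0))
oim.paste(resized, (x0, y0))
```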
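One design note on the new temp workspace: creating ./temp by hand and deleting it with shutil.rmtree works, but the directory lingers if the script crashes midway. A possible variant, not what the script does now, would let the standard library manage cleanup:

```python
# Possible alternative to the hand-rolled ./temp workspace: TemporaryDirectory
# creates a unique directory and removes it even if an exception is raised.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    for bucket in ('crops4', 'crops7', 'crops_more'):
        os.makedirs(os.path.join(tmp, bucket))
    # ...save and load the word crops under tmp instead of './temp'...
# on exit the whole tree is gone; no shutil.rmtree call needed
```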