Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNuPo-make

master
ange 7 years ago
commit 430fd0fca2

@ -1,5 +1,6 @@
images=$(sort $(wildcard images/*.jpg)) images=$(sort $(wildcard images/*.jpg))
# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
images-tiff=$(sort $(wildcard images-tiff/*.tiff))
output_ocr:=$(dir_ocr)/output.txt output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp) tmpfile:= $(shell mktemp)
space:= $(empty) $(empty) space:= $(empty) $(empty)
@ -34,9 +35,10 @@ dirs: ## create the dirs in working dir
@-mkdir -p images/ @-mkdir -p images/
@-mkdir -p images-tiff/ @-mkdir -p images-tiff/
@-mkdir -p output/ @-mkdir -p output/
@-mkdir -p output/erase-replace/
@-mkdir -p ocr/ @-mkdir -p ocr/
@-mkdir -p hocr/ @-mkdir -p hocr/
@echo $(color_r)'Directories made': images/ output/ @echo $(color_r)'Directories made': ocr/ hocr/ images/ images-tiff/ output/
testif: testif:
@ -91,10 +93,16 @@ carlandre: ocr/output.txt ## Alice: Creates visual poetry out of a text. Depende
.PHONY: carlandre .PHONY: carlandre
# cat $(@) > /dev/usb/lp0 # cat $(@) > /dev/usb/lp0
overunder: ocr/output.txt ## Alice: An interpreted language that translate simple weaving instructions and creates a weaving pattern on text. overunder: ocr/output.txt ## Alice: An interpreted language that translate simple weaving instructions and creates a weaving pattern on text.
@python3 src/overunder.py @python3 src/overunder.py
.PHONY: overunder .PHONY: overunder
erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
python3 src/erase_leastcommon.py
replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
python3 src/replace_leastcommon.py
visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
@echo $(tmpfile) @echo $(tmpfile)

@ -1,16 +1,33 @@
# OuNuPo Make # OuNuPo Make
Software experiments for the OuNuPo bookscanner. Part of Special Issue #5 Software experiments for the OuNuPo bookscanner, part of Special Issue 5
https://issue.xpub.nl/05/
https://xpub.nl/
## License ## License
## Authors ## Authors
#Angeliki Diakrousi Natasha Berting, Angeliki Diakrousi, Joca van der Horst, Alexander Roidl, Alice Strete and Zalán Szakács.
# Install ## Clone Repository
`git clone https://git.xpub.nl/repos/OuNuPo-make.git` `git clone https://git.xpub.nl/repos/OuNuPo-make.git`
## General dependencies
* Python3
* GNU make
* Python3 NLTK `pip3 install nltk`
* NLTK English Corpus:
* run NLTK downloader `python -m nltk.downloader`
* select menu "Corpora"
* select "stopwords"
* "Download"
# Make commands # Make commands
## N+7 (example) Author ## N+7 (example) Author
@ -18,25 +35,24 @@ Description: Replaces every word with the 7th next word in a dictionary.
run: `make N+7` run: `make N+7`
dependencies: Specific Dependencies:
* a * a
* b * b
* c * c
## ttssr-human-only Angeliki ## Sitting inside a pocket(sphinx): Angeliki
Description: Speech recognition feedback loops using the first sentence of a scanned text as input Description: Speech recognition feedback loops using the first sentence of a scanned text as input
run: `make ttssr-human-only` run: `make ttssr-human-only`
dependencies: Specific Dependencies:
*python 3.3 * [pocketsphinx](https://github.com/bambocher/pocketsphinx-python) `sudo pip3 install pocketsphinx` ---> FOLLOW THIS EXAMPLE
*pocketsphinx * SpeechRecognition 3.8.1
*SpeechRecognition 3.8.1 * PyAudio
*PyAudio
install:
`https://pzwiki.wdka.nl/mediadesign/Speech_recognition`

@ -57,11 +57,10 @@ for i in scanimg:
iim = Image.open(i) # iim is initial image iim = Image.open(i) # iim is initial image
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file # open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x]) f = open(hocr[x])
print ('Reading scanned image, filtering least common words.') print ('Reading scanned image, filtering least common words.')
print ("") print ('')
t = html5lib.parse(f, namespaceHTMLElements=False) t = html5lib.parse(f, namespaceHTMLElements=False)
@ -73,7 +72,9 @@ for i in scanimg:
clean_words = cleanstopwords(allwords) #clean stopwords clean_words = cleanstopwords(allwords) #clean stopwords
findleastcommon(clean_words) #find least common words and add them to list findleastcommon(clean_words) #find least common words and add them to list
print ("The least common words until text", x+1, "are:", leastcommon_list) print ("The least common words until text", x+1, "are:", leastcommon_list)
print ('')
print ('Processing word coordinates and erasing least common words.')
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image # loop through every word in hocr file to extract coordinates, then remove or paste into output image
for element in t.findall(".//span[@class='ocrx_word']"): for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text) word = filternone(element.text)
@ -83,11 +84,9 @@ for i in scanimg:
if word.lower() in leastcommon_list and len(word) < limit: if word.lower() in leastcommon_list and len(word) < limit:
oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3])) oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
print ('Excluding:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save and show images # save and show images
@ -98,20 +97,17 @@ for i in scanimg:
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save images into PDF # save images into PDF
outputs = glob.glob('output/erase-replace/*erase.jpg') outputs = glob.glob('output/erase-replace/*erase.jpg')
print ("Saving to PDF:", outputs) print ("Saving to PDF: output/erase-replace/Erase.pdf")
def makePdf(pdfFileName, listPages, dir = ''): def makePdf(pdfFileName, listPages, dir = ''):
if (dir): if (dir):
dir += "/" dir += "/"
cover = Image.open(dir + str(listPages[0])) cover = Image.open(dir + str(listPages[0]))
width, height = cover.size width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height]) pdf = FPDF(unit = "pt", format = [width, height])
for page in listPages: for page in listPages:
pdf.add_page() pdf.add_page()
pdf.image(dir + str(page), 0, 0) pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F") pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Erase', outputs, dir = '') makePdf('output/erase-replace/Erase', outputs, dir = '')

@ -8,6 +8,14 @@ import glob
import time import time
from fpdf import FPDF from fpdf import FPDF
import os import os
import shutil
path1 = './temp'
if not os.path.isdir(path1):
os.makedirs(path1)
os.makedirs('./temp/crops4')
os.makedirs('./temp/crops7')
os.makedirs('./temp/crops_more')
stopwords.words('english') stopwords.words('english')
sr = set(stopwords.words('english')) sr = set(stopwords.words('english'))
@ -58,10 +66,8 @@ allwords = []
scanimg = glob.glob('images-tiff/*.tiff') scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html') hocr = glob.glob('hocr/*.html')
num = 0 num = 0
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
# loop through every image in scanimg folder # loop through every image in scanimg folder
for i in scanimg: for i in scanimg:
x = x + 1 x = x + 1
@ -70,10 +76,9 @@ for i in scanimg:
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file # open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x]) f = open(hocr[x])
print ('Reading scanned image, filtering least common words.') print ('Reading scanned image and hocr file, filtering least common words.')
print ("") print ('')
t = html5lib.parse(f, namespaceHTMLElements=False) t = html5lib.parse(f, namespaceHTMLElements=False)
@ -86,11 +91,15 @@ for i in scanimg:
findleastcommon(clean_words) #find least common words and add them to list findleastcommon(clean_words) #find least common words and add them to list
mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
print ("The most common words until text", x+1, "are:", mostcommon_list) print ('The most common words until text', x+1, 'are:', mostcommon_list)
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image # loop through every word in hocr file to extract coordinates, then remove or paste into output image
for element in t.findall(".//span[@class='ocrx_word']"): print ('Processing word coordinates and replacing least common words with most common words.')
print ('')
for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text) word = filternone(element.text)
c = coordinates(element.attrib['title']) c = coordinates(element.attrib['title'])
num = num + 1 num = num + 1
@ -99,17 +108,17 @@ for i in scanimg:
#extract coordinates #extract coordinates
if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5: if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num)) wim.save ("temp/crops4/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) <= 7 : elif word in mostcommon_list and len(word) <= 7 :
wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num)) wim.save ("temp/crops7/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) > 7 : elif word in mostcommon_list and len(word) > 7 :
wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num)) wim.save ("temp/crops_more/wimreplace{}.png".format(num))
if x > 0: if x > 0:
# use PIL to crop out every box, then paste it according to if rule # use PIL to crop out every box, then paste it according to if rule
randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png')) randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png')) randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png')) randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
wimreplace4 = Image.open(randomimg4) wimreplace4 = Image.open(randomimg4)
wimreplace7 = Image.open(randomimg7) wimreplace7 = Image.open(randomimg7)
@ -128,23 +137,18 @@ for i in scanimg:
elif word.lower() in leastcommon_list and len(word) < 8: elif word.lower() in leastcommon_list and len(word) < 8:
oim.paste(out4, (c[0], c[1])) oim.paste(out4, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) < 11: elif word.lower() in leastcommon_list and len(word) < 11:
oim.paste(out7, (c[0], c[1])) oim.paste(out7, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) > 8: elif word.lower() in leastcommon_list and len(word) > 8:
oim.paste(out_more, (c[0], c[1])) oim.paste(out_more, (c[0], c[1]))
print ('Excluding:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
@ -155,20 +159,18 @@ for i in scanimg:
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save images into PDF # save images into PDF
outputs = glob.glob('output/erase-replace/*replace.jpg') outputs = glob.glob('output/erase-replace/*replace.jpg')
print ("Saving to PDF:", outputs) print ('')
print ("Saving to PDF: output/erase-replace/Replace.pdf")
def makePdf(pdfFileName, listPages, dir = ''): def makePdf(pdfFileName, listPages, dir = ''):
if (dir): if (dir):
dir += "/" dir += "/"
cover = Image.open(dir + str(listPages[0])) cover = Image.open(dir + str(listPages[0]))
width, height = cover.size width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height]) pdf = FPDF(unit = "pt", format = [width, height])
for page in listPages: for page in listPages:
pdf.add_page() pdf.add_page()
pdf.image(dir + str(page), 0, 0) pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F") pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Replace', outputs, dir = '') makePdf('output/erase-replace/Replace', outputs, dir = '')
@ -177,16 +179,5 @@ makePdf('output/erase-replace/Replace', outputs, dir = '')
files = glob.glob('./output/erase-replace/*replace.jpg') files = glob.glob('./output/erase-replace/*replace.jpg')
for f in files: for f in files:
os.remove(f) os.remove(f)
files = glob.glob('./output/erase-replace/crops4/*.png')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops7/*.png')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops_more/*.png')
for f in files:
os.remove(f)
shutil.rmtree('./temp/')

Loading…
Cancel
Save