Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNuPo-make

commit 430fd0fca2

@@ -1,5 +1,6 @@
images=$(sort $(wildcard images/*.jpg))
# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
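# one possible (untested) fix using GNU make's filter-out:
# images=$(sort $(filter-out images/listimg.txt,$(wildcard images/*)))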
images-tiff=$(sort $(wildcard images-tiff/*.tiff))
output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp)
space:= $(empty) $(empty)
@@ -34,9 +35,10 @@ dirs: ## create the dirs in working dir
@-mkdir -p images/
@-mkdir -p images-tiff/
@-mkdir -p output/
@-mkdir -p output/erase-replace/
@-mkdir -p ocr/
@-mkdir -p hocr/
@echo $(color_r)'Directories made': images/ output/
@echo $(color_r)'Directories made': ocr/ hocr/ images/ images-tiff/ output/
testif:
@@ -91,10 +93,16 @@ carlandre: ocr/output.txt ## Alice: Creates visual poetry out of a text. Depende
.PHONY: carlandre
# cat $(@) > /dev/usb/lp0
overunder: ocr/output.txt ## Alice: An interpreted language that translates simple weaving instructions into a weaving pattern over text.
@python3 src/overunder.py
.PHONY: overunder
erase: hocrs ## Natasha: Analyzes pages in order, erases the least common words from view. Dependencies: PIL, html5lib, FPDF
python3 src/erase_leastcommon.py
replace: hocrs ## Natasha: Analyzes pages in order, replaces the least common words with the most common ones. Dependencies: PIL, html5lib, FPDF
python3 src/replace_leastcommon.py
visualization: $(images) $(tmpfile) ## Creates a data visualization from images/*.jpg. Dependencies: mplayer
@echo $(tmpfile)

@@ -1,16 +1,33 @@
# OuNuPo Make
Software experiments for the OuNuPo bookscanner. Part of Special Issue #5
Software experiments for the OuNuPo bookscanner, part of Special Issue 5
https://issue.xpub.nl/05/
https://xpub.nl/
## License
## Authors
#Angeliki Diakrousi
Natasha Berting, Angeliki Diakrousi, Joca van der Horst, Alexander Roidl, Alice Strete and Zalán Szakács.
# Install
## Clone Repository
`git clone https://git.xpub.nl/repos/OuNuPo-make.git`
## General dependencies
* Python3
* GNU make
* Python3 NLTK `pip3 install nltk`
* NLTK English Corpus:
  * run the NLTK downloader: `python -m nltk.downloader`
  * select the menu "Corpora"
  * select "stopwords"
  * click "Download"
# Make commands
## N+7 (example) Author
@@ -18,25 +35,24 @@ Description: Replaces every word with the 7th next word in a dictionary.
run: `make N+7`
dependencies:
Specific Dependencies:
* a
* b
* c
## ttssr-human-only Angeliki
## Sitting inside a pocket(sphinx): Angeliki
Description: Speech recognition feedback loops using the first sentence of a scanned text as input
run: `make ttssr-human-only`
dependencies:
*python 3.3
*pocketsphinx
*SpeechRecognition 3.8.1
*PyAudio
Specific Dependencies:
* [pocketsphinx](https://github.com/bambocher/pocketsphinx-python) `sudo pip3 install pocketsphinx` (follow this example)
* SpeechRecognition 3.8.1
* PyAudio
install guide: https://pzwiki.wdka.nl/mediadesign/Speech_recognition
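For reference, a minimal sketch of the recognition step these dependencies provide (not the project's own script; `input.wav` is a hypothetical file name):

```python
# transcribe a WAV file offline with pocketsphinx via SpeechRecognition
import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile('input.wav') as source:  # hypothetical input file
    audio = r.record(source)               # read the entire file into memory
print(r.recognize_sphinx(audio))           # offline CMU Sphinx recognition
```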

@@ -58,10 +58,9 @@ for i in scanimg:
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x])
print ('Reading scanned image, filtering least common words.')
print ("")
print ('')
t = html5lib.parse(f, namespaceHTMLElements=False)
@@ -73,7 +72,9 @@ for i in scanimg:
clean_words = cleanstopwords(allwords) #clean stopwords
findleastcommon(clean_words) #find least common words and add them to list
print ("The least common words until text", x+1, "are:", leastcommon_list)
print ('')
print ('Processing word coordinates and erasing least common words.')
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image
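# (each ocrx_word span carries a title attribute like
#  "bbox 230 89 394 127; x_wconf 95"; the four bbox integers are the word's
#  pixel coordinates in the scan, which coordinates() parses out)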
for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text)
@@ -83,11 +84,9 @@ for i in scanimg:
if word.lower() in leastcommon_list and len(word) < limit:
oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
print ('Excluding:', word)
else:
oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------#
# save and show images
@@ -98,20 +97,17 @@ for i in scanimg:
#-------------------------------------------------------------------------------#
# save images into PDF
outputs = glob.glob('output/erase-replace/*erase.jpg')
print ("Saving to PDF:", outputs)
print ("Saving to PDF: output/erase-replace/Erase.pdf")
def makePdf(pdfFileName, listPages, dir = ''):
if (dir):
dir += "/"
cover = Image.open(dir + str(listPages[0]))
width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height])
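# page size in points is taken from the first image; each subsequent image
# is placed at the top-left corner of its own page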
for page in listPages:
pdf.add_page()
pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Erase', outputs, dir = '')

@@ -8,6 +8,14 @@ import glob
import time
from fpdf import FPDF
import os
import shutil
path1 = './temp'
if not os.path.isdir(path1):
os.makedirs(path1)
os.makedirs('./temp/crops4')
os.makedirs('./temp/crops7')
os.makedirs('./temp/crops_more')
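# temp/ holds word crops bucketed by word length (crops4: short words,
# crops7: medium, crops_more: long); the whole tree is deleted at the end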
stopwords.words('english')
sr = set(stopwords.words('english'))
@@ -58,10 +66,8 @@ allwords = []
scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html')
num = 0
maximum = 20 / len(scanimg) # scales word removal in proportion to the number of scanned pages
# loop through every image in scanimg folder
for i in scanimg:
x = x + 1
@@ -70,10 +76,9 @@ for i in scanimg:
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x])
print ('Reading scanned image, filtering least common words.')
print ("")
print ('Reading scanned image and hocr file, filtering least common words.')
print ('')
t = html5lib.parse(f, namespaceHTMLElements=False)
@@ -86,10 +91,14 @@ for i in scanimg:
findleastcommon(clean_words) #find least common words and add them to list
mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
print ("The most common words until text", x+1, "are:", mostcommon_list)
print ('The most common words until text', x+1, 'are:', mostcommon_list)
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image
print ('Processing word coordinates and replacing least common words with most common words.')
print ('')
for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text)
c = coordinates(element.attrib['title'])
@@ -99,17 +108,17 @@ for i in scanimg:
#extract coordinates
if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num))
wim.save ("temp/crops4/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) <= 7 :
wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
wim.save ("temp/crops7/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) > 7 :
wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))
wim.save ("temp/crops_more/wimreplace{}.png".format(num))
if x > 0:
# use PIL to crop out every box, then paste it according to if rule
randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))
randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
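# pick a random crop of a most-common word from the matching length bucket;
# it will be pasted over the current least-common word below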
wimreplace4 = Image.open(randomimg4)
wimreplace7 = Image.open(randomimg7)
@@ -128,23 +137,18 @@ for i in scanimg:
elif word.lower() in leastcommon_list and len(word) < 8:
oim.paste(out4, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) < 11:
oim.paste(out7, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) > 8:
oim.paste(out_more, (c[0], c[1]))
print ('Excluding:', word)
else:
oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
else:
oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------#
@@ -155,20 +159,18 @@ for i in scanimg:
#-------------------------------------------------------------------------------#
# save images into PDF
outputs = glob.glob('output/erase-replace/*replace.jpg')
print ("Saving to PDF:", outputs)
print ('')
print ("Saving to PDF: output/erase-replace/Replace.pdf")
def makePdf(pdfFileName, listPages, dir = ''):
if (dir):
dir += "/"
cover = Image.open(dir + str(listPages[0]))
width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height])
for page in listPages:
pdf.add_page()
pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Replace', outputs, dir = '')
@@ -178,15 +180,4 @@ files = glob.glob('./output/erase-replace/*replace.jpg')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops4/*.png')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops7/*.png')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops_more/*.png')
for f in files:
os.remove(f)
shutil.rmtree('./temp/')
