Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNuPo-make

master^2
jvdhorst 7 years ago
commit cc9c5b39ac

3
.gitignore vendored

@ -1,4 +1,5 @@
images/**
output/**
src/index.json
.DS_Store
.DS_Store
src/**.wav

@ -1,6 +1,7 @@
images=$(sort $(wildcard images/*.jpg))
images=$(sort $(wildcard images/*))
# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
images-tiff=$(sort $(wildcard images-tiff/*.tiff))
input-hocr=$(sort $(wildcard hocr/*))
output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp)
space:= $(empty) $(empty)
@ -25,7 +26,6 @@ help:
# CLEAN
clean: ## removes output (target) files
rm ocr/output.txt
rm $(wildcard output/*)
rm $(tmpfile)
@ -59,7 +59,7 @@ tiffs: ## convert images/ to images-tiff/ Depends on IM
echo $(images)
for i in $(images); \
do tiff=`basename $$i .jpg`.tiff; \
convert -density 300 $$i -alpha on images-tiff/$$tiff; \
convert -density 300 $$i -colorspace RGB -type truecolor -alpha on images-tiff/$$tiff; \
echo $$tiff; \
done;
@ -99,11 +99,15 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
@python3 src/overunder.py
.PHONY: overunder
erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
erase: tiffs hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
python3 src/erase_leastcommon.py
rm $(input-hocr)
rm $(images-tiff)
replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
replace:tiffs hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
python3 src/replace_leastcommon.py
rm $(input-hocr)
rm $(images-tiff)
visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
@echo $(tmpfile)

@ -20,18 +20,12 @@ Natasha Berting, Angeliki Diakrousi, Joca van der Horst, Alexander Roidl, Alice
* Python3
* GNU make
* Python3 NLTK `pip3 install nltk`
* NLTK English Corpus:
* run NLTK downloader `python -m nltk.downloader`
* select menu "Corpora"
* select "stopwords"
* "Dowload"
# Make commands
## N+7 (example) Author
Description: Replaces every word with the 7th next word in a dictionary.
Description: Replaces every noun with the 7th next noun in a dictionary. Inspired by an Oulipo work of the same name.
run: `make N+7`
@ -48,14 +42,13 @@ run: `make ttssr-human-only`
Specific Dependencies:
* PocketSphinx pacakge `sudo aptitude install pocketsphinx pocketsphinx-en-us`
Python Libaries:
* PocketSphinx: `sudo pip3 install PocketSphinx`, install dependencies: `sudo apt-get install gcc automake autoconf libtool bison swig python-dev libpulse-dev`
* PocketSphinx package `sudo aptitude install pocketsphinx pocketsphinx-en-us`
* PocketSphinx: `sudo pip3 install PocketSphinx`
* Python Libraries:`sudo apt-get install gcc automake autoconf libtool bison swig python-dev libpulse-dev`
* Speech Recognition: `sudo pip3 install SpeechRecognition`
* TermColor: `sudo pip3 install termcolor`
* PyAudio: `pip3 install pyaudio`
## Reading the Structure: Joca
Description: Uses OCR'ed text as an input, labels each word for Part-of-Speech, stopwords and sentiment. Then it generates a reading interface
where words with a specific label are hidden. Output can be saved as poster, or exported as json featuring the full data set.
@ -72,3 +65,48 @@ Specific Dependencies:
* jinja2 (http://jinja.pocoo.org/docs/2.10/intro/#installation)
* font: PT Sans (os font https://www.fontsquirrel.com/fonts/pt-serif)
* font: Ubuntu Mono (os font https://www.fontsquirrel.com/fonts/ubuntu-mono)
## Erase / Replace: Natasha
Description: Receives your scanned pages in order, then analyzes each image and its vocabulary. Finds and crops the least common words, and either erases them, or replaces them with the most common words. Outputs a PDF of increasingly distorted scan images.
for erase script run: `make erase`
for replace script run: `make replace`
Specific Dependencies:
* NLTK English Corpus:
* run NLTK downloader `python -m nltk.downloader`
* select menu "Corpora"
* select "stopwords"
* "Download"
* Python Image Library (PIL): `pip3 install Pillow`
* PDF generation for Python (FPDF): `pip3 install fpdf`
* HTML5lib: `pip3 install html5lib`
Notes & Bugs:
This script is very picky about the input images it can work with. For best results, please use high resolution images in RGB colorspace. Errors can occur when image modes do not match or tesseract cannot successfully make HOCR files.
## carlandre: Alice
Description: Generates concrete poetry from a text file. If you're connected to a printer located in /dev/usb/lp0 you can print the poem.
run: make carlandre
Dependencies:
* pytest (Documentation: https://docs.pytest.org/en/latest/getting-started.html)
## over/under: Alice
Description: Interpreted programming language written in Python3 which translates basic weaving instructions into code and applies them to text.
run: make overunder
Instructions:
over/under works with specific commands which execute specific instructions.
When running, an interpreter will open:
>
To load your text, type 'load'. This is necessary before any other instructions. Every time you load the text, the previous instructions will be discarded.
To see the line you are currently on, type 'show'.
To start your pattern, type 'over' or 'under', each followed by an integer, separated by a comma.
e.g. over 5, under 5, over 6, under 10
To move on to the next line of text, press enter twice.
To see your pattern, type 'pattern'.
To save your pattern in a text file, type 'save'.
To leave the program, type 'quit'.

@ -1,3 +1,3 @@
images/0029.jpg
images/0012.tif

@ -1,21 +0,0 @@
Write it down quickly
before I forget
in the car with D. and N.
cutting across Americas seasons
muggy sunlight in Santa Barbara
wet snow in Denver
and in every Best Western hotel
the TVs flickering light
on her dear sleeping face
like a young girl once again
but writing down the words
alters what I want to remember
that which had no words
was a living breathing image
so now I have two versions of the same
today I can superimpose them
but tomorrow when Im gone
only the words are left
signs evoking something
that no eye sees any more

@ -118,7 +118,8 @@ if os.path.exists(my_path):
"init_printer": "\x1B\x40",
'papercut':'\x1D\x56\x00',
}
emptylines= "\n\n\n\n"
print(escpos['init_printer'])
print(joined_list)
print(emptylines)

@ -23,7 +23,7 @@ def findleastcommon(list):
fdist = FreqDist(word.lower() for word in list)
leastcommon = fdist.most_common()
for i in leastcommon:
if (i[1] <= limit):
if (i[1] <= 1):
leastcommon_list.append(i[0])
return leastcommon_list

@ -77,6 +77,7 @@ def eval(cmds):
print('\n'.join(pattern))
elif cmd[0] == 'save':
pattern = text[0:line_number + 1]
pattern_file = open('output/patternfile.txt', 'w')
pattern_file.write('\n'.join(pattern))
pattern_file.close()

@ -40,7 +40,7 @@ def findleastcommon(list):
fdist = FreqDist(word.lower() for word in list)
leastcommon = fdist.most_common()
for i in leastcommon:
if (i[1] <= limit):
if (i[1] <= 1):
leastcommon_list.append(i[0])
return leastcommon_list
@ -92,6 +92,8 @@ for i in scanimg:
mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
print ('The most common words until text', x+1, 'are:', mostcommon_list)
print ('The least common words until text', x+1, 'are:', leastcommon_list)
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image
@ -128,11 +130,11 @@ for i in scanimg:
wimcolor7 = Image.new('RGBA', wimreplace7.size, (250, 230, 0, 90))
wimcolor_more = Image.new('RGBA', wimreplace_more.size, (250, 230, 0, 90))
out4 = Image.alpha_composite(wimreplace4, wimcolor4)
out7 = Image.alpha_composite(wimreplace7, wimcolor7)
out_more = Image.alpha_composite(wimreplace_more, wimcolor_more)
out4 = Image.alpha_composite(wimreplace4.convert('RGBA'), wimcolor4)
out7 = Image.alpha_composite(wimreplace7.convert('RGBA'), wimcolor7)
out_more = Image.alpha_composite(wimreplace_more.convert('RGBA'), wimcolor_more)
if word.lower() in leastcommon_list and len(word) <= limit:
if word.lower() in leastcommon_list and len(word) <= 3:
oim.paste(wim, (c[0], c[1], c[2], c[3]))
elif word.lower() in leastcommon_list and len(word) < 8:

@ -5,9 +5,9 @@ head -n 1 $1 > output/input0.txt
while [[ $i -le 10 ]]
do echo $i
cat output/input$i.txt
python3 src/write_audio.py src/sound$i.wav 2> /dev/null
python3 src/ttssr_write_audio.py src/sound$i.wav 2> /dev/null
play src/sound$i.wav repeat 5 2> /dev/null & #in the background the sound, without it all the sounds play one by one//2 is stderr
python3 src/audio_transcribe.py sound$i.wav > output/input$((i+1)).txt 2> /dev/null
python3 src/ttssr_transcribe.py sound$i.wav > output/input$((i+1)).txt 2> /dev/null
sleep 1
(( i++ ))
done

Loading…
Cancel
Save