images = $(sort $(wildcard images/*))
# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
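# A possible fix, sketched here as an untested assumption: drop the list file with filter-out, e.g.
# images = $(sort $(filter-out images/listimg.txt, $(wildcard images/*)))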
images-tiff = $(sort $(wildcard images-tiff/*.tiff))
input-hocr = $(sort $(wildcard hocr/*))
output_ocr := $(dir_ocr)/output.txt
tmpfile := $(shell mktemp)
space := $(empty) $(empty)
newline := '\n'
listimgs := $(subst $(space),$(newline),$(images)) # list of the images, one filename per line: $(subst $(delimiter),$(replacement),$(list))
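# Sketch of what the substitution produces, with hypothetical filenames, assuming /bin/sh's echo
# expands \n escapes (as dash's does) when list.txt is written further down:
#   $(subst $(space),$(newline),img1.jpg img2.jpg) -> img1.jpg'\n'img2.jpg
#   echo img1.jpg'\n'img2.jpg > list.txt   # one filename per line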
OS := $(shell uname)
# Colors: add color to output, e.g. @echo $(color_r) output text
color_w := "\033[0;29m"
color_r := "\033[0;31m"
color_g := "\033[0;32m"
color_b := "\033[0;34m"
# HELP / SELF DOCUMENTATION
# Rules whose first line carries a comment with two # signs are listed by `make help` (see the clean rule for an example)
# The help rule runs by default when you just type: make
.DEFAULT_GOAL := help
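# Documented-rule format picked up by the grep below (hypothetical target, shown only as a comment):
# some-target: prerequisites ## one-line description printed by `make help`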
.PHONY: help
help:
	@grep -E '^[a-zA-Z0-9_/.-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
# CLEAN
clean: ## removes output (target) files
	rm ocr/output.txt
	rm $(tmpfile)
# ADMINISTRATIVE RECIPES
dirs: ## create the dirs in the working dir
	@-mkdir -p images/
	@-mkdir -p images-tiff/
	@-mkdir -p output/
	@-mkdir -p output/erase-replace/
	@-mkdir -p ocr/
	@-mkdir -p hocr/
	@echo $(color_r)'Directories made: ocr/ hocr/ images/ images-tiff/ output/'
# POST-PROCESSING RECIPES
ocr/output.txt : ## ocr with tesseract
	echo $(listimgs) > $(@D)/list.txt
	@echo $(basename $@)
	tesseract $(@D)/list.txt $(basename $@)
	python3 src/build_database.py $(@)
tiffs: ## convert images/ to images-tiff/ Dependencies: ImageMagick
	echo $(images)
	for i in $(images); \
	do tiff=`basename $$i .jpg`.tiff; \
	convert -density 300 $$i -colorspace RGB -type truecolor -alpha on images-tiff/$$tiff; \
	echo $$tiff; \
	done;
hocrs: ## hocr with tesseract and then change extension to .html
	for i in images-tiff/*.tiff; \
	do echo $$i; hocrfile=`basename $$i .tiff`; \
	tesseract $$i hocr/$$hocrfile hocr; \
	mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html; \
	done;
# OUTPUT GENERATION RECIPES
reading_structure: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2, weasyprint
	mkdir -p output/reading_structure
	cp src/reading_structure/jquery.min.js output/reading_structure
	cp src/reading_structure/script.js output/reading_structure
	cp src/reading_structure/style.css output/reading_structure
	cat $< | python3 src/reading_structure/reading_structure.py
	weasyprint -s src/reading_structure/print-noun.css output/reading_structure/index.html output/reading_structure/poster_noun.pdf
	weasyprint -s src/reading_structure/print-adv.css output/reading_structure/index.html output/reading_structure/poster_adv.pdf
	weasyprint -s src/reading_structure/print-dppt.css output/reading_structure/index.html output/reading_structure/poster_dppt.pdf
	weasyprint -s src/reading_structure/print-stopword.css output/reading_structure/index.html output/reading_structure/poster_stopword.pdf
	weasyprint -s src/reading_structure/print-neutral.css output/reading_structure/index.html output/reading_structure/poster_neutral.pdf
	weasyprint -s src/reading_structure/print-entity.css output/reading_structure/index.html output/reading_structure/poster_named_entities.pdf
	x-www-browser output/reading_structure/index.html
output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot
	cat $< | python3 src/textbotconversation.py $(@)
output/n7.txt: ocr/output.txt ## Replaces nouns with the 7th noun that follows. Dependencies: 91k_nouns
	cat $< | python3 src/n_7.py > $(@)
carlandre: ocr/output.txt ## Alice: Creates visual poetry out of a text. Dependencies: pytest
	@python3 src/carlandre.py
.PHONY: carlandre
# cat $(@) > /dev/usb/lp0
overunder: ocr/output.txt ## Alice: An interpreted language that translates simple weaving instructions and creates a weaving pattern on the text.
	@python3 src/overunder.py
.PHONY: overunder
erase: tiffs hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
	python3 src/erase_leastcommon.py
	rm $(input-hocr)
	rm $(images-tiff)
replace: tiffs hocrs ## Natasha: Analyzes pages in order, replaces least common words with most common words. Dependencies: PIL, html5lib, FPDF
	python3 src/replace_leastcommon.py
	rm $(input-hocr)
	rm $(images-tiff)
ttssr-human-only: ocr/output.txt ## Loop: text-to-speech / speech recognition. Dependencies: espeak, pocketsphinx
	bash src/ttssr-loop-human-only.sh ocr/output.txt
chatbook: ocr/output.txt ## Chatbot based on the knowledge of the scans. Dependencies: nltk_rake, irc, nltk
	python3 src/chatbook.py
oulibot: ocr/output.txt ## Chatbot based on the knowledge of the scans. Dependencies: nltk_rake, irc, nltk
	python3 src/oulibot.py