# Makefile (133 lines, 5.2 KiB)
# ---------------------------------------------------------------------------
# Input collections
# ---------------------------------------------------------------------------

# Every file in images/ except plain-text helper lists (e.g. listimg.txt).
# Simply expanded: images/ is only read, never written, by the rules here.
images := $(filter-out %.txt,$(sort $(wildcard images/*)))

# Deliberately recursive (=): these directories are filled by recipes during
# the run, so the globs must be re-evaluated each time they are referenced.
images-tiff = $(sort $(wildcard images-tiff/*.tiff))
input-hocr = $(sort $(wildcard hocr/*))

# Directory holding the OCR result; override with `make dir_ocr=...`.
dir_ocr ?= ocr
output_ocr := $(dir_ocr)/output.txt

# Scratch file for the visualization rule.
# NOTE: mktemp runs (and creates a file) on every make invocation.
tmpfile := $(shell mktemp)

# Helpers for turning the space-separated image list into one name per line.
# $(subst $(delimiter),$(replacement),$(list))
empty :=
space := $(empty) $(empty)
newline := '\n'
# List of the images with a '\n' marker between consecutive filenames.
listimgs := $(subst $(space),$(newline),$(images))

# Host operating system name as reported by uname (e.g. Darwin, Linux).
OS := $(shell uname)

# Colors: add color to output, i.e. @echo $(color_r) output text
color_w:="\033[0;29m"
color_r:="\033[0;31m"
color_g:="\033[0;32m"
color_b:="\033[0;34m"
|
|
|
|
# HELP / SELF DOCUMENTATION
|
|
# Rules whose header line ends with a "## description" comment are listed by `make help` (see the clean rule for an example).
|
|
# `help` is the default goal: running a bare `make` prints the target list.
# (The comment lives on its own line so no trailing whitespace ends up in
# the variable's value.)
.DEFAULT_GOAL := help

.PHONY: help

# Prints every rule whose header carries a trailing "## description" comment,
# sorted by target name, as "<target> <description>" with the target in cyan.
# The bracket expression keeps '-' last so it is a literal rather than a range
# operator, and accepts digits, '.' and '/' so that file targets such as
# ocr/output.txt are listed as well.
help:
	@grep -E '^[a-zA-Z0-9_./-]+:.*## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
|
|
|
|
# CLEAN
|
|
# Removes the OCR result and the scratch file created for this invocation.
# `rm -f` keeps the target from failing when a file is already gone.
.PHONY: clean
clean: ## removes output (target) files
	rm -f ocr/output.txt
	rm -f $(tmpfile)
|
|
|
|
|
|
# ADMINISTRATIVE RECIPES
|
|
|
|
# Creates the working directory layout used by the other rules.
# `mkdir -p` already succeeds on existing directories, so errors are no
# longer ignored: a real failure (e.g. permissions) now stops the recipe
# instead of being followed by a misleading success message.
.PHONY: dirs
dirs: ## create the dirs in working dir
	@mkdir -p images/ images-tiff/
	@mkdir -p output/ output/erase-replace/
	@mkdir -p ocr/ hocr/
	@echo $(color_r)'Directories made': ocr/ hocr/ images/ images-tiff/ output/
|
|
|
|
|
|
# Debug helper: prints the detected OS name, but only when it is Darwin.
# On any other OS the rule has no recipe at all.
.PHONY: testif
testif:
ifeq ($(OS),Darwin)
	@echo $(OS)
endif
|
|
|
|
|
|
# POST-PROCESSING RECIPES
|
|
|
|
# OCRs all images into a single text file.
#   1. writes the image list (one filename per line) to <dir>/list.txt
#   2. runs tesseract on that list; tesseract appends .txt to the output base
#   3. passes the resulting text file to src/build_database.py
# The images are prerequisites so the text is rebuilt when they change.
# printf is used instead of echo because echo's handling of '\n' differs
# between shells.
ocr/output.txt: $(images) ## ocr with tesseract
	mkdir -p $(@D)
	printf '%s\n' $(images) > $(@D)/list.txt
	@echo $(basename $@)
	tesseract $(@D)/list.txt $(basename $@)
	python3 src/build_database.py $@
|
|
|
|
# Converts every file in $(images) to a 300 dpi truecolor TIFF in images-tiff/.
# The output name is the input's basename with a trailing .jpg replaced by
# .tiff. The loop aborts on the first failed conversion instead of carrying on.
.PHONY: tiffs
tiffs: ## convert images/ to images-tiff/ Depends on IM
	echo $(images)
	mkdir -p images-tiff
	for i in $(images); do \
		tiff=`basename $$i .jpg`.tiff; \
		convert -density 300 $$i -colorspace RGB -type truecolor -alpha on images-tiff/$$tiff || exit 1; \
		echo $$tiff; \
	done
|
|
|
|
# Runs tesseract in hOCR mode on every images-tiff/*.tiff and renames the
# resulting hocr/<name>.hocr to hocr/<name>.html.
# Fails with a clear message when there are no TIFFs (the glob stays
# unexpanded in that case) and stops at the first tesseract/mv error.
.PHONY: hocrs
hocrs: ## hocr with tesseract and then change extension to .html
	mkdir -p hocr
	for i in images-tiff/*.tiff; do \
		[ -e "$$i" ] || { echo "hocrs: no TIFF files in images-tiff/ (run 'make tiffs' first)" >&2; exit 1; }; \
		echo $$i; \
		hocrfile=`basename $$i .tiff`; \
		tesseract $$i hocr/$$hocrfile hocr || exit 1; \
		mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html || exit 1; \
	done
|
|
|
|
#OUTPUT GENERATION RECIPES
|
|
|
|
# Builds the reading_structure output:
#   - copies the static JS/CSS assets into output/reading_structure
#   - feeds the OCR text to reading_structure.py on stdin
#   - renders index.html to one PDF poster per print stylesheet
#   - opens index.html in a browser (`open` on Darwin, x-www-browser elsewhere)
# The poster loop stops at the first weasyprint failure.
.PHONY: reading_structure
reading_structure: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2, weasyprint
	mkdir -p output/reading_structure
	cp src/reading_structure/jquery.min.js src/reading_structure/script.js src/reading_structure/style.css output/reading_structure
	python3 src/reading_structure/reading_structure.py < $<
	for kind in noun adv dppt stopword neutral; do \
		weasyprint -s src/reading_structure/print-$$kind.css output/reading_structure/index.html output/reading_structure/poster_$$kind.pdf || exit 1; \
	done
	weasyprint -s src/reading_structure/print-entity.css output/reading_structure/index.html output/reading_structure/poster_named_entities.pdf
	$(if $(filter Darwin,$(OS)),open,x-www-browser) output/reading_structure/index.html
|
|
|
|
# Pipes the OCR text into src/textbotconversation.py on stdin and passes the
# target path as its argument. The output directory is created first so the
# rule does not depend on `make dirs` having been run.
output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot
	mkdir -p $(@D)
	python3 src/textbotconversation.py $@ < $<
|
|
|
|
|
|
# Runs the OCR text through src/n_7.py and stores its stdout as the target.
# The result is written to a temporary file and renamed only on success, so a
# failed run never leaves a truncated file that looks up to date.
output/n7.txt: ocr/output.txt ## Replaces nouns with the 7th noun that follows. Dependencies: 91k_nouns
	mkdir -p $(@D)
	python3 src/n_7.py < $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
|
|
|
|
# Runs src/carlandre.py once the OCR text exists; the command is not echoed.
# Disabled follow-up step kept for reference (presumably sent the result to a
# USB printer; confirm before re-enabling):
#   cat $(@) > /dev/usb/lp0
.PHONY: carlandre
carlandre: ocr/output.txt ## Alice: Creates visual poetry out of a text. Dependencies: pytest
	@python3 src/carlandre.py
|
|
|
|
|
|
# Runs src/overunder.py once the OCR text exists; the command is not echoed.
.PHONY: overunder
overunder: ocr/output.txt ## Alice: An interpreted language that translate simple weaving instructions and creates a weaving pattern on text.
	@python3 src/overunder.py
|
|
|
|
# Regenerates the TIFFs and hOCR files, runs src/erase_leastcommon.py, then
# deletes the intermediate hocr/ and images-tiff/ files.
# Cleanup uses shell globs with `rm -f`: they are expanded when the command
# runs (so files created earlier in this make run are seen) and an empty
# match is not an error.
.PHONY: erase
erase: tiffs hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
	python3 src/erase_leastcommon.py
	rm -f hocr/*
	rm -f images-tiff/*.tiff
|
|
|
|
# Regenerates the TIFFs and hOCR files, runs src/replace_leastcommon.py, then
# deletes the intermediate hocr/ and images-tiff/ files.
# Cleanup uses shell globs with `rm -f`: they are expanded when the command
# runs (so files created earlier in this make run are seen) and an empty
# match is not an error.
.PHONY: replace
replace: tiffs hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
	python3 src/replace_leastcommon.py
	rm -f hocr/*
	rm -f images-tiff/*.tiff
|
|
|
|
# Concatenates the raw bytes of every image into $(tmpfile) and plays the
# result through mplayer as raw i420 video at 25 fps.
# Frame geometry and video output differ per OS: Darwin uses w=56:h=64 with
# the default output driver, everything else uses w=50:h=50 with -vo x11.
# The help marker is written as "## " so `make help` lists this target, and
# the concatenation loop stops at the first unreadable image.
.PHONY: visualization
visualization: $(images) $(tmpfile) ## Creates data visualization from images/*.jpg. Dependencies: mplayer
	@echo $(tmpfile)
	for i in $(images); do \
		cat $$i >> $(tmpfile) || exit 1; \
	done
ifeq ($(OS),Darwin)
	cat $(tmpfile) | mplayer -sws 4 -zoom -vf dsize=720:720 -demuxer rawvideo -rawvideo w=56:h=64:i420:fps=25 -
else
	cat $(tmpfile) | mplayer -vo x11 -sws 4 -zoom -vf dsize=720:720 -demuxer rawvideo -rawvideo w=50:h=50:i420:fps=25 -
endif
|
|
|
|
|
|
# Runs src/ttssr-loop-human-only.sh on the OCR text ($< is ocr/output.txt).
.PHONY: ttssr-human-only
ttssr-human-only: ocr/output.txt ## Loop: text to speech-speech recognition. Dependencies: espeak, pocketsphinx
	bash src/ttssr-loop-human-only.sh $<
|
|
|
|
# Runs src/chatbook.py once the OCR text exists.
# The description uses the "## " marker so `make help` lists this target.
.PHONY: chatbook
chatbook: ocr/output.txt ## chatbot based on the knowledge of the scans Dependencies: nltk_rake, irc, nltk
	python3 src/chatbook.py
|