Patch for Makefile (git unified diff), restored to one diff line per text line.

Changes carried by this patch:
  * images: the assignment line is removed and re-added (the two versions differ
    only in whitespace that cannot be recovered here), and a TODO note about
    widening the wildcard is added beneath it.
  * dirs: additionally creates hocr/; one blank line before `testif` is dropped.
  * hocrs (new target): runs `tesseract <tiff> hocr/<name> hocr` for every
    images-tiff/*.tiff and renames hocr/<name>.hocr to hocr/<name>.html.

Recipe lines begin with a hard TAB, as make requires. Blank context lines were
re-inserted so that each hunk matches the line counts in its @@ header; their
exact positions are inferred, so verify against the target Makefile before
applying.

diff --git a/Makefile b/Makefile
index 8d721a3..1c3bfa8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
-images=$(sort $(wildcard images/*.jpg))
+images=$(sort $(wildcard images/*.jpg))
+# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@@ -35,10 +36,10 @@ dirs: ## create the dirs in working dir
 	@-mkdir -p output/
 	@-mkdir -p output/wordtagger
 	@-mkdir -p ocr/
+	@-mkdir -p hocr/
 	@echo $(color_r)'Directories made': images/ output/
 
 
-
 testif:
 ifeq ($(OS),Darwin)
 	@echo $(OS)
@@ -60,6 +61,13 @@ tiffs: ## convert images/ to images-tiff/ Depends on IM
 	echo $$tiff; \
 	done;
 
+hocrs: ## hocr with tesseract and then change extension to .html
+	for i in images-tiff/*.tiff; \
+	do echo $$i; hocrfile=`basename $$i .tiff`; \
+	tesseract $$i hocr/$$hocrfile hocr; \
+	mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html; \
+	done;
+
 #OUTPUT GENERATION RECIPES
 
 output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2