added hocrs

master
Natasha Berting 7 years ago
parent 83d23ca3ac
commit d4232c80ef

@ -1,4 +1,5 @@
images=$(sort $(wildcard images/*.jpg)) images=$(sort $(wildcard images/*.jpg))
# TODO(@andre): make the wildcard match any image file type while still excluding listimg.txt
output_ocr:=$(dir_ocr)/output.txt output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp) tmpfile:= $(shell mktemp)
space:= $(empty) $(empty) space:= $(empty) $(empty)
@ -35,10 +36,10 @@ dirs: ## create the dirs in working dir
@-mkdir -p output/ @-mkdir -p output/
@-mkdir -p output/wordtagger @-mkdir -p output/wordtagger
@-mkdir -p ocr/ @-mkdir -p ocr/
@-mkdir -p hocr/
@echo $(color_r)'Directories made': images/ output/ @echo $(color_r)'Directories made': images/ output/
testif: testif:
ifeq ($(OS),Darwin) ifeq ($(OS),Darwin)
@echo $(OS) @echo $(OS)
@ -60,6 +61,13 @@ tiffs: ## convert images/ to images-tiff/ Depends on IM
echo $$tiff; \ echo $$tiff; \
done; done;
hocrs: ## hocr with tesseract and then change extension to .html
for i in images-tiff/*.tiff; \
do echo $$i; hocrfile=`basename $$i .tiff`; \
tesseract $$i hocr/$$hocrfile hocr; \
mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html; \
done;
#OUTPUT GENERATION RECIPES #OUTPUT GENERATION RECIPES
output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2 output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2

Loading…
Cancel
Save