added hocrs

7 years ago · d4232c80ef
parent 83d23ca3ac
commit d4232c80ef
1 changed files with 10 additions and 2 deletions
--- a/12
+++ b/12
@ -1,4 +1,5 @@
-images=$(sort $(wildcard images/*.jpg))
+images=$(sort $(wildcard images/*.jpg)) 
+# TODO(@andre): widen the wildcard so it matches any image file type while still excluding listimg.txt
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@ -35,10 +36,10 @@ dirs: ## create the dirs in working dir
 	@-mkdir -p output/
 	@-mkdir -p output/wordtagger
 	@-mkdir -p ocr/
+	@-mkdir -p hocr/
 	@echo $(color_r)'Directories made': images/ output/


-
 testif:
 ifeq ($(OS),Darwin)
 	@echo $(OS)
@ -60,6 +61,13 @@ tiffs: ## convert images/ to images-tiff/ Depends on IM
 	echo $$tiff; \
 	done;

+hocrs: ## run tesseract hOCR on each images-tiff/*.tiff, then rename hocr/<name>.hocr to .html
+	mkdir -p hocr && for i in images-tiff/*.tiff; \
+	do [ -e "$$i" ] || continue; echo "$$i"; hocrfile=`basename "$$i" .tiff`; \
+	tesseract "$$i" "hocr/$$hocrfile" hocr && \
+	mv "hocr/$$hocrfile.hocr" "hocr/$$hocrfile.html" || exit 1; \
+	done
+
 #OUTPUT GENERATION RECIPES

 output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2