Updated the erase & replace rules to automatically remove hocr files

master
Natasha Berting 6 years ago
parent c9820c1a17
commit 4056182d25

@ -1,6 +1,7 @@
images=$(sort $(wildcard images/*.jpg)) images=$(sort $(wildcard images/*.jpg))
# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
images-tiff=$(sort $(wildcard images-tiff/*.tiff)) images-tiff=$(sort $(wildcard images-tiff/*.tiff))
input-hocr=$(sort $(wildcard hocr/*))
output_ocr:=$(dir_ocr)/output.txt output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp) tmpfile:= $(shell mktemp)
space:= $(empty) $(empty) space:= $(empty) $(empty)
@ -98,11 +99,13 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
@python3 src/overunder.py @python3 src/overunder.py
.PHONY: overunder .PHONY: overunder
erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF erase: tiffs hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
python3 src/erase_leastcommon.py python3 src/erase_leastcommon.py
rm $(input-hocr)
replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF replace:tiffs hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
python3 src/replace_leastcommon.py python3 src/replace_leastcommon.py
rm $(input-hocr)
visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
@echo $(tmpfile) @echo $(tmpfile)

@ -23,7 +23,7 @@ def findleastcommon(list):
fdist = FreqDist(word.lower() for word in list) fdist = FreqDist(word.lower() for word in list)
leastcommon = fdist.most_common() leastcommon = fdist.most_common()
for i in leastcommon: for i in leastcommon:
if (i[1] <= limit): if (i[1] <= 1):
leastcommon_list.append(i[0]) leastcommon_list.append(i[0])
return leastcommon_list return leastcommon_list

Loading…
Cancel
Save