updated erase & replace rule with automatic remove of hocr files

master
Natasha Berting 6 years ago
parent c9820c1a17
commit 4056182d25

@ -1,6 +1,7 @@
images=$(sort $(wildcard images/*.jpg))
# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
images-tiff=$(sort $(wildcard images-tiff/*.tiff))
input-hocr=$(sort $(wildcard hocr/*))
output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp)
space:= $(empty) $(empty)
@ -98,11 +99,13 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
@python3 src/overunder.py
.PHONY: overunder
erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
erase: tiffs hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
python3 src/erase_leastcommon.py
rm $(input-hocr)
replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
replace:tiffs hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
python3 src/replace_leastcommon.py
rm $(input-hocr)
visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
@echo $(tmpfile)

@ -23,7 +23,7 @@ def findleastcommon(list):
fdist = FreqDist(word.lower() for word in list)
leastcommon = fdist.most_common()
for i in leastcommon:
if (i[1] <= limit):
if (i[1] <= 1):
leastcommon_list.append(i[0])
return leastcommon_list

Loading…
Cancel
Save