Patch summary (preamble text; patch tools skip everything before the first "diff --git" line):
  * Makefile: adds the input-hocr variable (sorted wildcard over hocr/*), makes
    the erase and replace targets depend on tiffs as well as hocrs, and removes
    the files matched by input-hocr after each script has run.
  * src/erase_leastcommon.py: findleastcommon compares each count against the
    literal 1 instead of the name limit.

The cleanup lines use "rm -f" so that an empty hocr/* match does not make the
target fail with a missing-operand error.
input-hocr is assigned with '=', so the wildcard is expanded where the variable is
referenced (in the recipes) rather than when the Makefile is read.
NOTE(review): line breaks, tabs and blank context lines were reconstructed from a
whitespace-collapsed copy of this patch; confirm against the tree before applying.

diff --git a/Makefile b/Makefile
index 235edad..c990a20 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
 images=$(sort $(wildcard images/*.jpg))
 # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
 images-tiff=$(sort $(wildcard images-tiff/*.tiff))
+input-hocr=$(sort $(wildcard hocr/*))
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@@ -98,11 +99,13 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
 	@python3 src/overunder.py
 .PHONY: overunder
 
-erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
+erase: tiffs hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
 	python3 src/erase_leastcommon.py
+	rm -f $(input-hocr)
 
-replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
+replace:tiffs hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
 	python3 src/replace_leastcommon.py
+	rm -f $(input-hocr)
 
 visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
 	@echo $(tmpfile)
diff --git a/src/erase_leastcommon.py b/src/erase_leastcommon.py
index 825bfda..bef06d9 100644
--- a/src/erase_leastcommon.py
+++ b/src/erase_leastcommon.py
@@ -23,7 +23,7 @@ def findleastcommon(list):
     fdist = FreqDist(word.lower() for word in list)
     leastcommon = fdist.most_common()
     for i in leastcommon:
-        if (i[1] <= limit):
+        if (i[1] <= 1):
             leastcommon_list.append(i[0])
     return leastcommon_list
 