From 4056182d250606da1377b1106b12a6ae0d905799 Mon Sep 17 00:00:00 2001 From: Natasha Berting Date: Fri, 23 Mar 2018 17:12:51 +0100 Subject: [PATCH] updated erase & replace rule with automatic remove of hocr files --- Makefile | 7 +++++-- src/erase_leastcommon.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 235edad..c990a20 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ images=$(sort $(wildcard images/*.jpg)) # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file images-tiff=$(sort $(wildcard images-tiff/*.tiff)) +input-hocr=$(sort $(wildcard hocr/*)) output_ocr:=$(dir_ocr)/output.txt tmpfile:= $(shell mktemp) space:= $(empty) $(empty) @@ -98,11 +99,13 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl @python3 src/overunder.py .PHONY: overunder -erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF +erase: tiffs hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF python3 src/erase_leastcommon.py + rm $(input-hocr) -replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF +replace:tiffs hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF python3 src/replace_leastcommon.py + rm $(input-hocr) visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer @echo $(tmpfile) diff --git a/src/erase_leastcommon.py b/src/erase_leastcommon.py index 825bfda..bef06d9 100644 --- a/src/erase_leastcommon.py +++ b/src/erase_leastcommon.py @@ -23,7 +23,7 @@ def findleastcommon(list): fdist = FreqDist(word.lower() for word in list) leastcommon = fdist.most_common() for i in leastcommon: - if (i[1] <= limit): + if (i[1] <= 1): leastcommon_list.append(i[0]) return leastcommon_list