|
|
@ -46,8 +46,8 @@ def filternone(word_raw):
|
|
|
|
x = -1
|
|
|
|
x = -1
|
|
|
|
leastcommon_list = []
|
|
|
|
leastcommon_list = []
|
|
|
|
allwords = []
|
|
|
|
allwords = []
|
|
|
|
scanimg = glob.glob('images-tiff/*.tiff')
|
|
|
|
scanimg = sorted(glob.glob('images-tiff/*.tiff'))
|
|
|
|
hocr = glob.glob('hocr/*.html')
|
|
|
|
hocr = sorted(glob.glob('hocr/*.html'))
|
|
|
|
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
|
|
|
|
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
|
|
|
|
|
|
|
|
|
|
|
|
# loop through every image in scanimg folder
|
|
|
|
# loop through every image in scanimg folder
|
|
|
|