|
|
|
@ -8,6 +8,14 @@ import glob
|
|
|
|
|
import time
|
|
|
|
|
from fpdf import FPDF
|
|
|
|
|
import os
|
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
|
|
# Scratch folder tree for the cropped word images that are saved under
# temp/crops4, temp/crops7 and temp/crops_more further down in the script.
path1 = './temp'

# exist_ok=True keeps the setup idempotent: a ./temp folder left behind by
# an interrupted run (possibly without its sub-folders) no longer causes the
# sub-folders to be skipped or os.makedirs to fail.
for crop_dir in ('crops4', 'crops7', 'crops_more'):
    os.makedirs(os.path.join(path1, crop_dir), exist_ok=True)
|
|
|
|
|
|
|
|
|
|
# English stop words collected into a set. The earlier bare
# stopwords.words('english') call only discarded its result, so a single
# call is enough.
sr = set(stopwords.words('english'))
|
|
|
|
@ -58,10 +66,8 @@ allwords = []
|
|
|
|
|
# Page scans and their hOCR files. Both lists are sorted because the main
# loop pairs them purely by position (hocr[x] for the x-th scan) and
# glob.glob() returns its matches in arbitrary, filesystem-dependent order.
scanimg = sorted(glob.glob('images-tiff/*.tiff'))
hocr = sorted(glob.glob('hocr/*.html'))
num = 0  # running word counter, used to number the saved crop images

# NOTE: raises ZeroDivisionError when images-tiff/ contains no .tiff files.
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# loop through every image in scanimg folder
|
|
|
|
|
for i in scanimg:
|
|
|
|
|
x = x + 1
|
|
|
|
@ -70,10 +76,9 @@ for i in scanimg:
|
|
|
|
|
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
|
|
|
|
|
|
|
|
|
|
# open corresponding hocr file
|
|
|
|
|
print ("Analysing", hocr[x])
|
|
|
|
|
f = open(hocr[x])
|
|
|
|
|
print ('Reading scanned image, filtering least common words.')
|
|
|
|
|
print ("")
|
|
|
|
|
print ('Reading scanned image and hocr file, filtering least common words.')
|
|
|
|
|
print ('')
|
|
|
|
|
|
|
|
|
|
t = html5lib.parse(f, namespaceHTMLElements=False)
|
|
|
|
|
|
|
|
|
@ -86,11 +91,15 @@ for i in scanimg:
|
|
|
|
|
findleastcommon(clean_words) #find least common words and add them to list
|
|
|
|
|
mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
|
|
|
|
|
|
|
|
|
|
print ("The most common words until text", x+1, "are:", mostcommon_list)
|
|
|
|
|
|
|
|
|
|
print ('The most common words until text', x+1, 'are:', mostcommon_list)
|
|
|
|
|
print ('')
|
|
|
|
|
|
|
|
|
|
# loop through every word in hocr file to extract coordinates, then remove or paste into output image
|
|
|
|
|
|
|
|
|
|
for element in t.findall(".//span[@class='ocrx_word']"):
|
|
|
|
|
print ('Processing word coordinates and replacing least common words with most common words.')
|
|
|
|
|
print ('')
|
|
|
|
|
|
|
|
|
|
for element in t.findall(".//span[@class='ocrx_word']"):
|
|
|
|
|
word = filternone(element.text)
|
|
|
|
|
c = coordinates(element.attrib['title'])
|
|
|
|
|
num = num + 1
|
|
|
|
@ -99,17 +108,17 @@ for i in scanimg:
|
|
|
|
|
|
|
|
|
|
#extract coordinates
|
|
|
|
|
if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
|
|
|
|
|
wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num))
|
|
|
|
|
wim.save ("temp/crops4/wimreplace{}.png".format(num))
|
|
|
|
|
elif word in mostcommon_list and len(word) <= 7 :
|
|
|
|
|
wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
|
|
|
|
|
wim.save ("temp/crops7/wimreplace{}.png".format(num))
|
|
|
|
|
elif word in mostcommon_list and len(word) > 7 :
|
|
|
|
|
wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))
|
|
|
|
|
wim.save ("temp/crops_more/wimreplace{}.png".format(num))
|
|
|
|
|
|
|
|
|
|
if x > 0:
|
|
|
|
|
# use PIL to crop out every box, then paste it according to if rule
|
|
|
|
|
randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
|
|
|
|
|
randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
|
|
|
|
|
randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))
|
|
|
|
|
randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
|
|
|
|
|
randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
|
|
|
|
|
randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
|
|
|
|
|
|
|
|
|
|
wimreplace4 = Image.open(randomimg4)
|
|
|
|
|
wimreplace7 = Image.open(randomimg7)
|
|
|
|
@ -128,23 +137,18 @@ for i in scanimg:
|
|
|
|
|
|
|
|
|
|
elif word.lower() in leastcommon_list and len(word) < 8:
|
|
|
|
|
oim.paste(out4, (c[0], c[1]))
|
|
|
|
|
print ('Excluding:', word)
|
|
|
|
|
|
|
|
|
|
elif word.lower() in leastcommon_list and len(word) < 11:
|
|
|
|
|
oim.paste(out7, (c[0], c[1]))
|
|
|
|
|
print ('Excluding:', word)
|
|
|
|
|
|
|
|
|
|
elif word.lower() in leastcommon_list and len(word) > 8:
|
|
|
|
|
oim.paste(out_more, (c[0], c[1]))
|
|
|
|
|
print ('Excluding:', word)
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
oim.paste(wim, (c[0], c[1], c[2], c[3]))
|
|
|
|
|
print ('Including:', word)
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
oim.paste(wim, (c[0], c[1], c[2], c[3]))
|
|
|
|
|
print ('Including:', word)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------#
|
|
|
|
@ -155,20 +159,18 @@ for i in scanimg:
|
|
|
|
|
#-------------------------------------------------------------------------------#
|
|
|
|
|
# save images into PDF
|
|
|
|
|
# Sorted (lexicographically) so the PDF pages come out in a stable,
# name-based order; glob.glob() alone returns matches in arbitrary order.
outputs = sorted(glob.glob('output/erase-replace/*replace.jpg'))
|
|
|
|
|
print ("Saving to PDF:", outputs)
|
|
|
|
|
print ('')
|
|
|
|
|
print ("Saving to PDF: output/erase-replace/Replace.pdf")
|
|
|
|
|
|
|
|
|
|
def makePdf(pdfFileName, listPages, dir = ''):
    """Combine a list of images into a single PDF, one image per page.

    The pixel dimensions of the first image in ``listPages`` are used as
    the page size (in points) for the whole document.

    Args:
        pdfFileName: Output path without the ``.pdf`` extension.
        listPages: Image file names, in page order.
        dir: Optional folder; when non-empty it is prepended (followed by
            ``/``) to every image name and to the output file name.

    Raises:
        IndexError: If ``listPages`` is empty.
    """
    if not listPages:
        # Fail early with a readable message instead of the bare
        # "list index out of range" that listPages[0] would produce.
        raise IndexError("makePdf: listPages is empty, nothing to write")

    if dir:
        dir += "/"

    # Only the dimensions are needed, so the image is closed again right
    # away instead of leaving the file handle open.
    with Image.open(dir + str(listPages[0])) as cover:
        width, height = cover.size

    pdf = FPDF(unit="pt", format=[width, height])

    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)

    pdf.output(dir + pdfFileName + ".pdf", "F")
|
|
|
|
|
|
|
|
|
|
makePdf('output/erase-replace/Replace', outputs, dir = '')
|
|
|
|
@ -177,16 +179,5 @@ makePdf('output/erase-replace/Replace', outputs, dir = '')
|
|
|
|
|
# Clean-up: delete the intermediate page JPEGs and any crop PNGs found under
# output/erase-replace, one pattern at a time, then remove the ./temp folder.
for pattern in (
    './output/erase-replace/*replace.jpg',
    './output/erase-replace/crops4/*.png',
    './output/erase-replace/crops7/*.png',
    './output/erase-replace/crops_more/*.png',
):
    files = glob.glob(pattern)
    for f in files:
        os.remove(f)

shutil.rmtree('./temp/')
|
|
|
|
|