added erase & replace scripts to src/
parent
d03d52a098
commit
adaad8b12e
@ -0,0 +1,123 @@
|
|||||||
|
import html5lib
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
from PIL import Image
|
||||||
|
from nltk import FreqDist
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
from fpdf import FPDF
|
||||||
|
|
||||||
|
# Build the English stopword set once; set membership tests are O(1).
# Fix: the original called stopwords.words('english') twice and discarded the
# first result — a single call is enough (it also triggers the lazy corpus load).
sr = set(stopwords.words('english'))
|
||||||
|
|
||||||
|
def cleanstopwords(list):
    """Return a copy of *list* with all English stopwords removed.

    Comparison is case-insensitive: a word is dropped when its lowercase
    form appears in the module-level stopword set ``sr``.
    """
    return [entry for entry in list if entry.lower() not in sr]
|
||||||
|
|
||||||
|
def findleastcommon(list):
    """Collect the least common words of *list* into the global list.

    Every lower-cased word whose frequency is at most the module-level
    ``limit`` is appended to the module-level ``leastcommon_list``, which
    is also returned (it accumulates across calls).
    """
    frequencies = FreqDist(entry.lower() for entry in list)
    for token, count in frequencies.most_common():
        if count <= limit:
            leastcommon_list.append(token)
    return leastcommon_list
|
||||||
|
|
||||||
|
def coordinates(attribute):
    """Extract a word's bounding box from an hOCR 'title' attribute.

    attribute -- the title string of an ocrx_word span, e.g.
                 "bbox 425 823 525 851; x_wconf 90"

    Returns the four box coordinates [x1, y1, x2, y2] as integers.

    Fix: the original ``r, c = r.split(";")`` raised ValueError whenever the
    title contained zero or more than one ';' (hOCR titles may carry several
    ';'-separated properties); only the leading "bbox ..." section matters.
    """
    bbox_section = attribute.split(";")[0]   # keep only "bbox x1 y1 x2 y2"
    parts = bbox_section.split(" ")[1:]      # drop the "bbox" keyword
    return [int(p) for p in parts]
|
||||||
|
|
||||||
|
def filternone(word_raw):
    """Normalise a raw OCR word.

    word_raw -- the text of an ocrx_word span, possibly None.

    Returns the placeholder 'y' when word_raw is None (so later ``len()`` /
    ``.lower()`` calls don't crash), otherwise word_raw with surrounding
    punctuation ,".!:;() stripped.

    Fix: the original else-branch read the *global* ``element.text`` instead
    of the parameter — it only worked because every caller happened to pass
    element.text.  The dead ``remove = None`` assignment is gone too.
    """
    if word_raw is None:
        return 'y'
    return word_raw.strip(',".!:;()')
|
||||||
|
|
||||||
|
# --- script state -----------------------------------------------------------
x = -1                      # page counter; starts at -1 so the first iteration is page 0
leastcommon_list = []       # filled by findleastcommon(); accumulates across pages
allwords = []               # every OCR'd word seen so far; accumulates across pages
scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html')
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned

# loop through every image in scanimg folder
# NOTE(review): pairs scanimg[x] with hocr[x] by glob position — glob order is
# filesystem-dependent; confirm both lists sort identically (or sort them).
for i in scanimg:
    x = x + 1
    limit = x * maximum # this helps the script remove words in a way that is proportional to number of pages scanned
    # NOTE(review): limit is 0 for the first page, so page 0 excludes nothing —
    # presumably intentional (erasure grows page by page); confirm.
    iim = Image.open(i) # iim is initial image
    oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image

    # open corresponding hocr file
    print ("Analysing", hocr[x])
    f = open(hocr[x])   # NOTE(review): never closed; handle leaks until GC
    print ('Reading scanned image, filtering least common words.')
    print ("")

    t = html5lib.parse(f, namespaceHTMLElements=False)

    # loop through every word in hocr file to analyse words and find least common
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords) #clean stopwords
    findleastcommon(clean_words) #find least common words and add them to list
    print ("The least common words until text", x+1, "are:", leastcommon_list)

    # loop through every word in hocr file to extract coordinates, then remove or paste into output image
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])

        wim = iim.crop(c) # wim is word image

        if word.lower() in leastcommon_list and len(word) < limit:
            # erase: cover the word's bounding box with white
            oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
            print ('Excluding:', word)

        else:
            # keep: paste the cropped word image back at its original position
            oim.paste(wim, (c[0], c[1], c[2], c[3]))
            print ('Including:', word)

    #-------------------------------------------------------------------------------#
    # save and show images
    n = i.replace("images-tiff/","output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}erase.jpg".format(n, x))


#-------------------------------------------------------------------------------#
# save images into PDF
outputs = glob.glob('output/erase-replace/*erase.jpg')
print ("Saving to PDF:", outputs)
|
||||||
|
|
||||||
|
def makePdf(pdfFileName, listPages, dir = ''):
    """Combine the image files in listPages into a single PDF.

    pdfFileName -- output file name without the ".pdf" extension
    listPages   -- iterable of image paths, one image per PDF page
    dir         -- optional directory prefix applied to both inputs and output

    Fix: the original left the cover image's file handle open; a context
    manager closes it as soon as the page size has been read.
    """
    if (dir):
        dir += "/"

    # The first image's pixel size defines the PDF page format (1 pt per px).
    with Image.open(dir + str(listPages[0])) as cover:
        width, height = cover.size
    pdf = FPDF(unit = "pt", format = [width, height])

    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)

    pdf.output(dir + pdfFileName + ".pdf", "F")
|
||||||
|
|
||||||
|
makePdf('output/erase-replace/Erase', outputs, dir = '')

# Remove the intermediate per-page jpgs now that the PDF contains them.
files = glob.glob('./output/erase-replace/*erase.jpg')
for leftover in files:
    os.remove(leftover)
|
||||||
|
|
@ -0,0 +1,192 @@
|
|||||||
|
import html5lib
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
from PIL import Image
|
||||||
|
from nltk import FreqDist
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
import random
|
||||||
|
import glob
|
||||||
|
import time
|
||||||
|
from fpdf import FPDF
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Build the English stopword set once; set membership tests are O(1).
# Fix: the original called stopwords.words('english') twice and discarded the
# first result — a single call is enough (it also triggers the lazy corpus load).
sr = set(stopwords.words('english'))
|
||||||
|
|
||||||
|
def cleanstopwords(list):
    """Return a copy of *list* without English stopwords.

    A word is discarded when its lowercase form is found in the
    module-level stopword set ``sr``; everything else keeps its order.
    """
    return [entry for entry in list if entry.lower() not in sr]
|
||||||
|
|
||||||
|
def findmostcommon(list, int):
    """Return the *int* most frequent words of *list*, lower-cased.

    Frequencies are counted case-insensitively; the result is ordered
    from most to least common.
    """
    frequencies = FreqDist(entry.lower() for entry in list)
    return [token for token, _count in frequencies.most_common(int)]
|
||||||
|
|
||||||
|
def findleastcommon(list):
    """Collect the least common words of *list* into the global list.

    Appends every lower-cased word whose count is at most the module-level
    ``limit`` to the module-level ``leastcommon_list`` and returns it
    (the list accumulates across calls).
    """
    frequencies = FreqDist(entry.lower() for entry in list)
    for token, count in frequencies.most_common():
        if count <= limit:
            leastcommon_list.append(token)
    return leastcommon_list
|
||||||
|
|
||||||
|
def coordinates(attribute):
    """Extract a word's bounding box from an hOCR 'title' attribute.

    attribute -- the title string of an ocrx_word span, e.g.
                 "bbox 425 823 525 851; x_wconf 90"

    Returns the four box coordinates [x1, y1, x2, y2] as integers.

    Fix: the original ``c, r = c.split(";")`` raised ValueError whenever the
    title contained zero or more than one ';' (hOCR titles may carry several
    ';'-separated properties); only the leading "bbox ..." section matters.
    """
    bbox_section = attribute.split(";")[0]   # keep only "bbox x1 y1 x2 y2"
    parts = bbox_section.split(" ")[1:]      # drop the "bbox" keyword
    return [int(p) for p in parts]
|
||||||
|
|
||||||
|
def filternone(word_raw):
    """Normalise a raw OCR word.

    word_raw -- the text of an ocrx_word span, possibly None.

    Returns the placeholder 'y' when word_raw is None (so later ``len()`` /
    ``.lower()`` calls don't crash), otherwise word_raw with surrounding
    punctuation ,".!:;() stripped.

    Fix: the original else-branch read the *global* ``element.text`` instead
    of the parameter — it only worked because every caller happened to pass
    element.text.  The dead ``remove = None`` assignment is gone too.
    """
    if word_raw is None:
        return 'y'
    return word_raw.strip(',".!:;()')
|
||||||
|
|
||||||
|
# --- script state -----------------------------------------------------------
x = -1                      # page counter; starts at -1 so the first iteration is page 0
leastcommon_list = []       # filled by findleastcommon(); accumulates across pages
allwords = []               # every OCR'd word seen so far; accumulates across pages
scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html')
num = 0                     # global crop counter; gives every saved word-crop a unique name

maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned


# loop through every image in scanimg folder
# NOTE(review): pairs scanimg[x] with hocr[x] by glob position — glob order is
# filesystem-dependent; confirm both lists sort identically (or sort them).
for i in scanimg:
    x = x + 1
    limit = 15 - (x * maximum) # this helps the script remove words in a way that is proportional to number of pages scanned
    # NOTE(review): unlike the erase script, this threshold *shrinks* per page — confirm intended.
    iim = Image.open(i) # iim is initial image
    oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image

    # open corresponding hocr file
    print ("Analysing", hocr[x])
    f = open(hocr[x])   # NOTE(review): never closed; handle leaks until GC
    print ('Reading scanned image, filtering least common words.')
    print ("")

    t = html5lib.parse(f, namespaceHTMLElements=False)

    # loop through every word in hocr file to analyse words and find least common
    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        allwords.append(word)

    clean_words = cleanstopwords(allwords) #clean stopwords
    findleastcommon(clean_words) #find least common words and add them to list
    mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list

    print ("The most common words until text", x+1, "are:", mostcommon_list)

    # loop through every word in hocr file to extract coordinates, then remove or paste into output image

    for element in t.findall(".//span[@class='ocrx_word']"):
        word = filternone(element.text)
        c = coordinates(element.attrib['title'])
        num = num + 1

        wim = iim.crop(c) # wim is word image

        #extract coordinates
        # Save crops of common words, bucketed by word length, as replacement material.
        # NOTE(review): the first branch lower-cases the word but the next two do not — confirm intended.
        if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
            wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num))
        elif word in mostcommon_list and len(word) <= 7 :
            wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num))
        elif word in mostcommon_list and len(word) > 7 :
            wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num))

        if x > 0:
            # From page 1 onward the crop folders are populated, so least-common
            # words can be replaced by a random crop of a common word.
            # use PIL to crop out every box, then paste it according to if rule
            randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png'))
            randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png'))
            randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png'))

            wimreplace4 = Image.open(randomimg4)
            wimreplace7 = Image.open(randomimg7)
            wimreplace_more = Image.open(randomimg_more)

            # Tint each replacement crop with translucent yellow so swapped words are visible.
            wimcolor4 = Image.new('RGBA', wimreplace4.size, (250, 230, 0, 90))
            wimcolor7 = Image.new('RGBA', wimreplace7.size, (250, 230, 0, 90))
            wimcolor_more = Image.new('RGBA', wimreplace_more.size, (250, 230, 0, 90))

            out4 = Image.alpha_composite(wimreplace4, wimcolor4)
            out7 = Image.alpha_composite(wimreplace7, wimcolor7)
            out_more = Image.alpha_composite(wimreplace_more, wimcolor_more)

            # NOTE(review): branch conditions overlap (e.g. len 9-10 matches both
            # "< 11" and "> 8"); first match wins, so "> 8" only catches len >= 11
            # after "<= limit" — confirm the intended length buckets.
            if word.lower() in leastcommon_list and len(word) <= limit:
                oim.paste(wim, (c[0], c[1], c[2], c[3]))

            elif word.lower() in leastcommon_list and len(word) < 8:
                oim.paste(out4, (c[0], c[1]))
                print ('Excluding:', word)

            elif word.lower() in leastcommon_list and len(word) < 11:
                oim.paste(out7, (c[0], c[1]))
                print ('Excluding:', word)

            elif word.lower() in leastcommon_list and len(word) > 8:
                oim.paste(out_more, (c[0], c[1]))
                print ('Excluding:', word)

            else:
                # keep: paste the original word crop back in place
                oim.paste(wim, (c[0], c[1], c[2], c[3]))
                print ('Including:', word)

        else:
            # Page 0: no crops exist yet, so every word is kept as-is.
            oim.paste(wim, (c[0], c[1], c[2], c[3]))
            print ('Including:', word)


    #-------------------------------------------------------------------------------#
    # save images
    n = i.replace("images-tiff/","output/erase-replace/").replace(".tiff", "")
    oim.save("{}-{}replace.jpg".format(n, x))

#-------------------------------------------------------------------------------#
# save images into PDF
outputs = glob.glob('output/erase-replace/*replace.jpg')
print ("Saving to PDF:", outputs)
||||||
|
|
||||||
|
def makePdf(pdfFileName, listPages, dir = ''):
    """Combine the image files in listPages into a single PDF.

    pdfFileName -- output file name without the ".pdf" extension
    listPages   -- iterable of image paths, one image per PDF page
    dir         -- optional directory prefix applied to both inputs and output

    Fix: the original left the cover image's file handle open; a context
    manager closes it as soon as the page size has been read.
    """
    if (dir):
        dir += "/"

    # The first image's pixel size defines the PDF page format (1 pt per px).
    with Image.open(dir + str(listPages[0])) as cover:
        width, height = cover.size
    pdf = FPDF(unit = "pt", format = [width, height])

    for page in listPages:
        pdf.add_page()
        pdf.image(dir + str(page), 0, 0)

    pdf.output(dir + pdfFileName + ".pdf", "F")
|
||||||
|
|
||||||
|
makePdf('output/erase-replace/Replace', outputs, dir = '')

# Delete the intermediate per-page jpgs and the cropped-word pngs, pattern by
# pattern, in the same order the original script removed them.
cleanup_patterns = (
    './output/erase-replace/*replace.jpg',
    './output/erase-replace/crops4/*.png',
    './output/erase-replace/crops7/*.png',
    './output/erase-replace/crops_more/*.png',
)
for pattern in cleanup_patterns:
    files = glob.glob(pattern)
    for f in files:
        os.remove(f)
|
||||||
|
|
Loading…
Reference in New Issue