updated erase & replace rule in makefile

master
Natasha Berting 7 years ago
parent ab85a34326
commit 0587b5cdaf

@ -100,9 +100,8 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
python3 src/erase_leastcommon.py python3 src/erase_leastcommon.py
@echo 'erase rule output: output/erase-replace/Erase.pdf'
replace: ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
python3 src/replace_leastcommon.py python3 src/replace_leastcommon.py
visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer

@ -57,11 +57,10 @@ for i in scanimg:
iim = Image.open(i) # iim is initial image iim = Image.open(i) # iim is initial image
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file # open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x]) f = open(hocr[x])
print ('Reading scanned image, filtering least common words.') print ('Reading scanned image, filtering least common words.')
print ("") print ('')
t = html5lib.parse(f, namespaceHTMLElements=False) t = html5lib.parse(f, namespaceHTMLElements=False)
@ -73,7 +72,9 @@ for i in scanimg:
clean_words = cleanstopwords(allwords) #clean stopwords clean_words = cleanstopwords(allwords) #clean stopwords
findleastcommon(clean_words) #find least common words and add them to list findleastcommon(clean_words) #find least common words and add them to list
print ("The least common words until text", x+1, "are:", leastcommon_list) print ("The least common words until text", x+1, "are:", leastcommon_list)
print ('')
print ('Processing word coordinates and erasing least common words.')
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image # loop through every word in hocr file to extract coordinates, then remove or paste into output image
for element in t.findall(".//span[@class='ocrx_word']"): for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text) word = filternone(element.text)
@ -83,11 +84,9 @@ for i in scanimg:
if word.lower() in leastcommon_list and len(word) < limit: if word.lower() in leastcommon_list and len(word) < limit:
oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3])) oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
print ('Excluding:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save and show images # save and show images
@ -98,20 +97,17 @@ for i in scanimg:
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save images into PDF # save images into PDF
outputs = glob.glob('output/erase-replace/*erase.jpg') outputs = glob.glob('output/erase-replace/*erase.jpg')
print ("Saving to PDF:", outputs) print ("Saving to PDF: output/erase-replace/Erase.pdf")
def makePdf(pdfFileName, listPages, dir = ''): def makePdf(pdfFileName, listPages, dir = ''):
if (dir): if (dir):
dir += "/" dir += "/"
cover = Image.open(dir + str(listPages[0])) cover = Image.open(dir + str(listPages[0]))
width, height = cover.size width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height]) pdf = FPDF(unit = "pt", format = [width, height])
for page in listPages: for page in listPages:
pdf.add_page() pdf.add_page()
pdf.image(dir + str(page), 0, 0) pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F") pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Erase', outputs, dir = '') makePdf('output/erase-replace/Erase', outputs, dir = '')

@ -8,6 +8,14 @@ import glob
import time import time
from fpdf import FPDF from fpdf import FPDF
import os import os
import shutil
path1 = './temp'
if not os.path.isdir(path1):
os.makedirs(path1)
os.makedirs('./temp/crops4')
os.makedirs('./temp/crops7')
os.makedirs('./temp/crops_more')
stopwords.words('english') stopwords.words('english')
sr = set(stopwords.words('english')) sr = set(stopwords.words('english'))
@ -58,10 +66,8 @@ allwords = []
scanimg = glob.glob('images-tiff/*.tiff') scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html') hocr = glob.glob('hocr/*.html')
num = 0 num = 0
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
# loop through every image in scanimg folder # loop through every image in scanimg folder
for i in scanimg: for i in scanimg:
x = x + 1 x = x + 1
@ -70,10 +76,9 @@ for i in scanimg:
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file # open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x]) f = open(hocr[x])
print ('Reading scanned image, filtering least common words.') print ('Reading scanned image and hocr file, filtering least common words.')
print ("") print ('')
t = html5lib.parse(f, namespaceHTMLElements=False) t = html5lib.parse(f, namespaceHTMLElements=False)
@ -86,11 +91,15 @@ for i in scanimg:
findleastcommon(clean_words) #find least common words and add them to list findleastcommon(clean_words) #find least common words and add them to list
mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
print ("The most common words until text", x+1, "are:", mostcommon_list) print ('The most common words until text', x+1, 'are:', mostcommon_list)
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image # loop through every word in hocr file to extract coordinates, then remove or paste into output image
for element in t.findall(".//span[@class='ocrx_word']"): print ('Processing word coordinates and replacing least common words with most common words.')
print ('')
for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text) word = filternone(element.text)
c = coordinates(element.attrib['title']) c = coordinates(element.attrib['title'])
num = num + 1 num = num + 1
@ -99,17 +108,17 @@ for i in scanimg:
#extract coordinates #extract coordinates
if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5: if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num)) wim.save ("temp/crops4/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) <= 7 : elif word in mostcommon_list and len(word) <= 7 :
wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num)) wim.save ("temp/crops7/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) > 7 : elif word in mostcommon_list and len(word) > 7 :
wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num)) wim.save ("temp/crops_more/wimreplace{}.png".format(num))
if x > 0: if x > 0:
# use PIL to crop out every box, then paste it according to if rule # use PIL to crop out every box, then paste it according to if rule
randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png')) randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png')) randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png')) randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
wimreplace4 = Image.open(randomimg4) wimreplace4 = Image.open(randomimg4)
wimreplace7 = Image.open(randomimg7) wimreplace7 = Image.open(randomimg7)
@ -128,23 +137,18 @@ for i in scanimg:
elif word.lower() in leastcommon_list and len(word) < 8: elif word.lower() in leastcommon_list and len(word) < 8:
oim.paste(out4, (c[0], c[1])) oim.paste(out4, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) < 11: elif word.lower() in leastcommon_list and len(word) < 11:
oim.paste(out7, (c[0], c[1])) oim.paste(out7, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) > 8: elif word.lower() in leastcommon_list and len(word) > 8:
oim.paste(out_more, (c[0], c[1])) oim.paste(out_more, (c[0], c[1]))
print ('Excluding:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
@ -155,20 +159,18 @@ for i in scanimg:
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save images into PDF # save images into PDF
outputs = glob.glob('output/erase-replace/*replace.jpg') outputs = glob.glob('output/erase-replace/*replace.jpg')
print ("Saving to PDF:", outputs) print ('')
print ("Saving to PDF: output/erase-replace/Replace.pdf")
def makePdf(pdfFileName, listPages, dir = ''): def makePdf(pdfFileName, listPages, dir = ''):
if (dir): if (dir):
dir += "/" dir += "/"
cover = Image.open(dir + str(listPages[0])) cover = Image.open(dir + str(listPages[0]))
width, height = cover.size width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height]) pdf = FPDF(unit = "pt", format = [width, height])
for page in listPages: for page in listPages:
pdf.add_page() pdf.add_page()
pdf.image(dir + str(page), 0, 0) pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F") pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Replace', outputs, dir = '') makePdf('output/erase-replace/Replace', outputs, dir = '')
@ -177,16 +179,5 @@ makePdf('output/erase-replace/Replace', outputs, dir = '')
files = glob.glob('./output/erase-replace/*replace.jpg') files = glob.glob('./output/erase-replace/*replace.jpg')
for f in files: for f in files:
os.remove(f) os.remove(f)
files = glob.glob('./output/erase-replace/crops4/*.png')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops7/*.png')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops_more/*.png')
for f in files:
os.remove(f)
shutil.rmtree('./temp/')

Loading…
Cancel
Save