updated erase & replace rule in makefile

master
Natasha Berting 7 years ago
parent ab85a34326
commit 0587b5cdaf

@ -100,9 +100,8 @@ overunder: ocr/output.txt ## Alice: An interpreted language that translate simpl
erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF erase:hocrs ## Natasha: Analyzes pages in order, erases least common words from view. Dependencies: PIL, html5lib, FPDF
python3 src/erase_leastcommon.py python3 src/erase_leastcommon.py
@echo 'erase rule output: output/erase-replace/Erase.pdf'
replace: ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF replace:hocrs ## Natasha: Analyzes pages in order, replace least common words with most common words. Dependencies: PIL, html5lib, FPDF
python3 src/replace_leastcommon.py python3 src/replace_leastcommon.py
visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer

@ -58,10 +58,9 @@ for i in scanimg:
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file # open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x]) f = open(hocr[x])
print ('Reading scanned image, filtering least common words.') print ('Reading scanned image, filtering least common words.')
print ("") print ('')
t = html5lib.parse(f, namespaceHTMLElements=False) t = html5lib.parse(f, namespaceHTMLElements=False)
@ -73,7 +72,9 @@ for i in scanimg:
clean_words = cleanstopwords(allwords) #clean stopwords clean_words = cleanstopwords(allwords) #clean stopwords
findleastcommon(clean_words) #find least common words and add them to list findleastcommon(clean_words) #find least common words and add them to list
print ("The least common words until text", x+1, "are:", leastcommon_list) print ("The least common words until text", x+1, "are:", leastcommon_list)
print ('')
print ('Processing word coordinates and erasing least common words.')
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image # loop through every word in hocr file to extract coordinates, then remove or paste into output image
for element in t.findall(".//span[@class='ocrx_word']"): for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text) word = filternone(element.text)
@ -83,11 +84,9 @@ for i in scanimg:
if word.lower() in leastcommon_list and len(word) < limit: if word.lower() in leastcommon_list and len(word) < limit:
oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3])) oim.paste((255, 255, 255), (c[0], c[1], c[2], c[3]))
print ('Excluding:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save and show images # save and show images
@ -98,20 +97,17 @@ for i in scanimg:
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save images into PDF # save images into PDF
outputs = glob.glob('output/erase-replace/*erase.jpg') outputs = glob.glob('output/erase-replace/*erase.jpg')
print ("Saving to PDF:", outputs) print ("Saving to PDF: output/erase-replace/Erase.pdf")
def makePdf(pdfFileName, listPages, dir = ''): def makePdf(pdfFileName, listPages, dir = ''):
if (dir): if (dir):
dir += "/" dir += "/"
cover = Image.open(dir + str(listPages[0])) cover = Image.open(dir + str(listPages[0]))
width, height = cover.size width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height]) pdf = FPDF(unit = "pt", format = [width, height])
for page in listPages: for page in listPages:
pdf.add_page() pdf.add_page()
pdf.image(dir + str(page), 0, 0) pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F") pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Erase', outputs, dir = '') makePdf('output/erase-replace/Erase', outputs, dir = '')

@ -8,6 +8,14 @@ import glob
import time import time
from fpdf import FPDF from fpdf import FPDF
import os import os
import shutil
path1 = './temp'
if not os.path.isdir(path1):
os.makedirs(path1)
os.makedirs('./temp/crops4')
os.makedirs('./temp/crops7')
os.makedirs('./temp/crops_more')
stopwords.words('english') stopwords.words('english')
sr = set(stopwords.words('english')) sr = set(stopwords.words('english'))
@ -58,10 +66,8 @@ allwords = []
scanimg = glob.glob('images-tiff/*.tiff') scanimg = glob.glob('images-tiff/*.tiff')
hocr = glob.glob('hocr/*.html') hocr = glob.glob('hocr/*.html')
num = 0 num = 0
maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned maximum = 20 / len(scanimg) # this helps the script remove words in a way that is proportional to number of pages scanned
# loop through every image in scanimg folder # loop through every image in scanimg folder
for i in scanimg: for i in scanimg:
x = x + 1 x = x + 1
@ -70,10 +76,9 @@ for i in scanimg:
oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image oim = Image.new("RGB", iim.size, (255, 255, 255)) #oim is output image
# open corresponding hocr file # open corresponding hocr file
print ("Analysing", hocr[x])
f = open(hocr[x]) f = open(hocr[x])
print ('Reading scanned image, filtering least common words.') print ('Reading scanned image and hocr file, filtering least common words.')
print ("") print ('')
t = html5lib.parse(f, namespaceHTMLElements=False) t = html5lib.parse(f, namespaceHTMLElements=False)
@ -86,10 +91,14 @@ for i in scanimg:
findleastcommon(clean_words) #find least common words and add them to list findleastcommon(clean_words) #find least common words and add them to list
mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list mostcommon_list = findmostcommon(clean_words, 30) #find most common words and add them to list
print ("The most common words until text", x+1, "are:", mostcommon_list) print ('The most common words until text', x+1, 'are:', mostcommon_list)
print ('')
# loop through every word in hocr file to extract coordinates, then remove or paste into output image # loop through every word in hocr file to extract coordinates, then remove or paste into output image
print ('Processing word coordinates and replacing least common words with most common words.')
print ('')
for element in t.findall(".//span[@class='ocrx_word']"): for element in t.findall(".//span[@class='ocrx_word']"):
word = filternone(element.text) word = filternone(element.text)
c = coordinates(element.attrib['title']) c = coordinates(element.attrib['title'])
@ -99,17 +108,17 @@ for i in scanimg:
#extract coordinates #extract coordinates
if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5: if word.lower() in mostcommon_list and len(word) > 1 and len(word) <= 5:
wim.save ("output/erase-replace/crops4/wimreplace{}.png".format(num)) wim.save ("temp/crops4/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) <= 7 : elif word in mostcommon_list and len(word) <= 7 :
wim.save ("output/erase-replace/crops7/wimreplace{}.png".format(num)) wim.save ("temp/crops7/wimreplace{}.png".format(num))
elif word in mostcommon_list and len(word) > 7 : elif word in mostcommon_list and len(word) > 7 :
wim.save ("output/erase-replace/crops_more/wimreplace{}.png".format(num)) wim.save ("temp/crops_more/wimreplace{}.png".format(num))
if x > 0: if x > 0:
# use PIL to crop out every box, then paste it according to if rule # use PIL to crop out every box, then paste it according to if rule
randomimg4 = random.choice(glob.glob('./output/erase-replace/crops4/*.png')) randomimg4 = random.choice(glob.glob('temp/crops4/*.png'))
randomimg7 = random.choice(glob.glob('./output/erase-replace/crops7/*.png')) randomimg7 = random.choice(glob.glob('temp/crops7/*.png'))
randomimg_more = random.choice(glob.glob('./output/erase-replace/crops_more/*.png')) randomimg_more = random.choice(glob.glob('temp/crops_more/*.png'))
wimreplace4 = Image.open(randomimg4) wimreplace4 = Image.open(randomimg4)
wimreplace7 = Image.open(randomimg7) wimreplace7 = Image.open(randomimg7)
@ -128,23 +137,18 @@ for i in scanimg:
elif word.lower() in leastcommon_list and len(word) < 8: elif word.lower() in leastcommon_list and len(word) < 8:
oim.paste(out4, (c[0], c[1])) oim.paste(out4, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) < 11: elif word.lower() in leastcommon_list and len(word) < 11:
oim.paste(out7, (c[0], c[1])) oim.paste(out7, (c[0], c[1]))
print ('Excluding:', word)
elif word.lower() in leastcommon_list and len(word) > 8: elif word.lower() in leastcommon_list and len(word) > 8:
oim.paste(out_more, (c[0], c[1])) oim.paste(out_more, (c[0], c[1]))
print ('Excluding:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
else: else:
oim.paste(wim, (c[0], c[1], c[2], c[3])) oim.paste(wim, (c[0], c[1], c[2], c[3]))
print ('Including:', word)
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
@ -155,20 +159,18 @@ for i in scanimg:
#-------------------------------------------------------------------------------# #-------------------------------------------------------------------------------#
# save images into PDF # save images into PDF
outputs = glob.glob('output/erase-replace/*replace.jpg') outputs = glob.glob('output/erase-replace/*replace.jpg')
print ("Saving to PDF:", outputs) print ('')
print ("Saving to PDF: output/erase-replace/Replace.pdf")
def makePdf(pdfFileName, listPages, dir = ''): def makePdf(pdfFileName, listPages, dir = ''):
if (dir): if (dir):
dir += "/" dir += "/"
cover = Image.open(dir + str(listPages[0])) cover = Image.open(dir + str(listPages[0]))
width, height = cover.size width, height = cover.size
pdf = FPDF(unit = "pt", format = [width, height]) pdf = FPDF(unit = "pt", format = [width, height])
for page in listPages: for page in listPages:
pdf.add_page() pdf.add_page()
pdf.image(dir + str(page), 0, 0) pdf.image(dir + str(page), 0, 0)
pdf.output(dir + pdfFileName + ".pdf", "F") pdf.output(dir + pdfFileName + ".pdf", "F")
makePdf('output/erase-replace/Replace', outputs, dir = '') makePdf('output/erase-replace/Replace', outputs, dir = '')
@ -178,15 +180,4 @@ files = glob.glob('./output/erase-replace/*replace.jpg')
for f in files: for f in files:
os.remove(f) os.remove(f)
files = glob.glob('./output/erase-replace/crops4/*.png') shutil.rmtree('./temp/')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops7/*.png')
for f in files:
os.remove(f)
files = glob.glob('./output/erase-replace/crops_more/*.png')
for f in files:
os.remove(f)

Loading…
Cancel
Save