Needs to be tested
commit
b078f1c75b
@ -0,0 +1,43 @@
|
||||
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
|
||||
|
||||
import pdf2image
|
||||
from PIL import Image
|
||||
import time
|
||||
|
||||
# Conversion settings. Each value is forwarded unchanged to
# pdf2image.convert_from_path() by pdftopil().
# NOTE: the PDF path is asked for interactively as soon as this module loads.
PDF_PATH = input("What pdf do you want to use? (include extention as example.pdf): ")
DPI = 200
FIRST_PAGE = None
LAST_PAGE = None
FORMAT = 'jpg'
THREAD_COUNT = 1
USERPWD = None
USE_CROPBOX = False
STRICT = False
|
||||
|
||||
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    All options come from the module-level constants and are forwarded to
    pdf2image.convert_from_path():

    - dpi: adjusts the resolution of the images
    - first_page / last_page: first and last page processed by pdftoppm
    - fmt: format of the pdftoppm conversion
    - thread_count: how many threads are used for the conversion
    - userpw: password used to unlock the PDF
    - use_cropbox: use the crop box instead of the media box when converting
    - strict: catch pdftoppm syntax errors as the custom PDFSyntaxError type

    The elapsed wall-clock time of the conversion is printed.

    Returns:
        Whatever pdf2image.convert_from_path() returns for these options.
    """
    conversion_options = {
        "dpi": DPI,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }

    started_at = time.time()
    pages = pdf2image.convert_from_path(PDF_PATH, **conversion_options)
    elapsed = time.time() - started_at
    print("Time taken : " + str(elapsed))
    return pages
|
||||
|
||||
def save_images(pil_images):
    """Save every image as split/input<N>.jpg, numbering from 1 in input order.

    Args:
        pil_images: iterable of objects exposing a ``save(path)`` method.
    """
    for page_number, page_image in enumerate(pil_images, start=1):
        page_image.save("split/input%d" % page_number + ".jpg")
|
||||
|
||||
if __name__ == "__main__":
    # Burst the PDF into page images, then write them into split/.
    save_images(pdftopil())
|
@ -0,0 +1,67 @@
|
||||
import cv2
|
||||
import time
|
||||
import logging
|
||||
|
||||
# Pixel values above this threshold are treated as the useful region.
THRESHOLD = 25

# The rotation step writes rotated/input<N>.jpg and the OCR step reads
# split/page<N>.jpg, so this step has to sit between those two locations.
SOURCE_PATTERN = 'rotated/input%d.jpg'
CROPPED_PATTERN = 'split/page%d.jpg'


def content_span(maxes, threshold=THRESHOLD):
    """Return ``(start, end)`` indices of the first useful run in *maxes*.

    ``start`` is the first index whose value is greater than *threshold*;
    ``end`` is the first later index whose value is less than *threshold*.
    If no value exceeds the threshold, ``start`` stays 0; if the run never
    drops below the threshold, ``end`` stays ``len(maxes)``.

    Args:
        maxes: sequence of per-row or per-column maximum pixel values.
        threshold: pixel value separating background from useful content.
    """
    start, end = 0, len(maxes)
    inside_run = False
    for index, value in enumerate(maxes):
        if not inside_run:
            if value > threshold:
                start = index
                inside_run = True
        elif value < threshold:
            end = index
            break
    return start, end


def crop_page(source_path, cropped_path, threshold=THRESHOLD):
    """Crop one page image down to its useful region.

    Args:
        source_path: image file to read.
        cropped_path: where the cropped colour image is written.
        threshold: forwarded to content_span().

    Returns:
        True when the page was cropped and written, False when
        *source_path* could not be read (cv2.imread returned None).

    Raises:
        OSError: if the cropped image could not be written.
    """
    gray = cv2.imread(source_path, 0)  # load grayscale version
    if gray is None:
        return False

    # Row maxima give the vertical extent, column maxima the horizontal one.
    top, bottom = content_span(gray.max(1), threshold)
    left, right = content_span(gray.max(0), threshold)

    # Load the colour image and keep only the useful area of it.
    cropped = cv2.imread(source_path)[top:bottom, left:right, :]
    if not cv2.imwrite(cropped_path, cropped):
        raise OSError("could not write %s" % cropped_path)
    return True


def main():
    """Crop page 1, 2, 3, ... until the next source image is missing."""
    page_number = 1
    while True:
        source_path = SOURCE_PATTERN % page_number
        print("Value of d is:", page_number, "\n", "Page name:", source_path)
        if not crop_page(source_path, CROPPED_PATTERN % page_number):
            break
        page_number += 1
        print("Value of d is:", page_number)
    print("All pages must be ready!")


if __name__ == "__main__":
    main()
|
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Merge the per-page PDFs in ocred/ into ocred/final.pdf, in page order.
# Run this from the directory that contains ocred/.

# Stop here if the directory is missing, so pdftk never runs elsewhere.
cd ocred || exit 1
pwd

# Collect page1.pdf, page2.pdf, ... in numeric order. A plain *.pdf glob
# sorts lexicographically (page10.pdf before page2.pdf) and would also pick
# up a final.pdf left over from an earlier run.
pages=()
n=1
while [ -f "page${n}.pdf" ]; do
    pages+=("page${n}.pdf")
    n=$((n + 1))
done

if [ "${#pages[@]}" -eq 0 ]; then
    echo "No page PDFs found in $(pwd)" >&2
    exit 1
fi

pdftk "${pages[@]}" cat output final.pdf
|
@ -0,0 +1,36 @@
|
||||
from PIL import Image
|
||||
import time
|
||||
|
||||
def rotation_angle(page_number):
    """Return the rotation in degrees for a page: 90 if even, 270 if odd."""
    return 90 if page_number % 2 == 0 else 270


def main():
    """Rotate split/input<N>.jpg into rotated/input<N>.jpg for N = 1, 2, ...

    Processing stops cleanly at the first page number whose input file does
    not exist.
    """
    page_number = 1
    while True:
        try:
            page = Image.open("split/input%i.jpg" % page_number)
        except FileNotFoundError:
            # Running out of input files is the normal way to finish.
            print("All pages must be ready!")
            break

        parity = "even" if page_number % 2 == 0 else "odd"
        print("trying " + parity)

        # The context manager closes the source image file once it is rotated.
        with page:
            rotated = page.rotate(rotation_angle(page_number), expand=True)
            rotated.save('rotated/input%i.jpg' % page_number)

        print("This is an " + parity + " page number")
        print("variable i: ", page_number)
        page_number += 1


if __name__ == "__main__":
    main()
|
@ -0,0 +1,22 @@
|
||||
# import libraries
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
import time
|
||||
|
||||
def main():
    """OCR split/page<N>.jpg into ocred/page<N>.pdf for N = 1, 2, ...

    Processing stops cleanly at the first page number whose image file does
    not exist. Any other failure (for example from pytesseract) propagates
    instead of being reported as a normal finish.
    """
    page_number = 1
    while True:
        try:
            img = Image.open("split/page%i.jpg" % page_number)
        except FileNotFoundError:
            # Running out of page images is the normal way to finish.
            print("All pages must be ready!")
            break

        # The context manager closes the image file after OCR.
        with img:
            print(img)
            pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')

        with open("ocred/page%i.pdf" % page_number, "wb") as pdf_file:
            pdf_file.write(pdf)

        page_number += 1
        print(page_number)


if __name__ == "__main__":
    main()
|
@ -0,0 +1,8 @@
|
||||
# Run the whole pipeline: burst the PDF, rotate, crop, OCR, then merge.

# -p keeps re-runs quiet when the working directories already exist.
mkdir -p split rotated ocred

python3 burstpdf.py
python3 rotation.py
python3 crop.py
python3 tesseract_ocr.py
./merge_files.sh
|
Loading…
Reference in New Issue