Needs to be tested
commit
b078f1c75b
@ -0,0 +1,43 @@
|
|||||||
|
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
|
||||||
|
|
||||||
|
import pdf2image
|
||||||
|
from PIL import Image
|
||||||
|
import time
|
||||||
|
|
||||||
|
#DECLARE CONSTANTS
|
||||||
|
# Path of the PDF to convert; the user is prompted for it as soon as this
# module is executed.
PDF_PATH = input("What pdf do you want to use? (include extention as example.pdf): ")

# Settings that pdftopil() forwards to pdf2image.convert_from_path().
DPI = 200
FIRST_PAGE = None
LAST_PAGE = None
FORMAT = 'jpg'
THREAD_COUNT = 1
USERPWD = None
USE_CROPBOX = False
STRICT = False
|
||||||
|
|
||||||
|
def pdftopil():
    """Convert the PDF at ``PDF_PATH`` into a sequence of images.

    All options come from the module-level constants and are forwarded to
    ``pdf2image.convert_from_path``:

    * ``dpi``         -- resolution of the rendered images
    * ``first_page``  -- first page to be processed by pdftoppm
    * ``last_page``   -- last page to be processed by pdftoppm
    * ``fmt``         -- output format of the pdftoppm conversion
    * ``thread_count``-- number of threads used for the conversion
    * ``userpw``      -- password used to unlock the PDF
    * ``use_cropbox`` -- use the crop box instead of the media box
    * ``strict``      -- surface pdftoppm syntax errors as PDFSyntaxError

    The elapsed conversion time is printed before returning.

    Returns:
        The value returned by ``pdf2image.convert_from_path``.
    """
    options = {
        "dpi": DPI,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }

    began = time.time()
    pages = pdf2image.convert_from_path(PDF_PATH, **options)
    elapsed = time.time() - began

    print("Time taken : " + str(elapsed))
    return pages
|
||||||
|
|
||||||
|
def save_images(pil_images):
    """Save each image as ``split/input<N>.jpg``, numbering from 1.

    Args:
        pil_images: iterable of objects exposing ``save(path)``; they are
            written in iteration order.
    """
    for number, picture in enumerate(pil_images, start=1):
        picture.save("split/input%d" % number + ".jpg")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Render the PDF pages first, then write them to disk one by one.
    save_images(pdftopil())
|
@ -0,0 +1,67 @@
|
|||||||
|
import cv2
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Pixel intensity a row/column maximum must exceed to count as content.
THRESHOLD = 25


def content_bounds(profile, threshold=THRESHOLD):
    """Return ``(start, end)`` of the first run of values above *threshold*.

    Args:
        profile: sequence of per-row or per-column maxima.
        threshold: intensity limit; ``start`` is the first index whose value
            is strictly greater, ``end`` the first later index whose value is
            strictly smaller.

    Returns:
        A ``(start, end)`` tuple usable as a slice. ``start`` defaults to 0
        and ``end`` to ``len(profile)`` when the respective edge is never
        found, so a profile without content yields the full range.
    """
    start = 0
    end = len(profile)
    found = False
    for index, value in enumerate(profile):
        if not found:
            if value > threshold:
                start = index
                found = True
        elif value < threshold:
            end = index
            break
    return start, end


def crop_page(source, target, threshold=THRESHOLD):
    """Crop the image *source* to its useful region and write it to *target*.

    Returns:
        ``False`` when *source* could not be read (``cv2.imread`` returned
        ``None``), ``True`` once the cropped image has been written.
    """
    gray = cv2.imread(source, 0)  # load grayscale version
    if gray is None:
        return False

    # Row maxima give the vertical extent, column maxima the horizontal one.
    top, bottom = content_bounds(gray.max(1), threshold)
    left, right = content_bounds(gray.max(0), threshold)

    # Load the colour image and keep only the useful area of it.
    colour = cv2.imread(source)
    cv2.imwrite(target, colour[top:bottom, left:right, :])
    return True


def main():
    """Crop input1.jpg, input2.jpg, ... until the next page is missing."""
    # NOTE(review): both paths are relative to the current directory; confirm
    # this matches where the previous pipeline step writes its images.
    d = 1
    while True:
        time.sleep(1)

        source = 'input%d.jpg' % d
        target = 'page%d.jpg' % d
        print("Value of d is:", d, "\n", "Page name:", source)

        try:
            cropped = crop_page(source, target)
        except Exception:
            # A genuine failure: report it as such instead of pretending
            # that all pages were processed.
            logging.exception("Could not crop %s", source)
            break

        if not cropped:
            print("All pages must be ready!")
            break

        d += 1
        print("Value of d is:", d)


if __name__ == "__main__":
    main()
|
@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
# Merge the per-page PDFs found in ./ocred into ocred/final.pdf.
#
# The ocred directory is looked up relative to the caller's working
# directory. To look it up relative to this script instead, enable:
# cd "$(dirname "$0")"

cd ocred || exit 1
pwd

# Collect page1.pdf, page2.pdf, ... in numeric order. A plain *.pdf glob sorts
# lexically (page10.pdf before page2.pdf) and would also feed a final.pdf left
# over from an earlier run back into pdftk.
pages=()
n=1
while [ -f "page${n}.pdf" ]; do
    pages+=("page${n}.pdf")
    n=$((n + 1))
done

if [ "${#pages[@]}" -eq 0 ]; then
    echo "No page PDFs found in $(pwd)" >&2
    exit 1
fi

pdftk "${pages[@]}" cat output final.pdf
|
@ -0,0 +1,36 @@
|
|||||||
|
from PIL import Image
|
||||||
|
import time
|
||||||
|
|
||||||
|
def rotation_angle(page_number):
    """Return the rotation in degrees for a page: 90 if even, 270 if odd."""
    return 90 if page_number % 2 == 0 else 270


def main():
    """Rotate split/input<N>.jpg into rotated/input<N>.jpg for N = 1, 2, ...

    Stops at the first page number whose input file does not exist.
    """
    i = 1
    while True:
        try:
            page = Image.open("split/input%i.jpg" % i)
        except FileNotFoundError:
            # Running out of input files is the normal way to finish.
            print("All pages must be ready!")
            break

        is_even = i % 2 == 0
        print("trying even" if is_even else "trying odd")

        # The context manager closes the source file once the copy is saved.
        with page:
            out = page.rotate(rotation_angle(i), expand=True)
            out.save('rotated/input%i.jpg' % i)

        if is_even:
            print('This is an even page number')
        else:
            print('This is an odd page number')

        # Pacing delays kept from the original script (2 s even, 1 s odd).
        time.sleep(2 if is_even else 1)
        print("variable i: ", i)

        i += 1


if __name__ == "__main__":
    main()
|
@ -0,0 +1,22 @@
|
|||||||
|
# import libraries
|
||||||
|
from PIL import Image
|
||||||
|
import pytesseract
|
||||||
|
import time
|
||||||
|
|
||||||
|
def main():
    """OCR split/page<N>.jpg into ocred/page<N>.pdf for N = 1, 2, ...

    Stops at the first page number whose image file does not exist. Any
    other failure (OCR or writing the PDF) is left to propagate so that it
    is not mistaken for a normal end of input.
    """
    i = 1
    while True:
        try:
            img = Image.open("split/page%i.jpg" % i)
        except FileNotFoundError:
            # Running out of page images is the normal way to finish.
            print("All pages must be ready!")
            break

        # Close the image file as soon as the OCR result is available.
        with img:
            print(img)
            pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')

        time.sleep(1)

        # image_to_pdf_or_hocr returns bytes, so they can be written as-is.
        with open("ocred/page%i.pdf" % i, "wb") as out_file:
            out_file.write(pdf)

        i += 1
        print(i)


if __name__ == "__main__":
    main()
|
@ -0,0 +1,8 @@
|
|||||||
|
# Create the working directories; -p keeps a re-run from failing when they
# already exist.
mkdir -p split rotated ocred

# Run the pipeline stages in order.
python3 burstpdf.py
python3 rotation.py
python3 crop.py
python3 tesseract_ocr.py
./merge_files.sh
|
Loading…
Reference in New Issue