commit b078f1c75b69371f499fffc4ce1cd9651aff106f
Author: Pedro Sá Couto
Date:   Sun Jan 26 14:43:30 2020 +0100

    Needs to be tested

diff --git a/burstpdf.py b/burstpdf.py
new file mode 100755
index 0000000..596c203
--- /dev/null
+++ b/burstpdf.py
@@ -0,0 +1,49 @@
+#Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
+
+import pdf2image
+from PIL import Image
+import time
+
+#DECLARE CONSTANTS
+#NOTE: this prompt runs as soon as the module is loaded
+PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): "))
+DPI = 200
+FIRST_PAGE = None
+LAST_PAGE = None
+FORMAT = 'jpg'
+THREAD_COUNT = 1
+USERPWD = None
+USE_CROPBOX = False
+STRICT = False
+
+def pdftopil():
+    """Read the PDF at PDF_PATH and convert it into a sequence of images.
+
+    The module-level constants are passed to pdf2image.convert_from_path:
+    dpi adjusts the resolution of the images
+    first_page sets the first page to be processed by pdftoppm
+    last_page sets the last page to be processed by pdftoppm
+    fmt sets the format of the pdftoppm conversion
+    thread_count sets how many threads are used for the conversion
+    userpw sets the password used to unlock the PDF
+    use_cropbox uses the crop box instead of the media box when converting
+    strict turns pdftoppm syntax errors into PDFSyntaxError
+
+    Prints the time the conversion took and returns the images.
+    """
+    start_time = time.time()
+    pil_images = pdf2image.convert_from_path(PDF_PATH, dpi=DPI, first_page=FIRST_PAGE, last_page=LAST_PAGE, fmt=FORMAT, thread_count=THREAD_COUNT, userpw=USERPWD, use_cropbox=USE_CROPBOX, strict=STRICT)
+    print ("Time taken : " + str(time.time() - start_time))
+    return pil_images
+
+def save_images(pil_images):
+    """Save every image as split/input<N>.jpg, numbering from 1.
+
+    The split/ directory must already exist (workshop_stream.sh creates it).
+    """
+    for d, image in enumerate(pil_images, start=1):
+        image.save("split/input%d.jpg" % d)
+
+if __name__ == "__main__":
+    pil_images = pdftopil()
+    save_images(pil_images)
diff --git a/chmod.sh b/chmod.sh
new file mode 100755
index 0000000..8cbeb25
--- /dev/null
+++ b/chmod.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# NOTE(review): 777 makes every file here world-writable; if the goal is only
+# to make the scripts runnable, "chmod +x" would be enough -- confirm intent.
+sudo chmod 777 *
diff --git a/crop.py b/crop.py
new file mode 100644
index 0000000..ce0ca1b
--- /dev/null
+++ b/crop.py
@@ -0,0 +1,65 @@
+import cv2
+import time
+import logging
+
+# a row or column counts as content when its brightest pixel exceeds this
+THRESHOLD = 25
+
+
+def find_bright_span(maxes, threshold):
+    """Return (start, end) of the first run of values above threshold.
+
+    start is the first index whose value is greater than threshold, or 0
+    when there is none.  end is the first later index whose value is less
+    than threshold, or len(maxes) when there is none.
+    """
+    start = 0
+    end = len(maxes)
+    started = False
+    for i, value in enumerate(maxes):
+        if not started:
+            if value > threshold:
+                start = i
+                started = True
+        elif value < threshold:
+            end = i
+            break
+    return start, end
+
+
+d = 1
+
+while True:
+    # rotation.py writes rotated/input<N>.jpg and tesseract_ocr.py reads
+    # split/page<N>.jpg, so crop from the former into the latter
+    source = 'rotated/input%d.jpg' % d
+    page = 'split/page%d.jpg' % d
+
+    time.sleep(1)
+    print("Value of d is:",d,"\n","Page name:",source)
+
+    img = cv2.imread(source, 0) # load grayscale version
+    if img is None:
+        # imread returns None (it does not raise) when the file is missing
+        # or unreadable; treat that as the end of the numbered pages
+        print("All pages must be ready!")
+        break
+
+    try:
+        # the indices where the useful region starts and ends, taken from
+        # the brightest pixel of each row (axis 1) and each column (axis 0)
+        hStart, hEnd = find_bright_span(img.max(1), THRESHOLD)
+        vStart, vEnd = find_bright_span(img.max(0), THRESHOLD)
+
+        # load the color image and choose only the useful area from it
+        img2 = (cv2.imread(source))[hStart:hEnd, vStart:vEnd,:]
+
+        # write the cropped image; imwrite signals failure by returning False
+        if not cv2.imwrite(page, img2):
+            raise IOError("could not write %s" % page)
+    except Exception:
+        logging.exception("Could not crop %s", source)
+        raise SystemExit(1)
+
+    d+=1
+    print("Value of d is:", d)
diff --git a/merge_files.sh b/merge_files.sh
new file mode 100755
index 0000000..b25b3e0
--- /dev/null
+++ b/merge_files.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#line 3 means here
+# cd "$(dirname "$0")"
+
+# stop here rather than merging whatever PDFs sit in the current directory
+cd ocred || exit 1
+pwd
+
+# collect page1.pdf, page2.pdf, ... in numeric order; a plain *.pdf glob sorts
+# page10.pdf before page2.pdf and would also pick up an earlier final.pdf
+pages=()
+i=1
+while [ -f "page$i.pdf" ]; do
+    pages+=("page$i.pdf")
+    i=$((i + 1))
+done
+
+if [ "${#pages[@]}" -eq 0 ]; then
+    echo "No page PDFs found in ocred/" >&2
+    exit 1
+fi
+
+pdftk "${pages[@]}" cat output final.pdf
diff --git a/rotation.py b/rotation.py
new file mode 100644
index 0000000..a802fa6
--- /dev/null
+++ b/rotation.py
@@ -0,0 +1,38 @@
+from PIL import Image
+import time
+
+i = 1
+
+while True:
+    try:
+        page = Image.open("split/input%i.jpg"%i)
+    except FileNotFoundError:
+        # burstpdf.py numbers the pages without gaps, so the first missing
+        # file means every page has been rotated
+        print("All pages must be ready!")
+        break
+
+    if i % 2 == 0:
+        #show which branch is running
+        print("trying even")
+        #even pages are rotated by 90 degrees
+        angle = 90
+        label = 'This is an even page number'
+        pause = 2
+    else:
+        #show which branch is running
+        print("trying odd")
+        #odd pages are rotated by 270 degrees
+        angle = 270
+        label = 'This is an odd page number'
+        pause = 1
+
+    with page:
+        out = page.rotate(angle, expand=True)
+    out.save('rotated/input%i.jpg'%i)
+    print(label)
+
+    time.sleep(pause)
+    print("variable i: ", i)
+
+    i+=1
diff --git a/tesseract_ocr.py b/tesseract_ocr.py
new file mode 100755
index 0000000..0d62b57
--- /dev/null
+++ b/tesseract_ocr.py
@@ -0,0 +1,24 @@
+# import libraries
+from PIL import Image
+import pytesseract
+import time
+
+i = 1
+
+while True:
+    try:
+        img = Image.open("split/page%i.jpg"%i)
+    except FileNotFoundError:
+        # only a missing page file ends the loop; OCR failures propagate
+        # instead of being reported as a normal end of input
+        print("All pages must be ready!")
+        break
+
+    with img:
+        print(img)
+        pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
+    time.sleep(1)
+    with open(("ocred/page%i.pdf"%i), "wb") as file:
+        file.write(pdf)
+    i+=1
+    print(i)
diff --git a/workshop_stream.sh b/workshop_stream.sh
new file mode 100755
index 0000000..c3f04b1
--- /dev/null
+++ b/workshop_stream.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# stop at the first failing step instead of feeding bad input to the next one
+set -e
+
+# -p keeps a second run from failing on directories that already exist
+mkdir -p split
+mkdir -p rotated
+mkdir -p ocred
+python3 burstpdf.py
+python3 rotation.py
+python3 crop.py
+python3 tesseract_ocr.py
+./merge_files.sh