Needs to be tested

5 years ago · b078f1c75b
commit b078f1c75b
7 changed files with 186 additions and 0 deletions
--- a/burstpdf.py
+++ b/burstpdf.py
@ -0,0 +1,43 @@
 #Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
 import pdf2image
 from PIL import Image
 import time
 #DECLARE CONSTANTS
 # Each constant below is forwarded by pdftopil() to
 # pdf2image.convert_from_path() under the keyword noted next to it.
 # NOTE: PDF_PATH is read interactively with input() as soon as this module is
 # loaded, before any function is called.
 PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): "))
 DPI = 200  # passed as dpi=
 FIRST_PAGE = None  # passed as first_page=
 LAST_PAGE = None  # passed as last_page=
 FORMAT = 'jpg'  # passed as fmt=
 THREAD_COUNT = 1  # passed as thread_count=
 USERPWD = None  # passed as userpw=
 USE_CROPBOX = False  # passed as use_cropbox=
 STRICT = False  # passed as strict=
 def pdftopil():
    """Convert the PDF at PDF_PATH into a sequence of images.
    All settings come from the module-level constants and are handed to
    pdf2image.convert_from_path() as keyword arguments:
      dpi           resolution of the rendered images
      first_page    first page to be processed
      last_page     last page to be processed
      fmt           output format of the conversion
      thread_count  how many threads are used for the conversion
      userpw        password used to unlock the PDF
      use_cropbox   use the crop box instead of the media box
      strict        report PDF syntax errors as exceptions
    The elapsed wall-clock time is printed, then the images are returned.
    """
    began_at = time.time()
    rendered_pages = pdf2image.convert_from_path(
        PDF_PATH,
        dpi=DPI,
        first_page=FIRST_PAGE,
        last_page=LAST_PAGE,
        fmt=FORMAT,
        thread_count=THREAD_COUNT,
        userpw=USERPWD,
        use_cropbox=USE_CROPBOX,
        strict=STRICT,
    )
    elapsed = time.time() - began_at
    print("Time taken : " + str(elapsed))
    return rendered_pages
 def save_images(pil_images):
    """Save each image as split/input<N>.jpg, numbering from 1 in iteration order."""
    for page_number, page_image in enumerate(pil_images, start=1):
        target_path = "split/input%d" % page_number + ".jpg"
        page_image.save(target_path)
 if __name__ == "__main__":
    # Script entry point: render the PDF, then write one JPEG per page.
    save_images(pdftopil())
--- a/chmod.sh
+++ b/chmod.sh
@ -0,0 +1,3 @@
 #!/bin/bash
 # Make the pipeline's shell scripts executable.
 # Only the execute bit is added, and only on the *.sh files: mode 777 on every
 # file would leave the whole directory world-writable. sudo is not needed for
 # files owned by the current user; run this script under sudo only if the
 # files belong to someone else.
 chmod a+x ./*.sh
--- a/crop.py
+++ b/crop.py
@ -0,0 +1,67 @@
 import cv2
 import time
 import logging
 # Input comes from rotation.py (rotated/input<N>.jpg) and output goes where
 # tesseract_ocr.py reads it (split/page<N>.jpg), so the pipeline steps connect.
 INPUT_PATTERN = 'rotated/input%d.jpg'
 OUTPUT_PATTERN = 'split/page%d.jpg'
 # Pixel values at or below this are treated as empty background.
 THRESHOLD = 25
 def find_content_span(max_values, threshold):
    """Return (start, end) of the first run of values above threshold.
    start is the first index whose value is greater than threshold (0 when
    there is none). end is the first index after start whose value is less
    than threshold (len(max_values) when there is none). A value equal to
    threshold neither starts nor ends the run.
    """
    start = 0
    end = len(max_values)
    started = False
    for index, value in enumerate(max_values):
        if not started:
            if value > threshold:
                start = index
                started = True
        elif value < threshold:
            end = index
            break
    return start, end
 def crop_page(source_path, target_path, threshold=THRESHOLD):
    """Crop one page to its useful region and write it to target_path.
    Returns False without writing anything when source_path cannot be read
    as an image, True otherwise.
    """
    gray = cv2.imread(source_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # cv2.imread reports a missing or unreadable file by returning None.
        return False
    # Per-row and per-column maxima locate the rows/columns that hold content.
    row_start, row_end = find_content_span(gray.max(1), threshold)
    col_start, col_end = find_content_span(gray.max(0), threshold)
    # Crop the colour version so the output keeps its colour channels.
    color = cv2.imread(source_path)
    cv2.imwrite(target_path, color[row_start:row_end, col_start:col_end, :])
    return True
 def main():
    """Crop pages 1, 2, 3, ... until the next input image is missing."""
    d = 1
    while True:
        source_path = INPUT_PATTERN % d
        print("Value of d is:", d, "\n", "Page name:", source_path)
        if not crop_page(source_path, OUTPUT_PATTERN % d):
            print("All pages must be ready!")
            break
        d += 1
        print("Value of d is:", d)
 if __name__ == "__main__":
    main()
--- a/merge_files.sh
+++ b/merge_files.sh
@ -0,0 +1,7 @@
 #!/bin/bash
 # Merge the per-page PDFs in ocred/ (page1.pdf, page2.pdf, ...) into final.pdf.
 # To make the script independent of the caller's working directory, enable
 # the next line; it changes to the directory this script lives in:
 # cd "$(dirname "$0")"
 cd ocred || exit 1
 pwd
 # Collect the pages by number. A plain *.pdf glob sorts page10.pdf before
 # page2.pdf and would also pick up a final.pdf left by an earlier run.
 pages=()
 n=1
 while [ -f "page$n.pdf" ]; do
    pages+=("page$n.pdf")
    n=$((n + 1))
 done
 if [ "${#pages[@]}" -eq 0 ]; then
    echo "No page PDFs found in $(pwd)" >&2
    exit 1
 fi
 pdftk "${pages[@]}" cat output final.pdf
--- a/rotation.py
+++ b/rotation.py
@ -0,0 +1,36 @@
 from PIL import Image
 import time
 def rotation_angle(page_number):
    """Return the rotation in degrees for a page: 90 for even pages, 270 for odd pages."""
    return 90 if page_number % 2 == 0 else 270
 def main():
    """Rotate split/input<N>.jpg into rotated/input<N>.jpg for N = 1, 2, 3, ...
    Stops at the first page number whose input file does not exist.
    """
    i = 1
    while True:
        try:
            page = Image.open("split/input%i.jpg" % i)
        except FileNotFoundError:
            # Running out of input files is the normal way to finish.
            print("All pages must be ready!")
            break
        # The with-block closes the source file once the rotated copy is saved.
        with page:
            if i % 2 == 0:
                print('This is an even page number')
            else:
                print('This is an odd page number')
            # expand=True enlarges the output so the rotated page is not clipped.
            out = page.rotate(rotation_angle(i), expand=True)
            out.save('rotated/input%i.jpg' % i)
        print("variable i: ", i)
        i += 1
 if __name__ == "__main__":
    main()
--- a/tesseract_ocr.py
+++ b/tesseract_ocr.py
@ -0,0 +1,22 @@
 # import libraries
 from PIL import Image
 import pytesseract
 import time
 def main():
    """OCR split/page<N>.jpg into ocred/page<N>.pdf for N = 1, 2, 3, ...
    Stops at the first page number whose input image does not exist. Any
    other failure (for example from pytesseract) is left to propagate so it
    is not mistaken for a normal end of input.
    """
    i = 1
    while True:
        try:
            img = Image.open("split/page%i.jpg" % i)
        except FileNotFoundError:
            # Running out of input files is the normal way to finish.
            print("All pages must be ready!")
            break
        # The with-block closes the image file once OCR is done.
        with img:
            print(img)
            pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
        # The with-block closes the output file even if the write fails.
        with open("ocred/page%i.pdf" % i, "wb") as file:
            file.write(pdf)
        i += 1
        print(i)
 if __name__ == "__main__":
    main()
--- a/workshop_stream.sh
+++ b/workshop_stream.sh
@ -0,0 +1,8 @@
 #!/bin/bash
 # Full pipeline: split the PDF into page images, rotate them, crop them,
 # OCR each page to a PDF and merge the page PDFs into one file.
 # -p keeps mkdir quiet when the directories already exist from an earlier run.
 mkdir -p split rotated ocred
 python3 burstpdf.py
 python3 rotation.py
 python3 crop.py
 python3 tesseract_ocr.py
 ./merge_files.sh