Still need to fix the mirror margins

5 years ago · ae1a91eef7
parent b078f1c75b
commit ae1a91eef7
8 changed files with 47 additions and 71 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/burstpdf.py
+++ b/burstpdf.py
@ -5,7 +5,7 @@ from PIL import Image
 import time
 #DECLARE CONSTANTS
-PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): "))
+PDF_PATH = ("scans/out.pdf")
 DPI = 200
 FIRST_PAGE = None
 LAST_PAGE = None
--- a/crop.py
+++ b/crop.py
@ -1,67 +0,0 @@
 import cv2
 import time
 import logging
 # Crop each input<N>.jpg (N = 1, 2, ...) to its first bright region and save
 # the result as page<N>.jpg; stop at the first page that cannot be read.
 def _content_span(maxima, threshold, default_end):
     """Return (start, end) bounds of the first run of values above threshold."""
     # start stays 0 when no value exceeds the threshold; end stays at
     # default_end when no later value drops below the threshold.
     start = 0
     started = False
     for i, value in enumerate(maxima):
         if not started:
             if value > threshold:
                 start, started = i, True
         elif value < threshold:
             return start, i
     return start, default_end
 d = 1
 threshold = 25  # pixel values above this are treated as page content
 while True:
     try:
         # NOTE(review): the one-second pause per page is kept from the
         # original; presumably it waits for an upstream step - confirm.
         time.sleep(1)
         source = 'input%d.jpg' % d
         page = 'page%d.jpg' % d
         print("Value of d is:", d, "\n", "Page name:", source)
         img = cv2.imread(source, 0)  # load grayscale version
         if img is None:
             # cv2.imread returns None for a missing or unreadable file, which
             # is how the end of the numbered page sequence shows up.
             print("All pages must be ready!")
             break
         # Indices where the useful region starts and ends, found from the
         # per-row and per-column maximum pixel values.
         hStart, hEnd = _content_span(img.max(1), threshold, img.shape[0])
         vStart, vEnd = _content_span(img.max(0), threshold, img.shape[1])
         # Load the colour image and keep only the useful area from it.
         img2 = cv2.imread(source)[hStart:hEnd, vStart:vEnd, :]
         # Write the cropped image.
         cv2.imwrite(page, img2)
         d += 1
         print("Value of d is:", d)
     except Exception:
         # Unexpected failure: log the traceback and stop, as before, but let
         # KeyboardInterrupt/SystemExit propagate.
         logging.exception("message")
         print("All pages must be ready!")
         break
--- a/mask_crop.py
+++ b/mask_crop.py
@ -0,0 +1,34 @@
 import cv2
 import logging
 # Crop each rotated/input<N>.jpg (N = 1, 2, ...) to the bounding box of its
 # largest external contour and save the result as cropped/page<N>.jpg.
 d = 1
 while True:
     try:
         output = 'cropped/page%d.jpg' % d
         image = cv2.imread('rotated/input%d.jpg' % d)
         if image is None:
             # cv2.imread returns None for a missing or unreadable file, which
             # is how the end of the numbered page sequence shows up.
             print("All pages must be ready!")
             break
         # Convert to grayscale and binarise with Otsu's threshold.
         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
         thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY)[1]
         # findContours returns 2 values in some OpenCV versions and 3 in
         # others; pick the contour list in either case.
         cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
         cnts = cnts[0] if len(cnts) == 2 else cnts[1]
         if len(cnts) > 0:
             # Bounding box of the largest contour by area.
             x, y, w, h = cv2.boundingRect(max(cnts, key=cv2.contourArea))
             ROI = image[y:y+h, x:x+w]
         else:
             # No contour found: keep the whole page rather than reusing the
             # previous page's crop or failing on an unbound name.
             ROI = image
         cv2.imwrite(output, ROI)
         d += 1
     except Exception:
         # Unexpected failure: log the traceback and stop, as before, but let
         # KeyboardInterrupt/SystemExit propagate.
         logging.exception("message")
         print("All pages must be ready!")
         break
--- a/merge_files.sh
+++ b/merge_files.sh
@ -4,4 +4,4 @@
 cd ocred
 pwd
-pdftk *.pdf cat output final.pdf
+pdfunite *.pdf out.pdf
--- a/merge_scans.sh
+++ b/merge_scans.sh
@ -0,0 +1,7 @@
 #!/bin/bash
 # Merge every JPEG in scans/ into a single PDF, scans/out.pdf.
 # Uncomment the next line to run relative to this script's own directory
 # instead of the caller's working directory.
 # cd "$(dirname "$0")"
 # Stop if scans/ is missing so convert never runs in the wrong directory.
 cd scans || exit 1
 pwd
 # The glob expands in sorted filename order, which sets the page order.
 convert *.jpg out.pdf
--- a/tesseract_ocr.py
+++ b/tesseract_ocr.py
@ -7,7 +7,7 @@ i = 1
 while True:
    try:
-        img = Image.open("split/page%i.jpg"%i)
+        img = Image.open("cropped/page%i.jpg"%i)
        print(img)
        pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
        time.sleep(1)
--- a/workshop_stream.sh
+++ b/workshop_stream.sh
@ -1,8 +1,10 @@
 # Run the scan-processing steps in order.
 # Create the working directories first.
 mkdir split
 mkdir rotated
 mkdir ocred
 mkdir cropped
 # Combine the JPEGs in scans/ into scans/out.pdf.
 ./merge_scans.sh
 # NOTE(review): presumably bursts scans/out.pdf into page images - confirm.
 python3 burstpdf.py
 # NOTE(review): rotation.py is not shown here; presumably fills rotated/ - confirm.
 python3 rotation.py
 # Crop step: mask_crop.py reads rotated/input<N>.jpg and writes cropped/page<N>.jpg.
-python3 crop.py
+python3 mask_crop.py
 # OCR each cropped/page<N>.jpg into PDF data.
 python3 tesseract_ocr.py
 # Join the PDFs in ocred/ into one file.
 ./merge_files.sh