From ae1a91eef700a26d81d4e95b6f8ffba2fc644519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pedro=20S=C3=A1=20Couto?= Date: Mon, 27 Jan 2020 23:05:56 +0100 Subject: [PATCH] Still need to fix the mirror margins --- .DS_Store | Bin 0 -> 8196 bytes burstpdf.py | 2 +- crop.py | 67 --------------------------------------------- mask_crop.py | 34 +++++++++++++++++++++++ merge_files.sh | 2 +- merge_scans.sh | 7 +++++ tesseract_ocr.py | 2 +- workshop_stream.sh | 4 ++- 8 files changed, 47 insertions(+), 71 deletions(-) create mode 100644 .DS_Store delete mode 100644 crop.py create mode 100644 mask_crop.py create mode 100755 merge_scans.sh diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..59fbf2370bb4a8a81115ba9318790f1954867408 GIT binary patch literal 8196 zcmeI1!D<^Z5Qax-4hEMHddhLnJ;dbHLLe-OOK1rtkmQ`K9XBy`y~bXX=C-fUSLiGB z9r9YqHzCYo2}AV^4$-kRvQDxz<)6$R1oYbuqYEoPRoY ziB}hw2DI83Fb1j&Xzt#Wj!b1Nlezr;{^KC;b_RI|4c%02Hr2+X7e7ryx^gSuBq!tF z#rAeV{});ZtMo@j=VUr5#^}A13ufqfA6jxMXL2Mhp4g-JD_-T?Cuskemi{W_o}Ru8 z<%a%q^uE)V$cbdEWQa#PSC77ey2UJAe)q8{Xt#J`#e2N@9KETGq5qndX5O1iKbLx> zw;YQs?ZeW;%&NDrQ(tbVFS!}%%YfFTma${b(;t6o`%l?hxvCi^SL(s){J6fQjJH?M z&L{MLE@fxIc8%DndiES=^Q&Ko4OZKX@tzU2eWHF?8*v9j+|>*(+dk$eu&8mD!D&I{ z`XS!Ff1bm>|FB_V46Hi?uY!X+ zn*V=Te*eGj%P{|p0b}4V7;u~Iv$lT1V{ToUN^|Xmaz#;5y)vLTp;8s?_vtwJ`wv6f XNgPWfmH`<->>|L^V3jfOq6|C%^s@(W literal 0 HcmV?d00001 diff --git a/burstpdf.py b/burstpdf.py index 596c203..ac20d02 100755 --- a/burstpdf.py +++ b/burstpdf.py @@ -5,7 +5,7 @@ from PIL import Image import time #DECLARE CONSTANTS -PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): ")) +PDF_PATH = ("scans/out.pdf") DPI = 200 FIRST_PAGE = None LAST_PAGE = None diff --git a/crop.py b/crop.py deleted file mode 100644 index ce0ca1b..0000000 --- a/crop.py +++ /dev/null @@ -1,67 +0,0 @@ -import cv2 -import time -import logging - -d = 1 - -while True: - try: - threshold = 25 - time.sleep(1) - - input = ('input%d.jpg'%d) - page = ('page%d.jpg'%d) - - print("Value of d is:",d,"\n","Page name:",input) - img = cv2.imread(input, 0) # load grayscale version - - # the indeces where the useful region starts and ends - hStrart = 0 - hEnd = img.shape[0] - vStart = 0 - vEnd = img.shape[1] - - # get row and column maxes for each row and column - hMax = img.max(1) - vMax = img.max(0) - - hDone_flag = False - vDone_flag = False - - # go through the list of max and begin where the pixel value is greater - # than the threshold - for i in range(hMax.size): - if not hDone_flag: - if hMax[i] > threshold: - hStart = i - hDone_flag = True - - if hDone_flag: - if hMax[i] < threshold: - hEnd = i - break - - for i in range(vMax.size): - if not vDone_flag: - if vMax[i] > threshold: - vStart = i - vDone_flag = True - - if vDone_flag: - if vMax[i] < threshold: - vEnd = i - break - - # load the color image and choose only the useful area from it - img2 = (cv2.imread(input))[hStart:hEnd, vStart:vEnd,:] - - # write the cropped image - cv2.imwrite(page, img2) - - d+=1 - print("Value of d is:", d) - - except: - logging.exception("message") - print("All pages must be ready!") - break diff --git a/mask_crop.py b/mask_crop.py new file mode 100644 index 0000000..4b9f2ea --- /dev/null +++ b/mask_crop.py @@ -0,0 +1,34 @@ +import cv2 +import logging + +d = 1 + +while True: + try: + output = ('cropped/page%d.jpg'%d) + + # Load image, convert to grayscale, and find edges + image = cv2.imread('rotated/input%d.jpg'%d) + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY)[1] + + # Find contour and sort by contour area + cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cnts = cnts[0] if len(cnts) == 2 else cnts[1] + cnts = sorted(cnts, key=cv2.contourArea, reverse=True) + + # Find bounding box and extract ROI + for c in cnts: + x,y,w,h = cv2.boundingRect(c) + ROI = image[y:y+h, x:x+w] + break + + cv2.imwrite(output,ROI) + cv2.waitKey() + + d+=1 + + except: + logging.exception("message") + print("All pages must be ready!") + break diff --git a/merge_files.sh b/merge_files.sh index b25b3e0..004c02c 100755 --- a/merge_files.sh +++ b/merge_files.sh @@ -4,4 +4,4 @@ cd ocred pwd -pdftk *.pdf cat output final.pdf +pdfunite *.pdf out.pdf diff --git a/merge_scans.sh b/merge_scans.sh new file mode 100755 index 0000000..21211f9 --- /dev/null +++ b/merge_scans.sh @@ -0,0 +1,7 @@ +#!/bin/bash +#line 3 means here +# cd "$(dirname "$0")" + +cd scans +pwd +convert *.jpg out.pdf diff --git a/tesseract_ocr.py b/tesseract_ocr.py index 0d62b57..2e91780 100755 --- a/tesseract_ocr.py +++ b/tesseract_ocr.py @@ -7,7 +7,7 @@ i = 1 while True: try: - img = Image.open("split/page%i.jpg"%i) + img = Image.open("cropped/page%i.jpg"%i) print(img) pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf') time.sleep(1) diff --git a/workshop_stream.sh b/workshop_stream.sh index c3f04b1..187732f 100755 --- a/workshop_stream.sh +++ b/workshop_stream.sh @@ -1,8 +1,10 @@ mkdir split mkdir rotated mkdir ocred +mkdir cropped +./merge_scans.sh python3 burstpdf.py python3 rotation.py -python3 crop.py +python3 mask_crop.py python3 tesseract_ocr.py ./merge_files.sh