diff --git a/.DS_Store b/.DS_Store index a1888eb..b9d3dc6 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/bounding_box.py b/bounding_box.py old mode 100644 new mode 100755 index 0f4c481..ff55ba7 --- a/bounding_box.py +++ b/bounding_box.py @@ -29,6 +29,6 @@ while True: d+=1 except: - logging.exception("message") + # logging.exception("message") print("All pages must be ready!") break diff --git a/burstpdf.py b/burstpdf.py deleted file mode 100755 index 32e0e9c..0000000 --- a/burstpdf.py +++ /dev/null @@ -1,43 +0,0 @@ -#Based in the code in https://iq.opengenus.org/pdf_to_image_in_python/ - -import pdf2image -from PIL import Image -import time - -#DECLARE CONSTANTS -PDF_PATH = ("scans/out.pdf") -DPI = 72 -FIRST_PAGE = None -LAST_PAGE = None -FORMAT = 'jpg' -THREAD_COUNT = 1 -USERPWD = None -USE_CROPBOX = False -STRICT = False - -def pdftopil(): - #This method reads a pdf and converts it into a sequence of images - #PDF_PATH sets the path to the PDF file - #dpi parameter assists in adjusting the resolution of the image - #first_page parameter allows you to set a first page to be processed by pdftoppm - #last_page parameter allows you to set a last page to be processed by pdftoppm - #fmt parameter allows to set the format of pdftoppm conversion (PpmImageFile, TIFF) - #thread_count parameter allows you to set how many thread will be used for conversion. 
- #userpw parameter allows you to set a password to unlock the converted PDF - #use_cropbox parameter allows you to use the crop box instead of the media box when converting - #strict parameter allows you to catch pdftoppm syntax error with a custom type PDFSyntaxError - - start_time = time.time() - pil_images = pdf2image.convert_from_path(PDF_PATH, dpi=DPI, first_page=FIRST_PAGE, last_page=LAST_PAGE, fmt=FORMAT, thread_count=THREAD_COUNT, userpw=USERPWD, use_cropbox=USE_CROPBOX, strict=STRICT) - print ("Time taken : " + str(time.time() - start_time)) - return pil_images - -def save_images(pil_images): - d = 0 - for image in pil_images: - image.save(("split/input%d"%d) + ".jpg") - d += 1 - -if __name__ == "__main__": - pil_images = pdftopil() - save_images(pil_images) diff --git a/change_res.sh b/change_res.sh new file mode 100755 index 0000000..0b1dd0e --- /dev/null +++ b/change_res.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +i=0 +for img in `ls scans/*.jpg`; do + convert $img -density 72 split/input$i.jpg + i=$((i+1)); +done diff --git a/delete_and_start_over.sh b/delete_and_start_over.sh new file mode 100755 index 0000000..6eda002 --- /dev/null +++ b/delete_and_start_over.sh @@ -0,0 +1,6 @@ +rm -R scans split rotated bounding_box ocred +mkdir -p scans +mv out.pdf $(date +%F-%H:%M).pdf && touch out.pdf +sleep 2 +cp *.pdf ~/Desktop +rm *.pdf diff --git a/merge_scans.sh b/merge_scans.sh deleted file mode 100755 index 21211f9..0000000 --- a/merge_scans.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -#line 3 means here -# cd "$(dirname "$0")" - -cd scans -pwd -convert *.jpg out.pdf diff --git a/mirror_crop.py b/mirror_crop.py old mode 100644 new mode 100755 diff --git a/readme.md b/readme.md old mode 100644 new mode 100755 index a417292..ea9add1 --- a/readme.md +++ b/readme.md @@ -55,7 +55,7 @@ sudo pip3 install pdf2image Pillow opencv-python pytesseract

Make all the files executable.

```bash -sudo chmod 777 merge_scans.sh workshop_stream.sh marge_files.sh +sudo chmod 777 merge_files.sh workshop_stream.sh rename_scans.sh change_res.sh delete_and_start_over.sh ```

In case you want to skip any of the scripts just comment out in the shell code, workshop_stream.sh.

@@ -116,15 +116,15 @@ mkdir bounding_box mkdir cropped ``` ###Merge the files in the directory scans -

All the scans will be appended to one pdf called out.pdf

+

All the scans will be renamed sequentially (input0.jpg, input1.jpg, ...) so they can be iterated later.

```bash -./merge_scans.sh +./rename_scans.sh ``` ###Burst the pdf in scans -

Burst this pdf, renaming all the files so they can be iterated later.

+

Change the resolution of the scans so that they are lighter to process.

```bash -python3 burstpdf.py +./change_res.sh ``` ###Rotate the pdfs @@ -140,7 +140,7 @@ python3 bounding_box.py ``` ###Crop the mirror -

The pages are now cropped, but the mirror is still visible in the middle. I commented it out because if the cameras are positioned correctly there is no need for this step.

+

The pages are now cropped, but the mirror may still be visible at the edge. This happens if the cameras are not adjusted properly. This step is commented out in workshop_stream.sh because it is not needed when the cameras are positioned correctly.

```bash python3 mirror_crop.py ``` @@ -156,6 +156,12 @@ python3 tesseract_ocr.py ```bash ./merge_files.sh ``` + +##START OVER +

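Run delete_and_start_over.sh to remove the working directories (scans, split, rotated, bounding_box, ocred), copy the generated PDFs to ~/Desktop, and start over.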

+```bash +./delete_and_start_over.sh +```

## License The package is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). diff --git a/remove.sh b/remove.sh old mode 100644 new mode 100755 index 8c11f7b..7363bcb --- a/remove.sh +++ b/remove.sh @@ -1,5 +1,4 @@ cd split pwd -rm page0.jpg +rm input0.jpg rm `ls *.jpg | tail -n 1` -rm .DS_Store diff --git a/rename_scans.sh b/rename_scans.sh new file mode 100755 index 0000000..d509cc3 --- /dev/null +++ b/rename_scans.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +i=0 +for img in `ls scans/*.jpg`; do + mv $img scans/input$i.jpg + i=$((i+1)); +done diff --git a/rotation.py b/rotation.py old mode 100644 new mode 100755 diff --git a/scans/.DS_Store b/scans/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/scans/.DS_Store and /dev/null differ diff --git a/workshop_stream.sh b/workshop_stream.sh old mode 100644 new mode 100755 index b64026f..9cac6e5 --- a/workshop_stream.sh +++ b/workshop_stream.sh @@ -1,13 +1,13 @@ -./merge_scans.sh +./rename_scans.sh mkdir -p split -python3 burstpdf.py +./change_res.sh +./remove.sh mkdir -p rotated python3 rotation.py mkdir -p bounding_box python3 bounding_box.py -mkdir -p cropped +# mkdir -p cropped # python3 mirror_crop.py mkdir -p ocred python3 tesseract_ocr.py -./remove.sh ./merge_files.sh