master
Pedro Sá Couto 4 years ago
parent d67bc4af5b
commit 5588614b44

BIN
.DS_Store vendored

Binary file not shown.

@ -29,6 +29,6 @@ while True:
d+=1
except:
logging.exception("message")
# logging.exception("message")
print("All pages must be ready!")
break

@ -1,43 +0,0 @@
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
import pdf2image
from PIL import Image
import time
# Declare constants: each one is forwarded to pdf2image.convert_from_path
# by pdftopil() below.
# Path of the PDF to convert.
PDF_PATH = ("scans/out.pdf")
# Passed as dpi= (resolution of the produced images).
DPI = 72
# Passed as first_page= / last_page= (page range handed to pdftoppm).
FIRST_PAGE = None
LAST_PAGE = None
# Passed as fmt= (format of the pdftoppm conversion).
FORMAT = 'jpg'
# Passed as thread_count= (number of threads used for the conversion).
THREAD_COUNT = 1
# Passed as userpw= (password used to unlock the PDF).
USERPWD = None
# Passed as use_cropbox= (use the crop box instead of the media box).
USE_CROPBOX = False
# Passed as strict= (catch pdftoppm syntax errors as PDFSyntaxError).
STRICT = False
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    All options come from the module-level constants and are forwarded to
    pdf2image.convert_from_path:

    - dpi: adjusts the resolution of the images
    - first_page / last_page: first and last page processed by pdftoppm
    - fmt: format of the pdftoppm conversion
    - thread_count: how many threads are used for the conversion
    - userpw: password to unlock the PDF
    - use_cropbox: use the crop box instead of the media box
    - strict: catch pdftoppm syntax errors as PDFSyntaxError

    Prints the elapsed time and returns the converted images.
    """
    began = time.time()
    conversion_options = dict(
        dpi=DPI,
        first_page=FIRST_PAGE,
        last_page=LAST_PAGE,
        fmt=FORMAT,
        thread_count=THREAD_COUNT,
        userpw=USERPWD,
        use_cropbox=USE_CROPBOX,
        strict=STRICT,
    )
    pages = pdf2image.convert_from_path(PDF_PATH, **conversion_options)
    elapsed = time.time() - began
    print("Time taken : " + str(elapsed))
    return pages
def save_images(pil_images, output_dir="split"):
    """Save each image as ``<output_dir>/input<N>.jpg``, numbering from 0.

    Args:
        pil_images: iterable of objects exposing ``save(path)``, such as the
            images returned by ``pdftopil``.
        output_dir: directory that receives the files. This function does
            not create it. Defaults to ``"split"``, the original location.
    """
    # enumerate replaces the hand-maintained page counter.
    for index, image in enumerate(pil_images):
        image.save("%s/input%d.jpg" % (output_dir, index))
if __name__ == "__main__":
    # Convert the PDF into images, then write them out one file per page.
    save_images(pdftopil())

@ -0,0 +1,7 @@
#!/bin/bash
# Run `convert ... -density 72` on every JPEG in scans/ and write the results
# to split/ as input0.jpg, input1.jpg, ... in glob (sorted) order.
i=0
# Glob directly instead of parsing `ls`, and quote every expansion, so file
# names containing spaces or glob characters are handled correctly.
for img in scans/*.jpg; do
    # With no matching files the pattern stays literal; skip it.
    [ -e "$img" ] || continue
    convert "$img" -density 72 "split/input$i.jpg"
    i=$((i+1))
done

@ -0,0 +1,6 @@
#!/bin/bash
# Reset the workspace: remove the working directories, rename out.pdf to a
# timestamped name, copy the PDFs to the Desktop and delete them locally.

# -f: a directory that is already missing is not an error.
rm -rf scans split rotated bounding_box ocred
mkdir -p scans
# Quote the substitution so the generated name is always a single word.
mv out.pdf "$(date +%F-%H:%M).pdf" && touch out.pdf
sleep 2
# NOTE(review): the empty out.pdf re-created above also matches *.pdf, so it
# is copied to the Desktop and then removed here as well - confirm intent.
# `--` keeps file names that start with a dash from being read as options.
cp -- *.pdf ~/Desktop
rm -- *.pdf

@ -1,7 +0,0 @@
#!/bin/bash
# Merge every JPEG in scans/ into a single PDF, scans/out.pdf.
# Alternative kept for reference: change to the script's own directory first.
# cd "$(dirname "$0")"
# Stop if scans/ is missing; otherwise convert would run in the wrong directory.
cd scans || exit 1
pwd
convert *.jpg out.pdf

@ -55,7 +55,7 @@ sudo pip3 install pdf2image Pillow opencv-python pytesseract
<p>Make all the files executable.</p>
```bash
sudo chmod 777 merge_scans.sh workshop_stream.sh marge_files.sh
sudo chmod 777 merge_scans.sh workshop_stream.sh rename_scans.sh change_res.sh delete_and_start_over.sh
```
<p>If you want to skip any of the scripts, comment out the corresponding line in <em>workshop_stream.sh</em>.</p>
@ -116,15 +116,15 @@ mkdir bounding_box
mkdir cropped
```
### Merge the files in the directory <em>scans</em>
<p>All the scans will be appended to one pdf called out.pdf</p>
<p>All the scans will be renamed to sequentially numbered files (input0.jpg, input1.jpg, …).</p>
```bash
./merge_scans.sh
./rename_scans.sh
```
### Burst the pdf in <em>scans</em>
<p>Burst this pdf, renaming all the files so they can be iterated later.</p>
<p>Change the resolution of the scans so that they are lighter to process.</p>
```bash
python3 burstpdf.py
./change_res.sh
```
### Rotate the pdfs
@ -140,7 +140,7 @@ python3 bounding_box.py
```
### Crop the mirror
<p>The pages are now cropped, but the mirror is still visible in the middle. I commented it out because if the cameras are positioned correctly there is no need for this step.</p>
<p>The pages are now cropped, but the mirror may still be visible at the edge. This happens if the cameras are not adjusted properly. This step is commented out because it is not needed when the cameras are positioned correctly.</p>
```bash
python3 mirror_crop.py
```
@ -156,6 +156,12 @@ python3 tesseract_ocr.py
```bash
./merge_files.sh
```
## START OVER
<p>To clear the working directories and begin again, run <em>delete_and_start_over.sh</em>.</p>
```bash
./delete_and_start_over.sh
```
<br><br>
## License
The package is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

@ -1,5 +1,4 @@
# Clean-up step: works inside split/ and deletes individual files there.
# NOTE(review): this is a diff view with old and new lines interleaved and no
# +/- markers, so not every line below belongs to the current script.
# NOTE(review): cd is unchecked; if split/ is missing, the rm calls below run
# in the caller's directory.
cd split
pwd
# NOTE(review): presumably one of these two lines replaced the other - confirm.
rm page0.jpg
rm input0.jpg
# Delete the .jpg that `ls` lists last.
rm `ls *.jpg | tail -n 1`
rm .DS_Store

@ -0,0 +1,7 @@
#!/bin/bash
# Rename every JPEG in scans/ to input0.jpg, input1.jpg, ... in glob (sorted)
# order so later steps can iterate over them by index.
# NOTE(review): a file already named inputN.jpg can be overwritten by an
# earlier rename when this runs twice on the same directory - confirm the
# script is only ever run on fresh scans.
i=0
# Glob directly instead of parsing `ls`, and quote every expansion, so file
# names containing spaces or glob characters are handled correctly.
for img in scans/*.jpg; do
    # With no matching files the pattern stays literal; skip it.
    [ -e "$img" ] || continue
    target="scans/input$i.jpg"
    # mv refuses to move a file onto itself; skip files already in place.
    if [ "$img" != "$target" ]; then
        mv "$img" "$target"
    fi
    i=$((i+1))
done

BIN
scans/.DS_Store vendored

Binary file not shown.

@ -1,13 +1,13 @@
# Pipeline driver: runs each processing step in order, creating the output
# directory a step writes to just before that step.
# NOTE(review): this is a diff view with old and new lines interleaved and no
# +/- markers; some adjacent lines are presumably old/new pairs rather than
# consecutive commands - confirm against the actual script.
./merge_scans.sh
./rename_scans.sh
mkdir -p split
python3 burstpdf.py
./change_res.sh
./remove.sh
mkdir -p rotated
python3 rotation.py
mkdir -p bounding_box
python3 bounding_box.py
mkdir -p cropped
# The mirror-crop step is disabled below.
# mkdir -p cropped
# python3 mirror_crop.py
mkdir -p ocred
python3 tesseract_ocr.py
./remove.sh
./merge_files.sh

Loading…
Cancel
Save