working
parent
d67bc4af5b
commit
5588614b44
@ -1,43 +0,0 @@
|
|||||||
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
|
|
||||||
|
|
||||||
import pdf2image
|
|
||||||
from PIL import Image
|
|
||||||
import time
|
|
||||||
|
|
||||||
# Conversion settings; pdftopil() forwards each of them to
# pdf2image.convert_from_path.

# Path to the PDF file to convert.
PDF_PATH = "scans/out.pdf"
# Resolution of the generated images.
DPI = 72
# First and last page to be processed by pdftoppm (both left as None here).
FIRST_PAGE = None
LAST_PAGE = None
# Format used for the pdftoppm conversion.
FORMAT = 'jpg'
# Number of threads used for the conversion.
THREAD_COUNT = 1
# Password used to unlock the PDF (None: no password supplied).
USERPWD = None
# Use the crop box instead of the media box when converting.
USE_CROPBOX = False
# Catch pdftoppm syntax errors as the custom PDFSyntaxError type.
STRICT = False
|
|
||||||
|
|
||||||
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    Every setting is taken from the module-level constants and forwarded to
    ``pdf2image.convert_from_path``:

    * ``PDF_PATH``     -- path to the PDF file.
    * ``DPI``          -- adjusts the resolution of the images.
    * ``FIRST_PAGE``   -- first page to be processed by pdftoppm.
    * ``LAST_PAGE``    -- last page to be processed by pdftoppm.
    * ``FORMAT``       -- format of the pdftoppm conversion.
    * ``THREAD_COUNT`` -- how many threads are used for the conversion.
    * ``USERPWD``      -- password used to unlock the PDF.
    * ``USE_CROPBOX``  -- use the crop box instead of the media box.
    * ``STRICT``       -- catch pdftoppm syntax errors as PDFSyntaxError.

    The elapsed conversion time is printed to stdout.

    Returns:
        The images produced by ``pdf2image.convert_from_path``.
    """
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    pil_images = pdf2image.convert_from_path(
        PDF_PATH,
        dpi=DPI,
        first_page=FIRST_PAGE,
        last_page=LAST_PAGE,
        fmt=FORMAT,
        thread_count=THREAD_COUNT,
        userpw=USERPWD,
        use_cropbox=USE_CROPBOX,
        strict=STRICT,
    )
    print("Time taken : " + str(time.perf_counter() - start_time))
    return pil_images
|
|
||||||
|
|
||||||
def save_images(pil_images):
    """Save each image as ``split/input<N>.jpg``, numbering from 0.

    Args:
        pil_images: iterable of objects exposing a ``save(path)`` method
            (the images returned by ``pdftopil``).

    This function does not create the ``split`` directory itself.
    """
    # enumerate() supplies the running index, replacing a hand-kept counter.
    for index, image in enumerate(pil_images):
        image.save("split/input%d.jpg" % index)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: convert the PDF to images, then save them to disk.
    save_images(pdftopil())
|
|
@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
# Run `convert` (presumably ImageMagick -- confirm) with -density 72 on every
# JPEG in scans/, writing the results to split/input0.jpg, split/input1.jpg, ...

i=0
# Glob directly instead of parsing `ls` output, so file names containing
# spaces or other special characters are not split into several words.
for img in scans/*.jpg; do
    # With no matches the pattern stays literal; skip it instead of
    # handing a non-existent path to convert.
    [ -e "$img" ] || continue
    convert "$img" -density 72 "split/input$i.jpg"
    i=$((i+1))
done
|
@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash
# Remove the intermediate working directories and move the PDFs in the
# current directory to ~/Desktop.

# Drop the working directories, then recreate an empty scans/.
rm -R scans split rotated bounding_box ocred
mkdir -p scans

# Rename out.pdf to a timestamped name (YYYY-MM-DD-HH:MM.pdf) and, if that
# worked, leave an empty out.pdf behind.
mv out.pdf "$(date +%F-%H:%M).pdf" && touch out.pdf
sleep 2

# Delete the local PDFs only after the copy succeeded, so a failed copy
# (e.g. missing ~/Desktop) can no longer lose them.
cp *.pdf ~/Desktop && rm *.pdf
|
@ -1,7 +0,0 @@
|
|||||||
#!/bin/bash
# Merge every JPEG in scans/ into a single scans/out.pdf.
#
# scans/ is resolved relative to the caller's working directory. To resolve
# it relative to the script's own location instead, run this first:
#   cd "$(dirname "$0")"

# Abort if scans/ is missing so convert never runs in the wrong directory.
cd scans || exit 1
pwd
convert *.jpg out.pdf
|
|
@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
# Rename every JPEG in scans/ to scans/input0.jpg, scans/input1.jpg, ...
# in glob (sorted) order.
#
# NOTE(review): the new names match the same scans/*.jpg pattern, so a
# pre-existing scans/inputN.jpg can be overwritten by an earlier rename --
# confirm scans/ never already contains input*.jpg files.

i=0
# Glob directly instead of parsing `ls` output, so file names containing
# spaces or other special characters are not split into several words.
for img in scans/*.jpg; do
    # With no matches the pattern stays literal; skip it instead of
    # trying to move a non-existent path.
    [ -e "$img" ] || continue
    mv "$img" "scans/input$i.jpg"
    i=$((i+1))
done
|
Binary file not shown.
@ -1,13 +1,13 @@
|
|||||||
./merge_scans.sh
|
./rename_scans.sh
|
||||||
mkdir -p split
|
mkdir -p split
|
||||||
python3 burstpdf.py
|
./change_res.sh
|
||||||
|
./remove.sh
|
||||||
mkdir -p rotated
|
mkdir -p rotated
|
||||||
python3 rotation.py
|
python3 rotation.py
|
||||||
mkdir -p bounding_box
|
mkdir -p bounding_box
|
||||||
python3 bounding_box.py
|
python3 bounding_box.py
|
||||||
mkdir -p cropped
|
# mkdir -p cropped
|
||||||
# python3 mirror_crop.py
|
# python3 mirror_crop.py
|
||||||
mkdir -p ocred
|
mkdir -p ocred
|
||||||
python3 tesseract_ocr.py
|
python3 tesseract_ocr.py
|
||||||
./remove.sh
|
|
||||||
./merge_files.sh
|
./merge_files.sh
|
||||||
|
Loading…
Reference in New Issue