working
parent
d67bc4af5b
commit
5588614b44
@ -1,43 +0,0 @@
|
||||
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
|
||||
|
||||
import pdf2image
|
||||
from PIL import Image
|
||||
import time
|
||||
|
||||
# Conversion settings, passed to pdf2image.convert_from_path() by pdftopil().
PDF_PATH = "scans/out.pdf"  # PDF that gets split into page images
DPI = 72  # rendering resolution of the produced images
FIRST_PAGE = None  # first page to process
LAST_PAGE = None  # last page to process
FORMAT = "jpg"  # image format requested from the converter
THREAD_COUNT = 1  # number of threads used for the conversion
USERPWD = None  # password used to unlock the PDF
USE_CROPBOX = False  # use the crop box instead of the media box
STRICT = False  # surface pdftoppm syntax errors as PDFSyntaxError
|
||||
|
||||
def pdftopil(pdf_path=None):
    """Read a PDF and convert it into a sequence of PIL images.

    The conversion is delegated to ``pdf2image.convert_from_path`` and is
    configured through the module-level constants:

    * ``DPI`` -- resolution of the produced images.
    * ``FIRST_PAGE`` / ``LAST_PAGE`` -- first and last page handed to
      pdftoppm for processing.
    * ``FORMAT`` -- image format of the pdftoppm conversion.
    * ``THREAD_COUNT`` -- how many threads are used for the conversion.
    * ``USERPWD`` -- password used to unlock the PDF.
    * ``USE_CROPBOX`` -- use the crop box instead of the media box.
    * ``STRICT`` -- report pdftoppm syntax errors as ``PDFSyntaxError``.

    Args:
        pdf_path: Path of the PDF to convert. Defaults to ``PDF_PATH``
            when omitted or ``None``.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for the document
        (the converted page images).
    """
    if pdf_path is None:
        pdf_path = PDF_PATH

    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    pil_images = pdf2image.convert_from_path(
        pdf_path,
        dpi=DPI,
        first_page=FIRST_PAGE,
        last_page=LAST_PAGE,
        fmt=FORMAT,
        thread_count=THREAD_COUNT,
        userpw=USERPWD,
        use_cropbox=USE_CROPBOX,
        strict=STRICT,
    )
    print("Time taken : " + str(time.perf_counter() - start_time))
    return pil_images
|
||||
|
||||
def save_images(pil_images, output_dir="split"):
    """Save every image as ``<output_dir>/input<N>.jpg``, N counting from 0.

    Args:
        pil_images: Iterable of objects exposing ``save(path)`` (the images
            returned by ``pdftopil``).
        output_dir: Directory receiving the files. Defaults to ``"split"``,
            the location used before this parameter existed.
    """
    # Local import keeps this function self-contained; the module header is
    # not guaranteed to import os.
    import os

    # Create the target directory up front so a fresh checkout does not fail
    # on the first save() call.
    os.makedirs(output_dir, exist_ok=True)
    for index, image in enumerate(pil_images):
        image.save(os.path.join(output_dir, "input%d.jpg" % index))
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: convert the PDF, then write one JPEG per page.
    save_images(pdftopil())
|
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Re-encode every scans/*.jpg at density 72 into split/input<N>.jpg,
# numbering the outputs from 0 in glob (sorted) order.

i=0
# Iterate over the glob directly instead of parsing `ls` output, so file
# names containing spaces or glob characters are handled correctly.
for img in scans/*.jpg; do
    # With no matching files the pattern stays literal; skip it.
    [ -e "$img" ] || continue
    convert "$img" -density 72 "split/input$i.jpg"
    i=$((i+1))
done
|
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
# Clear the working directories and move the produced PDF(s) to ~/Desktop.

# Remove the working directories, then recreate an empty scans/.
rm -R scans split rotated bounding_box ocred
mkdir -p scans

# Keep the current output under a timestamped name and leave an empty
# out.pdf behind. The substitution is quoted so the name stays one word.
mv out.pdf "$(date +%F-%H:%M).pdf" && touch out.pdf
sleep 2

# Delete the local PDFs only after they were copied successfully; running
# rm unconditionally would lose them whenever the copy fails.
cp -- *.pdf ~/Desktop && rm -- *.pdf
|
@ -1,7 +0,0 @@
|
||||
#!/bin/bash
# Merge every JPEG in scans/ into a single scans/out.pdf using `convert`.

# NOTE(review): the original note read "line 3 means here"; presumably the
# disabled command below would first switch to the script's own directory --
# confirm before re-enabling.
# cd "$(dirname "$0")"

# Stop if scans/ is missing; otherwise convert would run on whatever JPEGs
# happen to be in the current directory.
cd scans || exit 1
pwd
convert *.jpg out.pdf
|
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Rename every scans/*.jpg to scans/input<N>.jpg, numbering from 0 in glob
# (sorted) order.

i=0
# Iterate over the glob directly instead of parsing `ls` output, so file
# names containing spaces or glob characters are handled correctly.
for img in scans/*.jpg; do
    # With no matching files the pattern stays literal; skip it.
    [ -e "$img" ] || continue
    target="scans/input$i.jpg"
    # A file that already carries its target name needs no move (mv would
    # only report that source and destination are the same file).
    if [ "$img" != "$target" ]; then
        mv -- "$img" "$target"
    fi
    i=$((i+1))
done
|
Binary file not shown.
@ -1,13 +1,13 @@
|
||||
./merge_scans.sh
|
||||
./rename_scans.sh
|
||||
mkdir -p split
|
||||
python3 burstpdf.py
|
||||
./change_res.sh
|
||||
./remove.sh
|
||||
mkdir -p rotated
|
||||
python3 rotation.py
|
||||
mkdir -p bounding_box
|
||||
python3 bounding_box.py
|
||||
mkdir -p cropped
|
||||
# mkdir -p cropped
|
||||
# python3 mirror_crop.py
|
||||
mkdir -p ocred
|
||||
python3 tesseract_ocr.py
|
||||
./remove.sh
|
||||
./merge_files.sh
|
||||
|
Loading…
Reference in New Issue