You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2.7 KiB

In [ ]:
from jinja2 import Template 
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
In [ ]:
# Download the archive.org listing page and parse it so that its <img> tags
# can be collected by the following cells.
url = "https://archive.org/details/cd-roms?and[]=mediatype%3A%22image%22"
# Without a timeout, requests may wait forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on HTTP errors (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')
In [ ]:
# Gather every <img> tag that carries a ``src`` attribute, then every <img>
# tag that carries a ``source`` attribute, and concatenate both result lists
# (first the ``src`` matches, then the ``source`` matches).
titles = soup.find_all(name="img", attrs={"src": True})
titles2 = soup.find_all(name="img", attrs={"source": True})
allimages = [*titles, *titles2]
In [ ]:
 
In [ ]:
# Turn every collected <img> tag into an absolute image URL.
# Local import: the notebook's import cell only brings in ``urlparse``.
from urllib.parse import urljoin

sources = []
# ``domain`` / ``full_domain`` are kept as notebook-level names in case other
# cells rely on them; URL resolution itself is delegated to ``urljoin``.
domain = urlparse(url)
full_domain = domain.scheme + '://' + domain.hostname

for img in allimages:
    # Prefer ``src``; fall back to ``source`` when ``src`` is missing or empty.
    source = img.get('src') or img.get('source')
    if not source:
        # Neither attribute holds a usable value. Previously this case hit
        # ``None.startswith`` and raised AttributeError.
        continue
    # urljoin resolves root-relative ("/x"), protocol-relative ("//host/x")
    # and document-relative ("x.png") references against the page URL, and
    # leaves already-absolute URLs untouched.
    sources.append(urljoin(url, source))
In [ ]:
# Load the Jinja2 template from disk and render it with the collected image
# URLs. The context manager closes the file handle as soon as it has been
# read, instead of leaving it open until garbage collection.
with open('pirate-downloading-to-pdf.html') as template_handle:
    template_file = template_handle.read()
template = Template(template_file)

# The template receives the list of image URLs under the name ``sources``.
html = template.render(sources=sources)
In [ ]:
# Save the rendered HTML to piratezine.html.
# The ``with`` block guarantees the file is flushed and closed even if the
# write raises; ``output`` remains bound to the (closed) file afterwards.
with open('piratezine.html', 'w') as output:
    output.write(html)
In [ ]:
# Convert the saved HTML zine into a PDF. The leading "!" is the IPython/Jupyter
# shell escape, so this line runs the weasyprint command-line tool rather than
# Python code: piratezine.html is the input, piratezine.pdf is the output, and
# -s passes pirate-downloading-to-pdf.css as a stylesheet (per the WeasyPrint
# CLI; requires weasyprint to be installed and on PATH).
! weasyprint piratezine.html -s pirate-downloading-to-pdf.css piratezine.pdf
In [ ]: