You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2.7 KiB
2.7 KiB
In [ ]:
from jinja2 import Template from urllib.parse import urlparse
In [ ]:
import requests
from bs4 import BeautifulSoup

# Page to scrape.
url = "https://archive.org/details/cd-roms?and[]=mediatype%3A%22image%22"

# requests waits forever by default; a timeout keeps the cell from hanging
# on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were
# the listing.
response.raise_for_status()

# Raw bytes are handed to BeautifulSoup, which parses them with the
# built-in 'html.parser' backend.
html = response.content
soup = BeautifulSoup(html, 'html.parser')
In [ ]:
# Collect every <img> tag that carries an image reference in either the
# `src` or the `source` attribute. `soup` is the parsed page produced by
# the fetch cell.
titles = soup.find_all("img", src=True)
titles2 = soup.find_all("img", source=True)

# A tag that has both attributes is returned by both queries; keep it only
# once. Identity is used because bs4 tags compare equal by content, which
# would also drop distinct tags that merely look the same.
already_listed = {id(tag) for tag in titles}
allimages = titles + [tag for tag in titles2 if id(tag) not in already_listed]
In [ ]:
In [ ]:
# Turn every collected <img> reference into an absolute URL for the template.
domain = urlparse(url)
full_domain = domain.scheme + '://' + domain.hostname

sources = []
for img in allimages:
    # Prefer `src`; fall back to `source` when `src` is missing or empty.
    source = img.get('src') or img.get('source')
    if not source:
        # Neither attribute holds a usable value, so there is nothing to
        # render (and calling .startswith on None would raise).
        continue
    if source.startswith('//'):
        # Protocol-relative URL: it already names its host, only the
        # scheme is missing.
        source = domain.scheme + ':' + source
    elif source.startswith('/'):
        # Root-relative path: prefix the scheme and host of the scraped page.
        source = full_domain + source
    sources.append(source)
In [ ]:
# Load the Jinja2 template from disk; the context manager closes the file
# handle as soon as its text has been read.
with open('pirate-downloading-to-pdf.html') as template_handle:
    template_file = template_handle.read()

# Render the template, exposing the collected image URLs as `sources`.
template = Template(template_file)
html = template.render(sources=sources)
In [ ]:
# Save the rendered HTML. The context manager closes the file even if the
# write fails part-way.
with open('piratezine.html', 'w') as output:
    output.write(html)
In [ ]:
# IPython shell escape: run the WeasyPrint command-line tool to convert
# piratezine.html into piratezine.pdf, with pirate-downloading-to-pdf.css
# passed as the stylesheet via -s.
! weasyprint piratezine.html -s pirate-downloading-to-pdf.css piratezine.pdf
In [ ]: