{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pirate zine: images from an archive.org listing\n",
    "\n",
    "Downloads one Internet Archive listing page, collects the address of every `<img>` tag that has a `src` or `source` attribute, renders those addresses through the Jinja2 template `pirate-downloading-to-pdf.html` into `piratezine.html`, and converts the result to `piratezine.pdf` with WeasyPrint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from urllib.parse import urljoin, urlparse\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from jinja2 import Template"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch the listing page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://archive.org/details/cd-roms?and[]=mediatype%3A%22image%22\"\n",
    "\n",
    "# Seconds to wait for the server before giving up instead of hanging the kernel.\n",
    "REQUEST_TIMEOUT_SECONDS = 30\n",
    "\n",
    "response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)\n",
    "# Fail loudly on an HTTP error rather than scraping an error page.\n",
    "response.raise_for_status()\n",
    "\n",
    "page_html = response.content\n",
    "soup = BeautifulSoup(page_html, 'html.parser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collect the image addresses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# <img> tags that carry a `src` attribute, in document order.\n",
    "images_with_src = soup.find_all(\"img\", src=True)\n",
    "\n",
    "# <img> tags that only carry a `source` attribute. Tags that also have `src`\n",
    "# are already in the first list and must not be counted a second time.\n",
    "images_with_source_only = [\n",
    "    img for img in soup.find_all(\"img\", source=True) if not img.has_attr(\"src\")\n",
    "]\n",
    "\n",
    "all_images = images_with_src + images_with_source_only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sources = []\n",
    "\n",
    "for img in all_images:\n",
    "    # Prefer `src`; fall back to `source` when `src` is missing or empty.\n",
    "    source = img.get('src') or img.get('source')\n",
    "    if not source:\n",
    "        # Neither attribute holds an address, so there is nothing to embed.\n",
    "        continue\n",
    "    # urljoin resolves root-relative (/x), protocol-relative (//host/x) and\n",
    "    # page-relative (x) addresses against the page URL; absolute URLs are\n",
    "    # returned as they are.\n",
    "    sources.append(urljoin(url, source))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Render the zine HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the Jinja2 template; the context manager closes the file handle.\n",
    "with open('pirate-downloading-to-pdf.html', encoding='utf-8') as template_file:\n",
    "    template = Template(template_file.read())\n",
    "\n",
    "# The template receives the list of absolute image addresses as `sources`.\n",
    "zine_html = template.render(sources=sources)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the rendered page; the file is closed even if the write fails.\n",
    "with open('piratezine.html', 'w', encoding='utf-8') as output:\n",
    "    output.write(zine_html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert the HTML to PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!weasyprint piratezine.html -s pirate-downloading-to-pdf.css piratezine.pdf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}