From 7370c409cfe64202745330db930f86ec3175681b Mon Sep 17 00:00:00 2001 From: Pongie147 Date: Thu, 4 Feb 2021 18:49:31 +0100 Subject: [PATCH] Here's a new notebook for pirate downloading and pdf making --- .../pirate-downloading-to-pdf.css | 13 ++ .../pirate-downloading-to-pdf.html | 17 +++ .../pirate-downloading-to-pdf.ipynb | 130 ++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 downloading-party/pirate-downloading-to-pdf.css create mode 100644 downloading-party/pirate-downloading-to-pdf.html create mode 100644 downloading-party/pirate-downloading-to-pdf.ipynb diff --git a/downloading-party/pirate-downloading-to-pdf.css b/downloading-party/pirate-downloading-to-pdf.css new file mode 100644 index 0000000..122089b --- /dev/null +++ b/downloading-party/pirate-downloading-to-pdf.css @@ -0,0 +1,13 @@ +@page { size: A4; + margin: 10mm; + background-color: pink; +} + +body{} + +img { + width: 100%; + height: auto; + page-break-before: always; +} + diff --git a/downloading-party/pirate-downloading-to-pdf.html b/downloading-party/pirate-downloading-to-pdf.html new file mode 100644 index 0000000..9d41847 --- /dev/null +++ b/downloading-party/pirate-downloading-to-pdf.html @@ -0,0 +1,17 @@ + + + + + This is my page generated with Jinja! 
+ + + + + {% for source in sources %} + + + +{% endfor %} + + + diff --git a/downloading-party/pirate-downloading-to-pdf.ipynb b/downloading-party/pirate-downloading-to-pdf.ipynb new file mode 100644 index 0000000..1af7155 --- /dev/null +++ b/downloading-party/pirate-downloading-to-pdf.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jinja2 import Template \n", + "from urllib.parse import urlparse" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "url = \"https://archive.org/details/cd-roms?and[]=mediatype%3A%22image%22\"\n", + "response = requests.get(url)\n", + "html = response.content\n", + "soup = BeautifulSoup(html, 'html.parser')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "titles = soup.find_all(\"img\", src=True)  # <img> tags carrying a src attribute\n", + "titles2 = soup.find_all(\"img\", source=True)  # <img> tags carrying a source attribute\n", + "\n", + "allimages = titles + titles2\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sources = []\n", + "domain = urlparse(url)\n", + "full_domain = domain.scheme + '://' + domain.hostname\n", + "\n", + "\n", + "for img in allimages:\n", + "    source = img.get('src') or img.get('source')\n", + "    if not source:\n", + "        continue  # attribute present but empty: nothing to embed\n", + "    if source.startswith('/'):\n", + "        source = full_domain + source  # make site-relative paths absolute\n", + "\n", + "    sources.append(source)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('pirate-downloading-to-pdf.html') as template_file:\n", + "    template = Template(template_file.read())\n", + "\n", + "html = template.render(sources=sources)" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save the rendered page so weasyprint can read it in the next cell.\n", + "# The with-block flushes and closes the file even if the write raises.\n", + "with open('piratezine.html', 'w') as output:\n", + "    output.write(html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! weasyprint piratezine.html -s pirate-downloading-to-pdf.css piratezine.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}