Here's a new notebook for pirate downloading and pdf making

4 years ago · 7370c409cf
parent 6a83f8eed1
commit 7370c409cf
3 changed files with 160 additions and 0 deletions
--- a/downloading-party/pirate-downloading-to-pdf.css
+++ b/downloading-party/pirate-downloading-to-pdf.css
@ -0,0 +1,13 @@
+/* Page box for the generated PDF: A4 sheet, 10mm margin, pink background. */
+@page {
+    size: A4;
+    margin: 10mm;
+    background-color: pink;
+}
+
+body {}
+
+/* Every image spans the full page width, keeps its aspect ratio,
+   and is forced onto a page of its own. */
+img {
+    width: 100%;
+    height: auto;
+    page-break-before: always;
+}
+
--- a/downloading-party/pirate-downloading-to-pdf.html
+++ b/downloading-party/pirate-downloading-to-pdf.html
@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>This is my page generated with Jinja!</title>
+    <link rel="stylesheet" href="pirate-downloading-to-pdf.css">
+</head>
+<body>
+    {# `sources` is the list of image URLs handed to template.render(). #}
+    {% for source in sources %}
+    {# The URLs are scraped from a remote page, so escape them explicitly:
+       a quote or angle bracket in a URL must not break out of the attribute. #}
+    <img src="{{ source|e }}" alt=""/>
+    {% endfor %}
+</body>
+</html>
--- a/downloading-party/pirate-downloading-to-pdf.ipynb
+++ b/downloading-party/pirate-downloading-to-pdf.ipynb
@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from urllib.parse import urljoin, urlparse\n",
+    "\n",
+    "from jinja2 import Template"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "\n",
+    "# archive.org 'cd-roms' listing, filtered with mediatype:image.\n",
+    "url = \"https://archive.org/details/cd-roms?and[]=mediatype%3A%22image%22\"\n",
+    "\n",
+    "# Without a timeout requests can wait forever on a stalled connection.\n",
+    "response = requests.get(url, timeout=30)\n",
+    "# Fail here on a 4xx/5xx reply instead of scraping an error page.\n",
+    "response.raise_for_status()\n",
+    "\n",
+    "# Raw bytes of the page; BeautifulSoup works out the encoding itself.\n",
+    "html = response.content\n",
+    "soup = BeautifulSoup(html, 'html.parser')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The parsed page lives in `soup`; the name `scraped` was never defined\n",
+    "# and made this cell fail with a NameError on a fresh kernel.\n",
+    "\n",
+    "# <img> tags that carry a `src` attribute.\n",
+    "titles = soup.find_all(\"img\", src=True)\n",
+    "# <img> tags that carry a `source` attribute\n",
+    "# (presumably lazy-loaded thumbnails -- confirm against the page markup).\n",
+    "titles2 = soup.find_all(\"img\", source=True)\n",
+    "\n",
+    "# A tag with both attributes is already in `titles`; skip it in the second\n",
+    "# list so the same image is not added twice.\n",
+    "allimages = titles + [img for img in titles2 if not img.has_attr(\"src\")]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Turn every collected <img> tag into an absolute image URL.\n",
+    "sources = []\n",
+    "\n",
+    "for img in allimages:\n",
+    "    # Prefer `src`; fall back to `source` when `src` is missing or empty.\n",
+    "    source = img.get('src') or img.get('source')\n",
+    "    if not source:\n",
+    "        # No usable reference on this tag: skip it instead of crashing on None.\n",
+    "        continue\n",
+    "\n",
+    "    # urljoin resolves root-relative ('/x.png'), protocol-relative\n",
+    "    # ('//host/x.png') and page-relative ('x.png') references against the\n",
+    "    # page URL, and returns already-absolute URLs unchanged.\n",
+    "    sources.append(urljoin(url, source))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the Jinja template inside a `with` block so the file handle is closed;\n",
+    "# the template declares charset utf-8, so decode it as UTF-8 explicitly.\n",
+    "with open('pirate-downloading-to-pdf.html', encoding='utf-8') as template_source:\n",
+    "    template_file = template_source.read()\n",
+    "template = Template(template_file)\n",
+    "\n",
+    "# Fill the template with the collected image URLs.\n",
+    "html = template.render(sources=sources)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the rendered page. The `with` block closes the file even if the write\n",
+    "# fails, and UTF-8 matches the charset declared in the template's <meta> tag.\n",
+    "with open('piratezine.html', 'w', encoding='utf-8') as output:\n",
+    "    output.write(html)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! weasyprint piratezine.html -s pirate-downloading-to-pdf.css piratezine.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}