Here's a new notebook for pirate downloading and pdf making

4 years ago · 7370c409cf
parent 6a83f8eed1
commit 7370c409cf
3 changed files with 160 additions and 0 deletions
--- a/downloading-party/pirate-downloading-to-pdf.css
+++ b/downloading-party/pirate-downloading-to-pdf.css
@ -0,0 +1,13 @@
/* Page box used when this sheet drives paged output: A4, 10mm margin, pink background. */
 @page {
     size: A4;
     margin: 10mm;
     background-color: pink;
 }

 /* Every image spans the full content width, keeps its aspect ratio,
    and forces a page break before itself. */
 img {
     page-break-before: always;
     width: 100%;
     height: auto;
 }
--- a/downloading-party/pirate-downloading-to-pdf.html
+++ b/downloading-party/pirate-downloading-to-pdf.html
@ -0,0 +1,17 @@
 <!DOCTYPE html>
 <html>
 <head>
     <meta charset="utf-8">
     <title>This is my page generated with Jinja!</title>
     <link rel="stylesheet" href="pirate-downloading-to-pdf.css">
 </head>
 <body>
     {# Emit one <img> per entry of `sources`. The values are URLs scraped from
        a third-party page, so they are HTML-escaped with the `e` filter before
        being placed inside the attribute; otherwise a quote character in a URL
        could terminate the attribute and inject markup. #}
     {% for source in sources %}
     <img src="{{ source|e }}" alt=""/>
     {% endfor %}
 </body>
 </html>
--- a/downloading-party/pirate-downloading-to-pdf.ipynb
+++ b/downloading-party/pirate-downloading-to-pdf.ipynb
@ -0,0 +1,130 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from urllib.parse import urljoin, urlparse\n",
    "\n",
    "from jinja2 import Template"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# Listing page whose <img> tags are collected by the following cells.\n",
    "url = \"https://archive.org/details/cd-roms?and[]=mediatype%3A%22image%22\"\n",
    "\n",
    "# The timeout keeps this cell from hanging forever on a stalled connection;\n",
    "# raise_for_status() stops here on a 4xx/5xx reply instead of silently\n",
    "# parsing an error page.\n",
    "response = requests.get(url, timeout=30)\n",
    "response.raise_for_status()\n",
    "\n",
    "# Raw bytes of the page, parsed with the stdlib-backed 'html.parser'.\n",
    "html = response.content\n",
    "soup = BeautifulSoup(html, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect <img> tags from the parsed page held in `soup`.\n",
    "# `titles` holds the tags that carry a `src` attribute, `titles2` the tags\n",
    "# that carry a `source` attribute.\n",
    "titles = soup.find_all(\"img\", src=True)\n",
    "titles2 = soup.find_all(\"img\", source=True)\n",
    "\n",
    "# A tag that has both attributes appears in both lists, and so twice here.\n",
    "allimages = titles + titles2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build `sources`: the absolute image URLs handed to the template.\n",
    "sources = []\n",
    "\n",
    "for img in allimages:\n",
    "    # Prefer `src`; fall back to `source` when `src` is missing or empty.\n",
    "    source = img.get('src') or img.get('source')\n",
    "    if not source:\n",
    "        # Attribute present but empty: there is no URL to use, so skip the tag\n",
    "        # instead of failing on a None/empty value.\n",
    "        continue\n",
    "\n",
    "    # urljoin resolves root-relative ('/x'), protocol-relative ('//host/x') and\n",
    "    # page-relative ('x.png') references against the page URL, and returns\n",
    "    # already-absolute URLs unchanged.\n",
    "    source = urljoin(url, source)\n",
    "\n",
    "    # `allimages` can list the same tag twice; keep each URL once, in order.\n",
    "    if source not in sources:\n",
    "        sources.append(source)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the Jinja template text. The context manager closes the file handle\n",
    "# as soon as the read finishes; the explicit encoding avoids depending on\n",
    "# the platform default.\n",
    "with open('pirate-downloading-to-pdf.html', encoding='utf-8') as template_fh:\n",
    "    template_file = template_fh.read()\n",
    "template = Template(template_file)\n",
    "\n",
    "# Render the template with the collected image URLs bound to `sources`.\n",
    "html = template.render(sources=sources)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the rendered page. The context manager closes the file even if the\n",
    "# write fails; the explicit encoding avoids depending on the platform default.\n",
    "with open('piratezine.html', 'w', encoding='utf-8') as output:\n",
    "    output.write(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the saved HTML page to PDF with the WeasyPrint command-line tool,\n",
    "# passing the stylesheet explicitly.\n",
    "! weasyprint --stylesheet pirate-downloading-to-pdf.css piratezine.html piratezine.pdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }