{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Archiving the sandbox\n",
"\n",
"* http://hub.xpub.nl/sandbot/\n",
"\n",
"In the \"spider\", the patterns you add are regexes that both **filter the URLs you want to copy** (matching a pattern means the URL becomes part of the archive) and **specify how to rewrite that URL to a local path**.\n",
"\n",
"URLs that don't match any pattern stay \"as-is\", so you can still have links \"outside\" to external things.\n",
"\n",
"URLs that do match a pattern get relativized to a local path, and the spider makes sure the linked files also get downloaded (and spidered in turn).\n",
"\n"
]
},
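{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (added for clarity, not part of the original notebook) of how a single pattern both *matches* a URL and *rewrites* it to a local path with `re.sub`. The pattern and the example URL here are purely illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# hypothetical pattern: anything under https://example.org/site/ gets archived,\n",
"# and the captured tail of the URL becomes the local path\n",
"pattern = re.compile(r\"^https?://example\\.org/site/(.+)$\")\n",
"\n",
"url = \"https://example.org/site/pages/about.html\"\n",
"if pattern.search(url):                 # the same regex acts as the filter...\n",
"    local = pattern.sub(r\"\\g<1>\", url)  # ...and as the rewrite rule\n",
"    print(url, \"=>\", local)             # pages/about.html\n",
"else:\n",
"    print(url, \"stays as-is (external link)\")"
]
},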
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from urllib.parse import urljoin, urlparse, quote as urlquote, unquote as urlunquote\n",
"import html5lib\n",
"from xml.etree import ElementTree as ET\n",
"import os\n",
"import re\n",
"import sys\n",
"from hashlib import md5\n",
"\n",
"import urllib3\n",
"# https://stackoverflow.com/questions/27981545/suppress-insecurerequestwarning-unverified-https-request-is-being-made-in-pytho\n",
"urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Three ways to [back reference in re](https://docs.python.org/3/library/re.html#re.sub):\n",
"\n",
"    \\g<name>\n",
"    \\g<1>\n",
"    \\1"
]
},
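{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustration (added here for clarity, not part of the original notebook): the three spellings are interchangeable ways of writing the same back reference in `re.sub`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"pat = re.compile(r\"^https?://(?P<host>[^/]+)/(.*)$\")\n",
"url = \"https://hub.xpub.nl/sandbox/index.html\"\n",
"\n",
"print(pat.sub(r\"\\g<host>/\\g<2>\", url))  # named group\n",
"print(pat.sub(r\"\\g<1>/\\g<2>\", url))     # numbered \\g<> form\n",
"print(pat.sub(r\"\\1/\\2\", url))           # short \\number form"
]
},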
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When saving resources it's really useful to use the appropriate/conventional file extension for each kind of file. URLs don't always include a filename with an extension; instead, HTTP specifies file types with the \"Content-Type\" header, whose value is a MIME type (a convention originally developed for specifying file types in email attachments). The EXT dictionary maps the MIME types we expect to receive to suitable file extensions."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"EXT = {\n",
"    \"text/html\": \"html\",\n",
"    \"text/css\": \"css\",\n",
"    \"image/jpeg\": \"jpg\",\n",
"    \"image/png\": \"png\",\n",
"    \"image/gif\": \"gif\",\n",
"    \"application/javascript\": \"js\",\n",
"    \"text/javascript\": \"js\",\n",
"    \"video/webm\": \"webm\"\n",
"}\n",
"\n",
"def memoize(f):\n",
"    memo = {}\n",
"    def helper(x):\n",
"        if x not in memo:\n",
"            memo[x] = f(x)\n",
"        return memo[x]\n",
"    return helper\n",
"\n",
"@memoize\n",
"def ext_for (url):\n",
"    # try / allow simple extension test to override\n",
"    parts = urlparse(url)\n",
"    if parts.path:\n",
"        ext = os.path.splitext(parts.path)[1].lstrip(\".\").lower()\n",
"        if ext in ('html', 'js', 'css', 'gif', 'jpg', 'png', 'jpeg', 'mp3', 'ogg', 'ogv', 'webm', 'mp4', 'svg', 'webp'):\n",
"            return ext\n",
"    try:\n",
"        r = requests.head(url, verify=False)\n",
"        ct = r.headers['content-type'].split(\";\")[0]\n",
"        if ct not in EXT:\n",
"            print (f\"Warning, unknown extension for content-type {ct}, using bin\", file=sys.stderr)\n",
"        return EXT.get(ct, \"bin\")\n",
"    except Exception as e:\n",
"        print (f\"Exception {url}: {e}\", file=sys.stderr)\n",
"        return \"bin\"\n",
"\n",
"def split_fragment(href):\n",
"    try:\n",
"        ri = href.rindex(\"#\")\n",
"        return href[:ri], href[ri:]\n",
"    except ValueError:\n",
"        return href, ''\n",
"\n",
"def split_query(href):\n",
"    try:\n",
"        ri = href.rindex(\"?\")\n",
"        return href[:ri], href[ri:]\n",
"    except ValueError:\n",
"        return href, ''\n"
]
},
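{
"cell_type": "markdown",
"metadata": {},
"source": [
"A few quick checks of the helpers above (added as illustration; the specific paths are made up). `ext_for` answers from the path extension when it can, so these calls don't touch the network; a URL without a recognisable extension would trigger a HEAD request and the Content-Type lookup instead."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# extension recoverable from the path, so no HEAD request is needed\n",
"print(ext_for(\"https://hub.xpub.nl/sandbox/themes/style.css\"))   # css\n",
"print(ext_for(\"https://hub.xpub.nl/sandbox/images/photo.jpeg\"))  # jpeg\n",
"\n",
"# splitting off fragments and queries before matching/rewriting\n",
"print(split_fragment(\"page.html#section-2\"))  # ('page.html', '#section-2')\n",
"print(split_query(\"search.html?q=archive\"))   # ('search.html', '?q=archive')"
]
},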
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With all the functions in place, the actual spider loop is relatively straightforward: there's a todo list of URLs still to fetch, and a set called done that remembers which URLs have already been downloaded."
]
},
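{
"cell_type": "markdown",
"metadata": {},
"source": [
"Stripped of all the downloading and rewriting, that loop looks roughly like this (a sketch added for clarity, not the class itself; `get_links` stands in for the link extraction that `download` does):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def crawl(start_url, get_links):\n",
"    \"\"\" minimal breadth-first crawl; get_links(url) returns the URLs found at url \"\"\"\n",
"    todo = [start_url]\n",
"    done = set()\n",
"    while todo:\n",
"        url = todo.pop(0)   # take the next URL off the front of the queue\n",
"        done.add(url)\n",
"        for link in get_links(url):\n",
"            if link not in done and link not in todo:\n",
"                todo.append(link)\n",
"    return done\n",
"\n",
"# tiny demonstration with a fake site: pages and the links they contain\n",
"fake_site = {\"a\": [\"b\", \"c\"], \"b\": [\"a\", \"c\"], \"c\": []}\n",
"print(crawl(\"a\", lambda url: fake_site[url]))  # {'a', 'b', 'c'}"
]
},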
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2021 05 24: Adding an optional encoding to force the text encoding on HTML pages (requests seems to guess it wrong sometimes)"
]
},
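{
"cell_type": "markdown",
"metadata": {},
"source": [
"What that option boils down to (an illustrative snippet, not part of the original notebook): setting `r.encoding` before reading `r.text` overrides whatever encoding requests guessed from the response headers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r = requests.get(\"https://hub.xpub.nl/sandbox/\", verify=False)\n",
"print(r.encoding)     # whatever requests guessed from the headers\n",
"r.encoding = \"utf-8\"  # override the guess before touching r.text\n",
"html = r.text         # now decoded as UTF-8"
]
},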
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"class Spider:\n",
"    def __init__(self, output_path=\".\", \\\n",
"                 skip_existing_files=False, \\\n",
"                 pattern_flags=0, \\\n",
"                 verbose=False, \\\n",
"                 preserve_query=True, \\\n",
"                 additional_attributes=None,\n",
"                 encoding=None):\n",
"        self.verbose = verbose\n",
"        self.pattern_flags = pattern_flags\n",
"        self.patterns = []\n",
"        self.output_path = output_path\n",
"        self.skip_existing_files = skip_existing_files\n",
"        self.preserve_query = preserve_query\n",
"        self.additional_attributes = []\n",
"        self.encoding = encoding\n",
"        if additional_attributes:\n",
"            if type(additional_attributes) == str:\n",
"                self.additional_attributes.append(additional_attributes)\n",
"            else:\n",
"                self.additional_attributes.extend(additional_attributes)\n",
"        self.rewrites = []\n",
"\n",
"    def add_pattern (self, search, replace, post_process=None):\n",
"        \"\"\" nb: the replace should always form a local path, with no query\n",
"        as re.sub is used, the search should probably capture the entire string ^$\n",
"        otherwise unmatched trailing stuff (for instance) can suddenly appear at the end\n",
"        (would this be a nicer way to allow queries to be preserved?? ... but then would need to change the code to reparse the query in the local path)\n",
"        \"\"\"\n",
"        if type(search) == str:\n",
"            search = re.compile(search, flags=self.pattern_flags)\n",
"        self.patterns.append((search, replace, post_process))\n",
"\n",
"    def sub_pattern (self, url):\n",
"        for psearch, preplace, post_process in self.patterns:\n",
"            m = psearch.search(url)\n",
"            if m:\n",
"                ret = psearch.sub(preplace, url)\n",
"                if post_process:\n",
"                    ret = post_process(ret)\n",
"                return ret\n",
"\n",
"    def url_to_local_path (self, url):\n",
"        ret = self.sub_pattern(url)\n",
"        if ret:\n",
"            ret = urlunquote(ret)\n",
"        return ret\n",
"\n",
"    def generic_url_to_path (self, url):\n",
"        md5hash = md5(url.encode()).hexdigest()\n",
"        parts = urlparse(url)\n",
"        ext = ext_for(url)\n",
"        return f\"ext/{md5hash}.{ext}\"\n",
"\n",
"    def url_to_path (self, url):\n",
"        l = self.url_to_local_path(url)\n",
"        if l:\n",
"            return l\n",
"        else:\n",
"            return self.generic_url_to_path(url)\n",
"\n",
"    def localize (self, url):\n",
"        if url not in self.done and url not in self.todo:\n",
"            self.todo.append(url)\n",
"        ret = self.url_to_path(url)\n",
"        # print (f\"localize {url} => {ret}\")\n",
"        return ret\n",
"\n",
"    def should_localize(self, url):\n",
"        return self.url_to_local_path(url) is not None\n",
"\n",
"    def relpath (self, to_file, from_file):\n",
"        return os.path.relpath(to_file, os.path.dirname(from_file))\n",
"\n",
"    def download(self, url):\n",
"        path = self.url_to_path(url)\n",
"        usepath = os.path.join(self.output_path, path)\n",
"        if self.skip_existing_files and os.path.exists(usepath):\n",
"            if self.verbose:\n",
"                print (\"File already exists, skipping...\")\n",
"            return # why do I need to add this back ?! (2021-03-06)\n",
"        #if self.verbose:\n",
"        additional_attributes = []\n",
"        if self.additional_attributes:\n",
"            additional_attributes.extend(self.additional_attributes)\n",
"        all_attributes = [\"href\"] + additional_attributes\n",
"        self.rewrites.append((url, usepath))\n",
"        print (f\"{url} => {usepath}\")\n",
"        if os.path.dirname(usepath):\n",
"            os.makedirs(os.path.dirname(usepath), exist_ok=True)\n",
"        try:\n",
"            r = requests.get(url, verify=False)\n",
"            if r.headers[\"content-type\"].startswith(\"text/html\"):\n",
"                if self.encoding:\n",
"                    r.encoding = self.encoding\n",
"                t = html5lib.parse(r.text, namespaceHTMLElements=False)\n",
"\n",
"                for elt in t.findall(\".//*[@src]\"):\n",
"                    src = urljoin(url, elt.attrib.get(\"src\"))\n",
"                    # print (elt.tag, src, url_to_path(src))\n",
"                    local_link = self.localize(src)\n",
"                    elt.attrib[\"src\"] = urlquote(self.relpath(local_link, path))\n",
"                for attribname in all_attributes:\n",
"                    for elt in t.findall(f\".//*[@{attribname}]\"):\n",
"                        href = urljoin(url, elt.attrib.get(attribname))\n",
"                        href, fragment = split_fragment(href)\n",
"                        if self.preserve_query:\n",
"                            href_noquery, query = split_query(href)\n",
"                        else:\n",
"                            query = ''\n",
"                        # print (elt.tag, href, url_to_path(href))\n",
"                        if (elt.tag == \"link\" and elt.attrib.get(\"rel\") == \"stylesheet\") or \\\n",
"                           (elt.tag == \"a\" and self.should_localize(href)) or \\\n",
"                           (attribname in additional_attributes and self.should_localize(href)):\n",
"                            # localize: force/ensure download href, return local path\n",
"                            local_link = self.localize(href)\n",
"                            # need path == current document path\n",
"                            elt.attrib[attribname] = urlquote(self.relpath(local_link, path)) + query + fragment\n",
"\n",
"                with open(usepath, \"w\") as fout:\n",
"                    print(ET.tostring(t, method=\"html\", encoding=\"unicode\"), file=fout)\n",
"            elif r.headers[\"content-type\"] == \"text/css\":\n",
"                if self.encoding:\n",
"                    r.encoding = self.encoding\n",
"                src = r.text\n",
"                def css_sub(m):\n",
"                    href = urljoin(url, m.group(2))\n",
"                    if self.should_localize(href):\n",
"                        local_link = self.localize(href)\n",
"                        return \"url(\"+m.group(1)+urlquote(self.relpath(local_link, path))+m.group(3)+\")\"\n",
"                    return m.group(0)\n",
"                newsrc = re.sub(r\"\"\"url\\((['\" ]*)(.+?)(['\" ]*)\\)\"\"\", css_sub, src)\n",
"                with open(usepath, \"w\") as fout:\n",
"                    print(newsrc, file=fout)\n",
"            else:\n",
"                # print (\"Downloading binary...\")\n",
"                with open(usepath, 'wb') as fd:\n",
"                    for chunk in r.iter_content(chunk_size=1024):\n",
"                        fd.write(chunk)\n",
"        except Exception as e:\n",
"            print (f\"Exception {url}: {e}\", file=sys.stderr)\n",
"\n",
"    def spider (self, url):\n",
"        self.done = set()\n",
"        self.todo = [url]\n",
"        count = 0\n",
"        while self.todo:\n",
"            url = self.todo[0]\n",
"            self.todo = self.todo[1:]\n",
"            self.done.add(url)\n",
"            self.download(url)\n",
"            count += 1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# cleanup\n",
"!rm -rf sandbox_archive/"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# remove all html files under the given path\n",
"def rm_html (path):\n",
"    rmlist = []\n",
"    for root, dirs, files in os.walk(path):\n",
"        for f in files:\n",
"            if os.path.splitext(f)[1] == \".html\":\n",
"                rmlist.append(os.path.join(root, f))\n",
"    for f in rmlist:\n",
"        print (f)\n",
"        os.remove(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rm_html(\"sandbox_archive\")"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"spider = Spider(\"sandbox_archive\", skip_existing_files=True, additional_attributes=\"data-url\", encoding=\"utf-8\")\n",
"spider.add_pattern(r\"^https?://hub\\.xpub\\.nl/sandbox/$\", \"index.html\")\n",
"spider.add_pattern(r\"^https?://hub\\.xpub\\.nl/sandbox/(.+)$\", r\"\\g<1>\")\n"
]
},
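{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (added as illustration; the example URLs are made up) of what those two patterns do: matching URLs get rewritten to local paths, anything else falls back to the generic `ext/<md5>.<ext>` scheme."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(spider.url_to_path(\"https://hub.xpub.nl/sandbox/\"))                  # index.html\n",
"print(spider.url_to_path(\"https://hub.xpub.nl/sandbox/themes/style.css\"))  # themes/style.css\n",
"print(spider.url_to_path(\"https://example.com/logo.png\"))                  # ext/<md5 of the url>.png"
]
},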
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"spider.spider(\"https://hub.xpub.nl/sandbox/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}