commit 435f1d65014a0ca7938d25aeb2040cb87e731821 Author: Michael Murtaugh Date: Tue Jun 29 11:55:26 2021 +0200 notebook diff --git a/archiving_sandbox.ipynb b/archiving_sandbox.ipynb new file mode 100644 index 0000000..e371b42 --- /dev/null +++ b/archiving_sandbox.ipynb @@ -0,0 +1,377 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Archiving the sandbox\n", + "\n", + "*\n", + "\n", + "In the \"spider\", the patterns you add are regexes that both **filter the URLs that you want to copy** (so matching a pattern means the URL becomes part of the archive) and **specify how to rewrite the URL to a local path**.\n", + "\n", + "URLs that don't match the patterns stay \"as-is\", allowing you to still have links \"outside\" to external things.\n", + "\n", + "URLs that match a pattern get relativized to a local path, and the spider makes sure the linked files also get downloaded (and in turn spidered).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from urllib.parse import urljoin, urlparse, quote as urlquote, unquote as urlunquote\n", + "import html5lib\n", + "from xml.etree import ElementTree as ET\n", + "import os\n", + "import re\n", + "import sys\n", + "from hashlib import md5\n", + "\n", + "import urllib3\n", + "#\n", + "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Three ways to back reference in `re`:\n", + "\n", + " \g<name>\n", + " \g<1>\n", + " \1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's really useful when saving resources to use the appropriate/conventional file extension for that kind of file. 
URLs don't always include a filename with an extension. Instead, the HTTP protocol specifies file types using the \"Content-type\" header, whose value is known as a MIME type (a convention originally developed for specifying file types in email attachments). The EXT dictionary provides a mapping from the MIME types we expect to receive to suitable file extensions." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "EXT = {\n", + " \"text/html\": \"html\",\n", + " \"text/css\": \"css\",\n", + " \"image/jpeg\": \"jpg\",\n", + " \"image/png\": \"png\",\n", + " \"image/gif\": \"gif\",\n", + " \"application/javascript\": \"js\",\n", + " \"text/javascript\": \"js\",\n", + " \"video/webm\": \"webm\"\n", + "}\n", + "\n", + "def memoize(f):\n", + " memo = {}\n", + " def helper(x):\n", + " if x not in memo: \n", + " memo[x] = f(x)\n", + " return memo[x]\n", + " return helper\n", + "\n", + "@memoize\n", + "def ext_for (url):\n", + " # try / allow simple extension test to override\n", + " parts = urlparse(url)\n", + " if parts.path:\n", + " ext = os.path.splitext(parts.path)[1].lstrip(\".\").lower()\n", + " if ext in ('html', 'js', 'css', 'gif', 'jpg', 'png', 'jpeg', 'mp3', 'ogg', 'ogv', 'webm', 'mp4', 'svg', 'webp'):\n", + " return ext\n", + " try:\n", + " r = requests.head(url, verify=False)\n", + " ct = r.headers['content-type'].split(\";\")[0]\n", + " if ct not in EXT:\n", + " print (f\"Warning, unknown extension for content-type {ct}, using bin\", file=sys.stderr)\n", + " return EXT.get(ct, \"bin\")\n", + " except Exception as e:\n", + " print (f\"Exception {url}: {e}\", file=sys.stderr)\n", + " return \"bin\"\n", + "\n", + "def split_fragment(href):\n", + " try:\n", + " ri = href.rindex(\"#\")\n", + " return href[:ri], href[ri:]\n", + " except ValueError:\n", + " return href, ''\n", + "\n", + "def split_query(href):\n", + " try:\n", + " ri = href.rindex(\"?\")\n", + " return href[:ri], href[ri:]\n", + " except ValueError:\n", + " 
return href, ''\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With all the functions in place, the actual spider loop is relatively straightforward. There's a todo list, and a set called done to remember what URLs have already been downloaded." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2021 05 24: Adding an optional encoding parameter to force the encoding of HTML pages (requests seems to get it wrong sometimes)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "class Spider:\n", + " def __init__(self, output_path=\".\", \\\n", + " skip_existing_files=False, \\\n", + " pattern_flags=0, \\\n", + " verbose=False, \\\n", + " preserve_query=True, \\\n", + " additional_attributes=None,\n", + " encoding=None):\n", + " self.verbose = verbose\n", + " self.pattern_flags = pattern_flags\n", + " self.patterns = []\n", + " self.output_path = output_path\n", + " self.skip_existing_files = skip_existing_files\n", + " self.preserve_query = preserve_query\n", + " self.additional_attributes = []\n", + " self.encoding = encoding\n", + " if additional_attributes:\n", + " if type(additional_attributes) == str:\n", + " self.additional_attributes.append(additional_attributes)\n", + " else:\n", + " self.additional_attributes.extend(additional_attributes)\n", + " self.rewrites = []\n", + "\n", + " def add_pattern (self, search, replace, post_process=None):\n", + " \"\"\" nb:the replace should always form a local path, with no query\n", + " as re.sub is used, the search should probably capture the entire string ^$\n", + " otherwise unmatched trailing stuff (for instance) can suddenly appear at the end\n", + " (would this be a nicer way to allow queries to be preserved?? ... 
but then would need to change the code to reparse query in the local path)\n", + " \"\"\"\n", + " if type(search) == str:\n", + " search = re.compile(search, flags=self.pattern_flags)\n", + " self.patterns.append((search, replace, post_process))\n", + "\n", + " def sub_pattern (self, url):\n", + " for psearch, preplace, post_process in self.patterns:\n", + " m =\n", + " if m:\n", + " ret = psearch.sub(preplace, url)\n", + " if post_process:\n", + " ret = post_process(ret)\n", + " return ret\n", + "\n", + " def url_to_local_path (self, url):\n", + " ret = self.sub_pattern(url)\n", + " if ret:\n", + " ret = urlunquote(ret)\n", + " return ret\n", + "\n", + " def generic_url_to_path (self, url):\n", + " md5hash = md5(url.encode()).hexdigest()\n", + " parts = urlparse(url)\n", + " ext = ext_for(url)\n", + " return f\"ext/{md5hash}.{ext}\"\n", + "\n", + " def url_to_path (self, url):\n", + " l = self.url_to_local_path(url)\n", + " if l:\n", + " return l\n", + " else:\n", + " return self.generic_url_to_path(url)\n", + "\n", + " def localize (self, url):\n", + " if url not in self.done and url not in self.todo:\n", + " self.todo.append(url)\n", + " ret = self.url_to_path(url)\n", + " # print (f\"localize {url} => {ret}\")\n", + " return ret\n", + "\n", + " def should_localize(self, url):\n", + " return self.url_to_local_path(url) is not None\n", + "\n", + " def relpath (self, to_file, from_file):\n", + " return os.path.relpath(to_file, os.path.dirname(from_file))\n", + " \n", + " def download(self, url):\n", + " path = self.url_to_path(url)\n", + " usepath = os.path.join(self.output_path, path)\n", + " if self.skip_existing_files and os.path.exists(usepath):\n", + " if self.verbose:\n", + " print (\"File already exists, skipping...\")\n", + " return # why do I need to add this back ?! 
(2021-03-06)\n", + " #if self.verbose:\n", + " additional_attributes = []\n", + " if self.additional_attributes:\n", + " additional_attributes.extend(self.additional_attributes)\n", + " all_attributes = [\"href\"] + additional_attributes\n", + " self.rewrites.append((url, usepath))\n", + " print (f\"{url} => {usepath}\")\n", + " if os.path.dirname(usepath):\n", + " os.makedirs(os.path.dirname(usepath), exist_ok=True)\n", + " try:\n", + " r = requests.get(url, verify=False)\n", + " if r.headers[\"content-type\"].startswith(\"text/html\"):\n", + " if self.encoding:\n", + " r.encoding = self.encoding\n", + " t = html5lib.parse(r.text, namespaceHTMLElements=False)\n", + "\n", + " for elt in t.findall(\".//*[@src]\"):\n", + " src = urljoin(url, elt.attrib.get(\"src\"))\n", + " # print (elt.tag, src, url_to_path(src))\n", + " local_link = self.localize(src)\n", + " elt.attrib[\"src\"] = urlquote(self.relpath(local_link, path))\n", + " for attribname in all_attributes:\n", + " for elt in t.findall(f\".//*[@{attribname}]\"):\n", + " href = urljoin(url, elt.attrib.get(attribname))\n", + " href, fragment = split_fragment(href)\n", + " if self.preserve_query:\n", + " href_noquery, query = split_query(href)\n", + " else:\n", + " query = ''\n", + " # print (elt.tag, href, url_to_path(href))\n", + " if (elt.tag == \"link\" and elt.attrib.get(\"rel\") == \"stylesheet\") or \\\n", + " (elt.tag == \"a\" and self.should_localize(href)) or \\\n", + " (attribname in additional_attributes and self.should_localize(href)):\n", + " # localize: force/ensure download href, return local path\n", + " local_link = self.localize(href)\n", + " # need path == current document path\n", + " elt.attrib[attribname] = urlquote(self.relpath(local_link, path)) + query + fragment\n", + "\n", + " with open(usepath, \"w\") as fout:\n", + " print(ET.tostring(t, method=\"html\", encoding=\"unicode\"), file=fout)\n", + " elif r.headers[\"content-type\"] == \"text/css\":\n", + " if self.encoding:\n", + " 
r.encoding = self.encoding\n", + " src = r.text\n", + " def css_sub(m):\n", + " href = urljoin(url,\n", + " if self.should_localize(href):\n", + " local_link = self.localize(href)\n", + " return \"url(\", path))\")\"\n", + " return \n", + " newsrc = re.sub(r\"\"\"url\\((['\" ]*)(.+?)(['\" ]*)\\)\"\"\", css_sub, src)\n", + " with open(usepath, \"w\") as fout:\n", + " print(newsrc, file=fout)\n", + " else:\n", + " # print (\"Downloading binary...\")\n", + " with open(usepath, 'wb') as fd:\n", + " for chunk in r.iter_content(chunk_size=1024):\n", + " fd.write(chunk)\n", + " except Exception as e:\n", + " print (f\"Exception {url}: {e}\", file=sys.stderr)\n", + " \n", + " def spider (self, url):\n", + " self.done = set()\n", + " self.todo = [url]\n", + " count = 0\n", + " while self.todo:\n", + " url = self.todo[0]\n", + " self.todo = self.todo[1:]\n", + " self.done.add(url)\n", + "\n", + " count +=1 " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# cleanup\n", + "!rm -rf sandbox_archive/" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# remove all html\n", + "def rm_html (path):\n", + " rmlist = []\n", + " for root, dirs, files in os.walk(\"sandbox_archive\"):\n", + " for f in files:\n", + " if os.path.splitext(f)[1] == \".html\":\n", + " rmlist.append(os.path.join(root, f))\n", + " for f in rmlist:\n", + " print (f)\n", + " os.remove(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rm_html(\"sandbox_archive\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "spider = Spider(\"sandbox_archive\", skip_existing_files=True, additional_attributes=\"data-url\", encoding=\"utf-8\")\n", + "spider.add_pattern(r\"^https?://hub\\.xpub\\.nl/sandbox/$\", \"index.html\")\n", + 
"spider.add_pattern(r\"^https?://hub\\.xpub\\.nl/sandbox/(.+)$\", \"\\g<1>\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "spider.spider(\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}