updating the mediawiki api notebooks

master
manetta 4 years ago
parent 3523824c8b
commit dc63b4e9f7

@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Images from one Wikipage\n",
"\n",
"(using the Mediawiki API)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import urllib\n",
"import json\n",
"from IPython.display import JSON # iPython JSON renderer\n",
"import sys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### API request > list of image filenames"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wiki = 'https://pzwiki.wdka.nl/mw-mediadesign' # no slash at the end!\n",
"page = 'Category:Situationist_Times'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = f'{ wiki }/api.php?action=parse&prop=images&page={ page }&format=json'\n",
"response = urllib.request.urlopen(url).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"images = data['parse']['images']\n",
"# images = ['FILENAME.jpg', 'FILENAME2.jpg']\n",
"\n",
"# We have our variable \"images\"\n",
"print(images)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Downloading the image files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's loop through this list and download each image!\n",
"for filename in images:\n",
" print('Downloading:', filename)\n",
" \n",
" filename = filename.replace(' ', '_') # let's replace spaces again with _\n",
" filename = filename.replace('.jpg', '').replace('.gif', '').replace('.png','').replace('.jpeg','').replace('.JPG','').replace('.JPEG','') # and let's remove the file extension\n",
" \n",
" # first we search for the full filename of the image\n",
" url = f'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&list=allimages&aifrom={ filename }&format=json'\n",
" response = urllib.request.urlopen(url).read()\n",
" data = json.loads(response)\n",
" \n",
" # we select the first search result\n",
" # (assuming that this is the image we are looking for)\n",
" image = data['query']['allimages'][0]\n",
" \n",
" # then we download the image\n",
" image_url = image['url']\n",
" image_filename = image['name']\n",
" image_response = urllib.request.urlopen(image_url).read()\n",
" \n",
" # and we save it as a file\n",
" out = open(image_filename, 'wb') \n",
" out.write(image_response)\n",
" out.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -13,6 +13,7 @@
"source": [
"This notebook:\n",
"\n",
"* continues with exploring the connections between `Hypertext` & `Dérive`\n",
"* uses the `query` & `parse` actions of the `MediaWiki API`, which we can use to work with wiki pages as (versioned and hypertextual) technotexts\n",
"\n",
"## Epicpedia\n",

@ -0,0 +1,217 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import urllib\n",
"import json\n",
"from IPython.display import JSON # iPython JSON renderer\n",
"import sys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download all the images from one wikipedia page :)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wikipediapage = 'Sculpture'\n",
"\n",
"#https://en.wikipedia.org/wiki/Sculpture"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = f'https://en.wikipedia.org/w/api.php?action=parse&prop=images&page={ wikipediapage }&format=json'\n",
"response = urllib.request.urlopen(url).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# We have our variable \"images\"\n",
"images = data['parse']['images']\n",
"\n",
"print(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#ctrl + ? => remove all"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Let's loop through this list and download each image!\n",
"for filename in images:\n",
" try:\n",
" print('Downloading:', filename)\n",
"\n",
" filename = filename.replace(' ', '_') # let's replace spaces again with _\n",
" filename = filename.replace('.jpg', '').replace('.gif', '').replace('.png','').replace('.jpeg','').replace('.JPG','').replace('.JPEG','') # and let's remove the file extension\n",
"\n",
" # first we search for the full URL of the image\n",
" url = f'https://commons.wikimedia.org/w/api.php?action=query&list=allimages&aifrom={ filename }&format=json'\n",
" response = urllib.request.urlopen(url).read()\n",
" data = json.loads(response)\n",
" image = data['query']['allimages'][0]\n",
"\n",
" # then we download the image\n",
" image_url = image['url']\n",
" image_filename = image['name']\n",
" image_response = urllib.request.urlopen(image_url).read()\n",
"\n",
" # and we save it as a file\n",
" out = open(\"wikiimage/\"+image_filename, 'wb') \n",
" out.write(image_response)\n",
" out.close()\n",
" \n",
" except:\n",
" error = sys.exc_info()[0]\n",
" print('Skipped:', image)\n",
" print('With the error:', error)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"html = ''\n",
"\n",
"for imagelink in images:\n",
" print(imagelink)\n",
" \n",
" # let's use the \"safe\" pagenames for the filenames \n",
" # by replacing the ' ' with '_'\n",
" filename = imagelink.replace(' ', '_')\n",
" \n",
" if '.pdf' in filename:\n",
    "a = f'<iframe src=\"{ filename }\"></iframe>'\n",
" else:\n",
" a = f'<img src=\"{ filename }\">'\n",
"\n",
" html += a\n",
" html += '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(html)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output = open('wikiimage/imageimage.html', 'w')\n",
"output.write(html)\n",
"output.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#git pull\n",
"#git status\n",
"#git add FILENAME\n",
"#git commit -m \"write a msg\"\n",
"#git push"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -4,105 +4,28 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Mediawiki API Download Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's first test it with one image.\n",
"# For example: File:Debo 009 05 01.jpg\n",
"# Download Images from one Wikipage\n",
"\n",
"filename = 'Debo 009 05 01.jpg'\n",
"filename = filename.replace(' ', '_') # let's replace spaces again with _\n",
"filename = filename.replace('.jpg', '') # and let's remove the file extension"
"(using the Mediawiki API)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We cannot ask the API for the URL of a specific image (:///), but we can still find it using the \"aifrom=\" parameter.\n",
"# Note: ai=allimages\n",
"url = f'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&list=allimages&aifrom={ filename }&format=json'\n",
"response = urllib.request.urlopen(url).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
"import urllib\n",
"import json\n",
"from IPython.display import JSON # iPython JSON renderer\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select the first result [0], let's assume that that is always the right image that we need :)\n",
"image = data['query']['allimages'][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(image)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(image['url'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use this URL to download the images!"
"### API request > list of image filenames"
]
},
{
@ -111,25 +34,20 @@
"metadata": {},
"outputs": [],
"source": [
"image_url = image['url']\n",
"image_filename = image['name']\n",
"image_response = urllib.request.urlopen(image_url).read() # We use urllib for this again, this is basically our tool to download things from the web !"
"wiki = 'https://pzwiki.wdka.nl/mw-mediadesign' # no slash at the end!\n",
"page = 'Category:Situationist_Times'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(image_response)"
"url = f'{ wiki }/api.php?action=parse&prop=images&page={ page }&format=json'\n",
"response = urllib.request.urlopen(url).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
@ -138,33 +56,18 @@
"metadata": {},
"outputs": [],
"source": [
"out = open(image_filename, 'wb') # 'wb' stands for 'write bytes', we basically ask this file to accept data in byte format\n",
"out.write(image_response)\n",
"out.close()"
"images = data['parse']['images']\n",
"# images = ['FILENAME.jpg', 'FILENAME2.jpg']\n",
"\n",
"# We have our variable \"images\"\n",
"print(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download all the images of our page"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We have our variable \"images\"\n",
"print(images)"
"### Downloading the image files"
]
},
{
@ -180,10 +83,13 @@
" filename = filename.replace(' ', '_') # let's replace spaces again with _\n",
" filename = filename.replace('.jpg', '').replace('.gif', '').replace('.png','').replace('.jpeg','').replace('.JPG','').replace('.JPEG','') # and let's remove the file extension\n",
" \n",
" # first we search for the full URL of the image\n",
" # first we search for the full filename of the image\n",
" url = f'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&list=allimages&aifrom={ filename }&format=json'\n",
" response = urllib.request.urlopen(url).read()\n",
" data = json.loads(response)\n",
" \n",
" # we select the first search result\n",
" # (assuming that this is the image we are looking for)\n",
" image = data['query']['allimages'][0]\n",
" \n",
" # then we download the image\n",

@ -0,0 +1,464 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MediaWiki API (Dérive)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook:\n",
"\n",
"* continues with exploring the connections between `Hypertext` & `Dérive`\n",
"* saves (parts of) wiki pages as html files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import urllib\n",
"import json\n",
"from IPython.display import JSON # iPython JSON renderer\n",
"import sys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parse"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's use another wiki this time: the English Wikipedia.\n",
"\n",
    "You can pick any page, I took the Hypertext page for this notebook as an example: https://en.wikipedia.org/wiki/Hypertext"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# parse the wiki page Hypertext\n",
"request = 'https://en.wikipedia.org/w/api.php?action=parse&page=Hypertext&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wiki links dérive\n",
"\n",
"Select the wiki links from the `data` response:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"links = data['parse']['links']\n",
"JSON(links)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's save the links as a list of pagenames, to make it look like this:\n",
"\n",
"`['hyperdocuments', 'hyperwords', 'hyperworld']`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# How is \"links\" structured now?\n",
"print(links)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It helps to copy paste a small part of the output first:\n",
"\n",
"`[{'ns': 0, 'exists': '', '*': 'Metatext'}, {'ns': 0, '*': 'De man met de hoed'}]`\n",
"\n",
"and to write it differently with indentation:\n",
"\n",
"```\n",
"links = [\n",
" { \n",
" 'ns' : 0,\n",
" 'exists' : '',\n",
    " '*' : 'Metatext'\n",
" }, \n",
" {\n",
" 'ns' : 0,\n",
" 'exists' : '',\n",
" '*' : 'De man met de hoed'\n",
" } \n",
"]\n",
"```\n",
"\n",
"We can now loop through \"links\" and add all the pagenames to a new list called \"wikilinks\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"wikilinks = []\n",
"\n",
"for link in links:\n",
" \n",
" print('link:', link)\n",
" \n",
" for key, value in link.items():\n",
" print('----- key:', key)\n",
" print('----- value:', value)\n",
" print('-----')\n",
" \n",
" pagename = link['*']\n",
" print('===== pagename:', pagename)\n",
" \n",
" wikilinks.append(pagename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"wikilinks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Saving the links in a HTML page\n",
"\n",
"Let's convert the list of pagenames into HTML link elements (`<a href=\"\"></a>`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"html = ''\n",
"\n",
"for wikilink in wikilinks:\n",
" print(wikilink)\n",
" \n",
" # let's use the \"safe\" pagenames for the filenames \n",
" # by replacing the ' ' with '_'\n",
" filename = wikilink.replace(' ', '_')\n",
" \n",
" a = f'<a href=\"{ filename }.html\">{ wikilink }</a>'\n",
" html += a\n",
" html += '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(html)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
    "# Let's save this page in a separate folder, I called it \"mediawiki-api-dérive\"\n",
"# We can make this folder here using a terminal command, but you can also do it in the interface on the left\n",
"! mkdir mediawiki-api-dérive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output = open('mediawiki-api-dérive/Hypertext.html', 'w')\n",
"output.write(html)\n",
"output.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Recursive parsing\n",
"\n",
"We can now repeat the steps for each wikilink that we collected!\n",
"\n",
"We can make an API request for each wikilink, \\\n",
"ask for all the links on the page \\\n",
"and save it as an HTML page."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# First we save the Hypertext page again:\n",
"\n",
"startpage = 'Hypertext'\n",
"\n",
"# parse the first wiki page\n",
"request = f'https://en.wikipedia.org/w/api.php?action=parse&page={ startpage }&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)\n",
"\n",
"# select the links\n",
"links = data['parse']['links']\n",
"\n",
"# turn it into a list of pagenames\n",
"wikilinks = []\n",
"for link in links:\n",
" pagename = link['*']\n",
" wikilinks.append(pagename)\n",
"\n",
"# turn the wikilinks into a set of <a href=\"\"></a> links\n",
"html = ''\n",
"for wikilink in wikilinks:\n",
" filename = wikilink.replace(' ', '_')\n",
" a = f'<a href=\"{ filename }.html\">{ wikilink }</a>'\n",
" html += a\n",
" html += '\\n'\n",
"\n",
"# save it as a HTML page\n",
"startpage = startpage.replace(' ', '_') # let's again stay safe on the filename side\n",
"output = open(f'mediawiki-api-dérive/{ startpage }.html', 'w')\n",
"output.write(html)\n",
"output.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Then we loop through the list of wikilinks\n",
"# and repeat the steps for each page\n",
" \n",
"for wikilink in wikilinks:\n",
" \n",
" # let's copy the current wikilink pagename, to avoid confusion later\n",
" currentwikilink = wikilink \n",
" print('Now requesting:', currentwikilink)\n",
" \n",
" # parse this wiki page\n",
" wikilink = wikilink.replace(' ', '_')\n",
" request = f'https://en.wikipedia.org/w/api.php?action=parse&page={ wikilink }&format=json'\n",
" \n",
" # --> we insert a \"try and error\" condition, \n",
" # to catch errors in case a page does not exist \n",
" try: \n",
" \n",
" # continue the parse request\n",
" response = urllib.request.urlopen(request).read()\n",
" data = json.loads(response)\n",
" JSON(data)\n",
"\n",
" # select the links\n",
" links = data['parse']['links']\n",
"\n",
" # turn it into a list of pagenames\n",
" wikilinks = []\n",
" for link in links:\n",
" pagename = link['*']\n",
" wikilinks.append(pagename)\n",
"\n",
" # turn the wikilinks into a set of <a href=\"\"></a> links\n",
" html = ''\n",
" for wikilink in wikilinks:\n",
" filename = wikilink.replace(' ', '_')\n",
" a = f'<a href=\"{ filename }.html\">{ wikilink }</a>'\n",
" html += a\n",
" html += '\\n'\n",
"\n",
" # save it as a HTML page\n",
" currentwikilink = currentwikilink.replace(' ', '_') # let's again stay safe on the filename side\n",
" output = open(f'mediawiki-api-dérive/{ currentwikilink }.html', 'w')\n",
" output.write(html)\n",
" output.close()\n",
" \n",
" except:\n",
" error = sys.exc_info()[0]\n",
" print('Skipped:', wikilink)\n",
" print('With the error:', error)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## What's next?\n",
"\n",
"?\n",
"\n",
"You could add more loops to the recursive parsing, adding more layers ...\n",
"\n",
"You could request all images of a page (instead of links) ...\n",
"\n",
"or something else the API offers ... (contributors, text, etc)\n",
"\n",
"or ..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,438 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MediaWiki API (part 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook:\n",
"\n",
"* uses the `query` & `parse` actions of the `MediaWiki API`, which we can use to work with wiki pages as (versioned and hypertextual) technotexts\n",
"\n",
"## Epicpedia\n",
"\n",
"Reference: Epicpedia (2008), Annemieke van der Hoek \\\n",
"(from: https://diversions.constantvzw.org/wiki/index.php?title=Eventual_Consistency#Towards_diffractive_technotexts)\n",
"\n",
"> In Epicpedia (2008), Annemieke van der Hoek creates a work that makes use of the underlying history that lies beneath the surface of each Wikipedia article.[20] Inspired by the work of Berthold Brecht and the notion of Epic Theater, Epicpedia presents Wikipedia articles as screenplays, where each edit becomes an utterance performed by a cast of characters (both major and minor) that takes place over a span of time, typically many years. The work uses the API of wikipedia to retrieve for a given article the sequence of revisions, their corresponding user handles, the summary message (that allows editors to describe the nature of their edit), and the timestamp to then produce a differential reading. \n",
"\n",
"![](https://diversions.constantvzw.org/wiki/images/b/b0/Epicpedia_EpicTheater02.png)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import urllib\n",
"import json\n",
"from IPython.display import JSON # iPython JSON renderer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query & Parse"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will work again with the `Dérive` page on the wiki: https://pzwiki.wdka.nl/mediadesign/D%C3%A9rive (i moved it here, to make the URL a bit simpler)\n",
"\n",
"And use the `API help page` on the PZI wiki as our main reference: https://pzwiki.wdka.nl/mw-mediadesign/api.php"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# query the wiki page Dérive\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&titles=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# parse the wiki page Dérive\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=parse&page=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Links, contributors, edit history"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can ask the API for different kind of material/information about the page.\n",
"\n",
"Such as:\n",
"\n",
"* a list of wiki links\n",
"* a list of external links\n",
"* a list of images\n",
"* a list of edits\n",
"* a list of contributors\n",
"* page information\n",
"* reverse links (What links here?)\n",
"* ...\n",
"\n",
"We can use the query action again, to ask for these things:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# wiki links: prop=links\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&prop=links&titles=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# external links: prop=extlinks\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&prop=extlinks&titles=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/json": {
"batchcomplete": "",
"query": {
"pages": {
"33524": {
"images": [
{
"ns": 6,
"title": "File:Debo 009 05 01.jpg"
},
{
"ns": 6,
"title": "File:Sex-majik-2004.gif"
}
],
"ns": 0,
"pageid": 33524,
"title": "Dérive"
}
}
}
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 3,
"metadata": {
"application/json": {
"expanded": false,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# images: prop=images\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&prop=images&titles=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# edit history: prop=revisions\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&prop=revisions&titles=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# contributors: prop=contributors\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&prop=contributors&titles=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# page information: prop=info\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&prop=info&titles=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
    "# reverse links (What links here?): prop=linkshere + lhlimit=100 (max. nr of results)\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&prop=linkshere&lhlimit=100&titles=Prototyping&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use the `data` responses in Python (and save data in variables)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For example with the action=parse request\n",
"request = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=parse&page=D%C3%A9rive&format=json'\n",
"response = urllib.request.urlopen(request).read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"text = data['parse']['text']['*']\n",
"print(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"title = data['parse']['title']\n",
"print(title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"images = data['parse']['images']\n",
"print(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use these variables to generate HTML pages "
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# open a HTML file to write to \n",
"output = open('myfilename.html', 'w')"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2813"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# write to this HTML file you just opened\n",
"output.write(text)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# close the file again (Jupyter needs this to actually write a file)\n",
"output.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use these variables to generate HTML pages (using the template language Jinja)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Jinja (template language): https://jinja.palletsprojects.com/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,731 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Today's iPython errors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to hide the warning for today\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\", category=DeprecationWarning)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MediaWiki API"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's start with an API request example, using the PZI wiki:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# When you visit .... https://pzwiki.wdka.nl/mw-mediadesign/ ........ the URL magically turns into ........ https://pzwiki.wdka.nl/mediadesign/Main_Page\n",
"# This is probably something configured on the server (which is the XPUB XVM server, the wiki is installed there)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# How to access the API?\n",
"\n",
"# Visit in the browser: "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://pzwiki.wdka.nl/mw-mediadesign/api.php (This is the main access point of the API of the PZI wiki.)\n",
"\n",
"https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&titles=Main%20page&format=json (This is an example of an API request.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# What's in this URL?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# api.php "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ?action=query"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# &"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# &titles=Main%20page"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# &format=json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Documentation page of the MediaWiki API: https://pzwiki.wdka.nl/mw-mediadesign/api.php"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dérive in the API"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wander around in the documentation page, edit the URL, make a couple requests!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Try to use the actions: \"query\" and \"parse\". \n",
"# We will focus on these two today."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# (paste your requests on the pad)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use the API in a Notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Using urllib & json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import urllib\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = 'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&titles=Main%20page&format=json'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"request = urllib.request.urlopen(url).read()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = json.loads(request)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Display JSON in Notebooks nicely, using iPython"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import JSON"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Try different *query* and *parse* actions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's write the URL in two parts:\n",
"# - main domain\n",
"# - API request"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wiki = 'https://pzwiki.wdka.nl/mw-mediadesign'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = f'{ wiki }/api.php?action=query&titles=Category:Situationist_Times&format=json'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"parse = f'{ wiki }/api.php?action=parse&page=Category:Situationist_Times&format=json'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"querylinks = f'{ wiki }/api.php?action=query&prop=links&titles=Main%20Page&format=json'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Documentation page for query: https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=help&modules=query\n",
"\n",
"Documentation page for parse: https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=help&modules=parse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# make the request here in the notebook\n",
"request = urllib.request.urlopen(url).read()\n",
"data = json.loads(request)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save HTML as files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# try to use .open() and .write() to open and write the HTML to a file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## $ cp to /var/www/html/PrototypingTimes/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's publish the HTML files that you just created"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can use terminal commands here (\"bash commands\" to be more precise), by using the \"!\" as the first character in a cell.\n",
"# For example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! figlet hello"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# So, to copy files and folders over to the \"PrototypingTimes\" folder, we can use $ cp (from copy).\n",
"# The folder \"PrototypingTimes\" sits on the Sandbot server on this path: /var/www/html/PrototypingTimes/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# /var/www/html/PrototypingTimes/ == https://hub.xpub.nl/sandbot/PrototypingTimes/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# So to copy a file there, you can use this command:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! cp YOURFUNNYFILENAME.html /var/www/html/PrototypingTimes/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# And in case you want to copy over folders, you can use $ cp -r (-r for recursive)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! cp -r YOURFOLDERNAME /var/www/html/PrototypingTimes/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Let's also publish this notebook as a .html file?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# First, we can convert it to a .html file, using jupyter command line tools:\n",
"# (https://nbconvert.readthedocs.io/en/latest/usage.html)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! jupyter nbconvert YOURNOTEBOOK.ipynb --to html "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# And then, copy it to the /var/www/html/PrototypingTimes/ folder with $ cp (as we just did above)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading…
Cancel
Save