{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download all the images from one Wikipedia page :)\n",
    "\n",
    "This notebook asks the Wikipedia API which images a page uses, looks each one up on Wikimedia Commons, saves the files into `wikiimage/`, and finally writes a small HTML page that lists them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import sys\n",
    "import urllib\n",
    "import urllib.parse    # 'import urllib' alone does not load the submodules\n",
    "import urllib.request\n",
    "from html import escape\n",
    "\n",
    "from IPython.display import JSON # iPython JSON renderer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wikipediapage = 'Sculpture'\n",
    "\n",
    "#https://en.wikipedia.org/wiki/Sculpture\n",
    "\n",
    "# Everything this notebook writes goes into this folder (created if it is missing).\n",
    "OUTPUT_DIR = 'wikiimage'\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "# Sent with every request so the Wikimedia servers can identify this client.\n",
    "USER_AGENT = 'wikipedia-page-image-downloader/1.0 (Jupyter notebook)'\n",
    "\n",
    "# File extensions that are removed before searching Wikimedia Commons.\n",
    "IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch(url, timeout=30):\n",
    "    \"\"\"Download `url` and return the response body as bytes.\n",
    "\n",
    "    The request carries USER_AGENT and is abandoned after `timeout` seconds.\n",
    "    Errors raised by urllib (HTTPError, URLError) are passed on to the caller.\n",
    "    \"\"\"\n",
    "    request = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})\n",
    "    with urllib.request.urlopen(request, timeout=timeout) as response:\n",
    "        return response.read()\n",
    "\n",
    "\n",
    "def strip_image_extension(filename):\n",
    "    \"\"\"Return `filename` without a trailing image extension (any letter case).\n",
    "\n",
    "    Only the extensions in IMAGE_EXTENSIONS are removed, and only at the very\n",
    "    end of the name; every other name is returned unchanged.\n",
    "    \"\"\"\n",
    "    root, extension = os.path.splitext(filename)\n",
    "    if extension.lower() in IMAGE_EXTENSIONS:\n",
    "        return root\n",
    "    return filename"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# urlencode escapes spaces, '&' and non-ASCII characters in the page title\n",
    "page_query = urllib.parse.urlencode({\n",
    "    'action': 'parse',\n",
    "    'prop': 'images',\n",
    "    'page': wikipediapage,\n",
    "    'format': 'json',\n",
    "})\n",
    "url = f'https://en.wikipedia.org/w/api.php?{page_query}'\n",
    "response = fetch(url)\n",
    "data = json.loads(response)\n",
    "JSON(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# We have our variable \"images\"\n",
    "if 'error' in data:\n",
    "    # fail with a readable message instead of a KeyError on data['parse']\n",
    "    raise RuntimeError(f'Wikipedia API error for page {wikipediapage!r}: {data[\"error\"]}')\n",
    "\n",
    "images = data['parse']['images']\n",
    "\n",
    "print(images)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#ctrl + ?\n",
    "#=> remove all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Let's loop through this list and download each image!\n",
    "# The loop uses its own variable names so that `url` and `data` from the cells\n",
    "# above are not overwritten and those cells can be re-run afterwards.\n",
    "for filename in images:\n",
    "    print('Downloading:', filename)\n",
    "    try:\n",
    "        # let's replace spaces again with _ and remove the file extension\n",
    "        search_name = strip_image_extension(filename.replace(' ', '_'))\n",
    "\n",
    "        # first we search for the full URL of the image\n",
    "        # NOTE(review): 'aifrom' is a starting point for the listing, so the first\n",
    "        # result may be a different file than the one the page uses -- confirm.\n",
    "        search_query = urllib.parse.urlencode({\n",
    "            'action': 'query',\n",
    "            'list': 'allimages',\n",
    "            'aifrom': search_name,\n",
    "            'format': 'json',\n",
    "        })\n",
    "        search_url = f'https://commons.wikimedia.org/w/api.php?{search_query}'\n",
    "        search_data = json.loads(fetch(search_url))\n",
    "        matches = search_data['query']['allimages']\n",
    "        if not matches:\n",
    "            print('Skipped:', filename)\n",
    "            print('With the error:', 'no matching file on Wikimedia Commons')\n",
    "            continue\n",
    "        image = matches[0]\n",
    "\n",
    "        # then we download the image\n",
    "        image_url = image['url']\n",
    "        # basename: a name supplied by the API must never point outside OUTPUT_DIR\n",
    "        image_filename = os.path.basename(image['name'])\n",
    "        image_response = fetch(image_url)\n",
    "\n",
    "        # and we save it as a file ('with' closes it even if the write fails)\n",
    "        with open(os.path.join(OUTPUT_DIR, image_filename), 'wb') as out:\n",
    "            out.write(image_response)\n",
    "\n",
    "    except Exception as error:\n",
    "        # best effort: report the problem and carry on with the next image\n",
    "        print('Skipped:', filename)\n",
    "        print('With the error:', repr(error))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Build one line of HTML per image.\n",
    "# NOTE(review): the original templates were empty strings, so the page came out\n",
    "# blank; the tags below are a reconstruction -- confirm the intended markup.\n",
    "html_lines = []\n",
    "\n",
    "for imagelink in images:\n",
    "    print(imagelink)\n",
    "\n",
    "    # let's use the \"safe\" pagenames for the filenames\n",
    "    # by replacing the ' ' with '_'\n",
    "    filename = imagelink.replace(' ', '_')\n",
    "\n",
    "    # percent-encode for the URL, then escape for use inside an HTML attribute\n",
    "    src = escape(urllib.parse.quote(filename), quote=True)\n",
    "    label = escape(imagelink, quote=True)\n",
    "\n",
    "    if filename.lower().endswith('.pdf'):\n",
    "        # a PDF cannot be shown with <img>, so link to it instead\n",
    "        a = f'<a href=\"{src}\">{label}</a>'\n",
    "    else:\n",
    "        a = f'<img src=\"{src}\" alt=\"{label}\">'\n",
    "\n",
    "    html_lines.append(a + '\\n')\n",
    "\n",
    "html = ''.join(html_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# utf-8 so that image names with non-ASCII characters can always be written\n",
    "with open(os.path.join(OUTPUT_DIR, 'imageimage.html'), 'w', encoding='utf-8') as output:\n",
    "    output.write(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#git pull\n",
    "#git status\n",
    "#git add FILENAME\n",
    "#git commit -m \"write a msg\"\n",
    "#git push"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}