please look from Download all the images of our page

master
eunalee 4 years ago
parent 75b3b84380
commit 1450807cc0

@ -0,0 +1,555 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MediaWiki API: Download Images"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import urllib\n",
"import urllib.parse\n",
"import urllib.request\n",
"from html import escape\n",
"\n",
"from IPython.display import JSON # iPython JSON renderer"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Let's first test it with one image.\n",
"# For example: File:Debo 009 05 01.jpg\n",
"\n",
"# Turn the page title into the form that the API query below uses:\n",
"# spaces become _ and the .jpg file extension is dropped.\n",
"filename = 'Debo 009 05 01.jpg'.replace(' ', '_').replace('.jpg', '')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/json": {
"batchcomplete": "",
"continue": {
"aicontinue": "Deck_6.jpg",
"continue": "-||"
},
"query": {
"allimages": [
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33518",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Debo_009_05_01.jpg",
"name": "Debo_009_05_01.jpg",
"ns": 6,
"timestamp": "2021-01-21T14:54:44Z",
"title": "File:Debo 009 05 01.jpg",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/c/c8/Debo_009_05_01.jpg"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=14589",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Debord-societysml.gif",
"name": "Debord-societysml.gif",
"ns": 6,
"timestamp": "2014-11-30T00:19:20Z",
"title": "File:Debord-societysml.gif",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/b/ba/Debord-societysml.gif"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=4462",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Dec_6_AWU.pdf",
"name": "Dec_6_AWU.pdf",
"ns": 6,
"timestamp": "2011-12-06T15:23:11Z",
"title": "File:Dec 6 AWU.pdf",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/7/70/Dec_6_AWU.pdf"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=4463",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Dec_6_AWUII.pdf",
"name": "Dec_6_AWUII.pdf",
"ns": 6,
"timestamp": "2011-12-06T16:34:43Z",
"title": "File:Dec 6 AWUII.pdf",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/f/fd/Dec_6_AWUII.pdf"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=2090",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:December.gif",
"name": "December.gif",
"ns": 6,
"timestamp": "2010-12-14T21:07:54Z",
"title": "File:December.gif",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/3/3f/December.gif"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33093",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Deck_1.jpg",
"name": "Deck_1.jpg",
"ns": 6,
"timestamp": "2020-11-23T14:31:00Z",
"title": "File:Deck 1.jpg",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/7/74/Deck_1.jpg"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33095",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Deck_2.jpg",
"name": "Deck_2.jpg",
"ns": 6,
"timestamp": "2020-11-23T14:31:00Z",
"title": "File:Deck 2.jpg",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/0/08/Deck_2.jpg"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33084",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Deck_3.jpg",
"name": "Deck_3.jpg",
"ns": 6,
"timestamp": "2020-11-23T14:30:52Z",
"title": "File:Deck 3.jpg",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/f/f5/Deck_3.jpg"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33088",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Deck_4.jpg",
"name": "Deck_4.jpg",
"ns": 6,
"timestamp": "2020-11-23T14:30:52Z",
"title": "File:Deck 4.jpg",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/2/24/Deck_4.jpg"
},
{
"descriptionshorturl": "https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33085",
"descriptionurl": "https://pzwiki.wdka.nl/mediadesign/File:Deck_5.jpg",
"name": "Deck_5.jpg",
"ns": 6,
"timestamp": "2020-11-23T14:30:52Z",
"title": "File:Deck 5.jpg",
"url": "https://pzwiki.wdka.nl/mw-mediadesign/images/9/93/Deck_5.jpg"
}
]
}
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 5,
"metadata": {
"application/json": {
"expanded": false,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# list=allimages enumerates the files of the wiki; the \"aifrom=\" parameter sets the file name to start enumerating from,\n",
"# so the file we are looking for is expected to be the first result.\n",
"# Note: ai=allimages\n",
"# urlencode() percent-escapes the file name, so characters like & or # in it cannot break the query string.\n",
"params = urllib.parse.urlencode({'action': 'query', 'list': 'allimages', 'aifrom': filename, 'format': 'json'})\n",
"url = f'https://pzwiki.wdka.nl/mw-mediadesign/api.php?{ params }'\n",
"with urllib.request.urlopen(url) as api_response: # the with-block closes the connection after reading\n",
"    response = api_response.read()\n",
"data = json.loads(response)\n",
"JSON(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Select the first result [0]: \"aifrom=\" only says where the listing starts, so if our file does not exist\n",
"# the first result is simply another file of the wiki. Let's check that we really got the image that we need :)\n",
"results = data['query']['allimages']\n",
"assert results, f'The wiki returned no files for { filename }'\n",
"image = results[0]\n",
"assert image['name'].lower().startswith(filename.lower()), f\"Expected { filename }, but got { image['name'] }\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'name': 'Debo_009_05_01.jpg', 'timestamp': '2021-01-21T14:54:44Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/c/c8/Debo_009_05_01.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Debo_009_05_01.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33518', 'ns': 6, 'title': 'File:Debo 009 05 01.jpg'}\n"
]
}
],
"source": [
"print(image)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://pzwiki.wdka.nl/mw-mediadesign/images/c/c8/Debo_009_05_01.jpg\n"
]
}
],
"source": [
"print(image['url'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use this URL to download the image!"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"image_url = image['url']\n",
"image_filename = image['name']\n",
"# We use urllib for this again, this is basically our tool to download things from the web !\n",
"with urllib.request.urlopen(image_url) as download: # the with-block closes the connection once the bytes are read\n",
"    image_response = download.read()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download all the images of our page"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'name': 'Debo_009_05_01.jpg', 'timestamp': '2021-01-21T14:54:44Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/c/c8/Debo_009_05_01.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Debo_009_05_01.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33518', 'ns': 6, 'title': 'File:Debo 009 05 01.jpg'}, {'name': 'Debord-societysml.gif', 'timestamp': '2014-11-30T00:19:20Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/b/ba/Debord-societysml.gif', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Debord-societysml.gif', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=14589', 'ns': 6, 'title': 'File:Debord-societysml.gif'}, {'name': 'Dec_6_AWU.pdf', 'timestamp': '2011-12-06T15:23:11Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/7/70/Dec_6_AWU.pdf', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Dec_6_AWU.pdf', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=4462', 'ns': 6, 'title': 'File:Dec 6 AWU.pdf'}, {'name': 'Dec_6_AWUII.pdf', 'timestamp': '2011-12-06T16:34:43Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/f/fd/Dec_6_AWUII.pdf', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Dec_6_AWUII.pdf', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=4463', 'ns': 6, 'title': 'File:Dec 6 AWUII.pdf'}, {'name': 'December.gif', 'timestamp': '2010-12-14T21:07:54Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/3/3f/December.gif', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:December.gif', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=2090', 'ns': 6, 'title': 'File:December.gif'}, {'name': 'Deck_1.jpg', 'timestamp': '2020-11-23T14:31:00Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/7/74/Deck_1.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_1.jpg', 'descriptionshorturl': 
'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33093', 'ns': 6, 'title': 'File:Deck 1.jpg'}, {'name': 'Deck_2.jpg', 'timestamp': '2020-11-23T14:31:00Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/0/08/Deck_2.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_2.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33095', 'ns': 6, 'title': 'File:Deck 2.jpg'}, {'name': 'Deck_3.jpg', 'timestamp': '2020-11-23T14:30:52Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/f/f5/Deck_3.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_3.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33084', 'ns': 6, 'title': 'File:Deck 3.jpg'}, {'name': 'Deck_4.jpg', 'timestamp': '2020-11-23T14:30:52Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/2/24/Deck_4.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_4.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33088', 'ns': 6, 'title': 'File:Deck 4.jpg'}, {'name': 'Deck_5.jpg', 'timestamp': '2020-11-23T14:30:52Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/9/93/Deck_5.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_5.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33085', 'ns': 6, 'title': 'File:Deck 5.jpg'}]\n"
]
}
],
"source": [
"# We have our variable \"images\"\n",
"images = data['query']['allimages']\n",
"\n",
"print(images)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Debo_009_05_01.jpg\n",
"Debord-societysml.gif\n",
"Dec_6_AWU.pdf\n",
"Dec_6_AWUII.pdf\n",
"December.gif\n",
"Deck_1.jpg\n",
"Deck_2.jpg\n",
"Deck_3.jpg\n",
"Deck_4.jpg\n",
"Deck_5.jpg\n",
"['Debo_009_05_01.jpg', 'Debord-societysml.gif', 'Dec_6_AWU.pdf', 'Dec_6_AWUII.pdf', 'December.gif', 'Deck_1.jpg', 'Deck_2.jpg', 'Deck_3.jpg', 'Deck_4.jpg', 'Deck_5.jpg']\n"
]
}
],
"source": [
"# Keep only the 'name' of every file record returned by the API\n",
"images1 = data['query']['allimages']\n",
"images = [record['name'] for record in images1]\n",
"\n",
"# Print the names one per line ...\n",
"for filename in images:\n",
"    print(filename)\n",
"\n",
"# ... and once more as a complete list\n",
"print(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading: Debo_009_05_01.jpg\n",
"Downloading: Debord-societysml.gif\n",
"Downloading: Dec_6_AWU.pdf\n",
"Downloading: Dec_6_AWUII.pdf\n",
"Downloading: December.gif\n",
"Downloading: Deck_1.jpg\n",
"Downloading: Deck_2.jpg\n",
"Downloading: Deck_3.jpg\n",
"Downloading: Deck_4.jpg\n",
"Downloading: Deck_5.jpg\n"
]
}
],
"source": [
"# Let's loop through this list and download each image!\n",
"for filename in images:\n",
"    print('Downloading:', filename)\n",
"    \n",
"    filename = filename.replace(' ', '_') # let's replace spaces again with _\n",
"    for extension in ('.jpg', '.gif', '.png', '.jpeg', '.JPG', '.JPEG'): # and let's remove the file extension\n",
"        filename = filename.replace(extension, '')\n",
"    \n",
"    # first we search for the full URL of the image\n",
"    # (urlencode() percent-escapes the name, so characters like & or # cannot break the query string)\n",
"    params = urllib.parse.urlencode({'action': 'query', 'list': 'allimages', 'aifrom': filename, 'format': 'json'})\n",
"    url = f'https://pzwiki.wdka.nl/mw-mediadesign/api.php?{ params }'\n",
"    with urllib.request.urlopen(url) as api_response: # the with-block closes the connection after reading\n",
"        response = api_response.read()\n",
"    # stored as \"lookup\" and not as \"data\", so the API response of the cells above is not overwritten\n",
"    lookup = json.loads(response)\n",
"    results = lookup['query']['allimages']\n",
"    \n",
"    # \"aifrom=\" only says where the listing starts: if the file is missing, the first result is another file.\n",
"    # In that case we skip it instead of saving the wrong file.\n",
"    if not results or not results[0]['name'].lower().startswith(filename.lower()):\n",
"        print('Not found on the wiki, skipping:', filename)\n",
"        continue\n",
"    image = results[0]\n",
"    \n",
"    # then we download the image\n",
"    image_url = image['url']\n",
"    image_filename = image['name']\n",
"    with urllib.request.urlopen(image_url) as download:\n",
"        image_response = download.read()\n",
"    \n",
"    # and we save it as a file (the with-block closes the file, also when write() fails)\n",
"    with open(image_filename, 'wb') as out:\n",
"        out.write(image_response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filename"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Debo_009_05_01.jpg\n",
"Debord-societysml.gif\n",
"Dec_6_AWU.pdf\n",
"Dec_6_AWUII.pdf\n",
"December.gif\n",
"Deck_1.jpg\n",
"Deck_2.jpg\n",
"Deck_3.jpg\n",
"Deck_4.jpg\n",
"Deck_5.jpg\n"
]
}
],
"source": [
"tags = []\n",
"\n",
"for imagelink in images:\n",
"    print(imagelink)\n",
"    \n",
"    # let's use the \"safe\" pagenames for the filenames \n",
"    # by replacing the ' ' with '_'\n",
"    filename = imagelink.replace(' ', '_')\n",
"    # escape() keeps characters like & or \" in a file name from breaking the src attribute\n",
"    src = escape(filename, quote=True)\n",
"    \n",
"    # PDF files get an <iframe>, everything else an <img>.\n",
"    # We look at the end of the name, so '.pdf' in the middle of a name does not count and '.PDF' does.\n",
"    if filename.lower().endswith('.pdf'):\n",
"        tags.append(f'<iframe src=\"{ src }\"></iframe>')\n",
"    else:\n",
"        tags.append(f'<img src=\"{ src }\">')\n",
"\n",
"# one tag per line, every line ends with a newline\n",
"html = ''.join(tag + '\\n' for tag in tags)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<img src=\"Debo_009_05_01.jpg\">\n",
"<img src=\"Debord-societysml.gif\">\n",
"<iframe src=\"Dec_6_AWU.pdf\"></iframe>\n",
"<iframe src=\"Dec_6_AWUII.pdf\"></iframe>\n",
"<img src=\"December.gif\">\n",
"<img src=\"Deck_1.jpg\">\n",
"<img src=\"Deck_2.jpg\">\n",
"<img src=\"Deck_3.jpg\">\n",
"<img src=\"Deck_4.jpg\">\n",
"<img src=\"Deck_5.jpg\">\n",
"\n"
]
}
],
"source": [
"print(html)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"# make sure the output folder exists, otherwise open() fails with FileNotFoundError\n",
"os.makedirs('image', exist_ok=True)\n",
"\n",
"# NOTE(review): the download loop above saves the files into the notebook's working folder, while this page\n",
"# is written into image/ -- check that the relative src paths in the HTML point to the downloaded files\n",
"with open('image/imageimage.html', 'w') as output: # the with-block closes the file, also when write() fails\n",
"    output.write(html)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading…
Cancel
Save