You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
18 KiB
18 KiB
MediaWiki API: Download Images¶
In [3]:
import json
import os
import urllib
import urllib.parse
import urllib.request
from html import escape

from IPython.display import JSON  # iPython JSON renderer
In [4]:
# Let's first test it with one image.
# For example: File:Debo 009 05 01.jpg
filename = 'Debo 009 05 01.jpg'

# let's replace spaces again with _ (the API reports file names with underscores)
filename = filename.replace(' ', '_')

# and let's remove the file extension -- but only when it really is the ending
# of the name: str.replace() would also cut a '.jpg' out of the middle of a name.
if filename.endswith('.jpg'):
    filename = filename[:-len('.jpg')]
In [ ]:
In [5]:
# We cannot ask the API for the URL of one specific image directly, but we can
# still find it using the "aifrom=" parameter.
# Note: ai=allimages
# The name is percent-encoded first, so that characters such as '&', '#' or
# spaces inside a file name cannot break the query string.
aifrom = urllib.parse.quote(filename, safe='')
url = f'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&list=allimages&aifrom={ aifrom }&format=json'

# The "with" block closes the connection as soon as the answer has been read.
with urllib.request.urlopen(url) as connection:
    response = connection.read()

data = json.loads(response)
JSON(data)
Out[5]:
<IPython.core.display.JSON object>
In [ ]:
In [6]:
# Keep only the first hit [0] of the listing; we simply trust that this is
# always the image we were looking for :)
all_hits = data['query']['allimages']
image = all_hits[0]
In [ ]:
In [7]:
# Print the whole dictionary of the image we just selected.
print(image)
{'name': 'Debo_009_05_01.jpg', 'timestamp': '2021-01-21T14:54:44Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/c/c8/Debo_009_05_01.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Debo_009_05_01.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33518', 'ns': 6, 'title': 'File:Debo 009 05 01.jpg'}
In [8]:
# Print only the value stored under the 'url' key of that dictionary.
print(image['url'])
https://pzwiki.wdka.nl/mw-mediadesign/images/c/c8/Debo_009_05_01.jpg
In [ ]:
Now we can use this URL to download the image!
In [9]:
image_url = image['url']
image_filename = image['name']

# We use urllib for this again, this is basically our tool to download things from the web !
# The "with" block makes sure the connection is closed once the bytes are read.
with urllib.request.urlopen(image_url) as connection:
    image_response = connection.read()
In [ ]:
In [ ]:
In [27]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Download all the images of our page¶
In [37]:
# The API answer is still stored in "data"; take its list of images,
# keep it in our variable "images" and have a look at it.
query_result = data['query']
images = query_result['allimages']
print(images)
[{'name': 'Debo_009_05_01.jpg', 'timestamp': '2021-01-21T14:54:44Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/c/c8/Debo_009_05_01.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Debo_009_05_01.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33518', 'ns': 6, 'title': 'File:Debo 009 05 01.jpg'}, {'name': 'Debord-societysml.gif', 'timestamp': '2014-11-30T00:19:20Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/b/ba/Debord-societysml.gif', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Debord-societysml.gif', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=14589', 'ns': 6, 'title': 'File:Debord-societysml.gif'}, {'name': 'Dec_6_AWU.pdf', 'timestamp': '2011-12-06T15:23:11Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/7/70/Dec_6_AWU.pdf', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Dec_6_AWU.pdf', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=4462', 'ns': 6, 'title': 'File:Dec 6 AWU.pdf'}, {'name': 'Dec_6_AWUII.pdf', 'timestamp': '2011-12-06T16:34:43Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/f/fd/Dec_6_AWUII.pdf', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Dec_6_AWUII.pdf', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=4463', 'ns': 6, 'title': 'File:Dec 6 AWUII.pdf'}, {'name': 'December.gif', 'timestamp': '2010-12-14T21:07:54Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/3/3f/December.gif', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:December.gif', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=2090', 'ns': 6, 'title': 'File:December.gif'}, {'name': 'Deck_1.jpg', 'timestamp': '2020-11-23T14:31:00Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/7/74/Deck_1.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_1.jpg', 'descriptionshorturl': 
'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33093', 'ns': 6, 'title': 'File:Deck 1.jpg'}, {'name': 'Deck_2.jpg', 'timestamp': '2020-11-23T14:31:00Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/0/08/Deck_2.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_2.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33095', 'ns': 6, 'title': 'File:Deck 2.jpg'}, {'name': 'Deck_3.jpg', 'timestamp': '2020-11-23T14:30:52Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/f/f5/Deck_3.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_3.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33084', 'ns': 6, 'title': 'File:Deck 3.jpg'}, {'name': 'Deck_4.jpg', 'timestamp': '2020-11-23T14:30:52Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/2/24/Deck_4.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_4.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33088', 'ns': 6, 'title': 'File:Deck 4.jpg'}, {'name': 'Deck_5.jpg', 'timestamp': '2020-11-23T14:30:52Z', 'url': 'https://pzwiki.wdka.nl/mw-mediadesign/images/9/93/Deck_5.jpg', 'descriptionurl': 'https://pzwiki.wdka.nl/mediadesign/File:Deck_5.jpg', 'descriptionshorturl': 'https://pzwiki.wdka.nl/mw-mediadesign/index.php?curid=33085', 'ns': 6, 'title': 'File:Deck 5.jpg'}]
In [45]:
# "images1" is the list of dictionaries from the API answer,
# "images" becomes a plain list with only the file names.
images1 = data['query']['allimages']
images = [entry['name'] for entry in images1]

# Show the names one by one first, and then the list as a whole.
for filename in images:
    print(filename)
print(images)
Debo_009_05_01.jpg Debord-societysml.gif Dec_6_AWU.pdf Dec_6_AWUII.pdf December.gif Deck_1.jpg Deck_2.jpg Deck_3.jpg Deck_4.jpg Deck_5.jpg ['Debo_009_05_01.jpg', 'Debord-societysml.gif', 'Dec_6_AWU.pdf', 'Dec_6_AWUII.pdf', 'December.gif', 'Deck_1.jpg', 'Deck_2.jpg', 'Deck_3.jpg', 'Deck_4.jpg', 'Deck_5.jpg']
In [ ]:
In [ ]:
In [46]:
# Let's loop through this list and download each image!
for filename in images:
    print('Downloading:', filename)

    filename = filename.replace(' ', '_')  # let's replace spaces again with _

    # and let's remove the file extension -- but only at the very end of the
    # name, so a '.gif' or '.png' in the middle of a name is left alone
    for extension in ('.jpg', '.gif', '.png', '.jpeg', '.JPG', '.JPEG'):
        if filename.endswith(extension):
            filename = filename[:-len(extension)]
            break

    # first we search for the full URL of the image
    # (the name is percent-encoded, so special characters cannot break the query)
    aifrom = urllib.parse.quote(filename, safe='')
    url = f'https://pzwiki.wdka.nl/mw-mediadesign/api.php?action=query&list=allimages&aifrom={ aifrom }&format=json'
    with urllib.request.urlopen(url) as connection:
        response = connection.read()

    # The answer gets its own name, so the notebook's earlier "data" response
    # is not overwritten by this loop.
    search_result = json.loads(response)
    hits = search_result['query']['allimages']
    if not hits:
        # an empty list would make hits[0] fail and stop all remaining downloads
        print('Nothing found, skipping:', filename)
        continue
    image = hits[0]

    # then we download the image
    image_url = image['url']
    image_filename = image['name']
    with urllib.request.urlopen(image_url) as connection:
        image_response = connection.read()

    # and we save it as a file ("with" closes the file, also when writing fails)
    with open(image_filename, 'wb') as out:
        out.write(image_response)
Downloading: Debo_009_05_01.jpg Downloading: Debord-societysml.gif Downloading: Dec_6_AWU.pdf Downloading: Dec_6_AWUII.pdf Downloading: December.gif Downloading: Deck_1.jpg Downloading: Deck_2.jpg Downloading: Deck_3.jpg Downloading: Deck_4.jpg Downloading: Deck_5.jpg
In [ ]:
# A bare name as the last line of a cell displays its current value.
filename
In [50]:
html = ''
for imagelink in images:
    print(imagelink)

    # let's use the "safe" pagenames for the filenames
    # by replacing the ' ' with '_'
    filename = imagelink.replace(' ', '_')

    # escape() keeps characters like & or " in a name from breaking the HTML
    src = escape(filename, quote=True)

    # PDFs go into an <iframe>, everything else into an <img>.
    # Only the ending of the name decides, written in lower or upper case.
    if filename.lower().endswith('.pdf'):
        a = f'<iframe src="{ src }"></iframe>'
    else:
        a = f'<img src="{ src }">'

    # one element per line
    html += a
    html += '\n'
Debo_009_05_01.jpg Debord-societysml.gif Dec_6_AWU.pdf Dec_6_AWUII.pdf December.gif Deck_1.jpg Deck_2.jpg Deck_3.jpg Deck_4.jpg Deck_5.jpg
In [51]:
# Print the HTML string that was built up in the loop.
print(html)
<img src="Debo_009_05_01.jpg"> <img src="Debord-societysml.gif"> <iframe src="Dec_6_AWU.pdf"></iframe> <iframe src="Dec_6_AWUII.pdf"></iframe> <img src="December.gif"> <img src="Deck_1.jpg"> <img src="Deck_2.jpg"> <img src="Deck_3.jpg"> <img src="Deck_4.jpg"> <img src="Deck_5.jpg">
In [52]:
# open() does not create missing folders, so make sure "image/" exists first.
os.makedirs('image', exist_ok=True)

# Save the HTML page; the "with" block closes the file, also when writing fails.
with open('image/imageimage.html', 'w') as output:
    output.write(html)
In [ ]: