You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
4.9 KiB
4.9 KiB
In [ ]:
import json
import os
import sys
import urllib
import urllib.parse
import urllib.request
from html import escape  # imported by name: `html` is used as a variable below

from IPython.display import JSON  # iPython JSON renderer
Download all the images from one Wikipedia page :)
In [ ]:
# Title of the Wikipedia page to work with
# (https://en.wikipedia.org/wiki/Sculpture).
wikipediapage = "Sculpture"
In [ ]:
# Ask the English Wikipedia API for the images used on `wikipediapage` and
# decode the JSON answer into `data`.
# The title is percent-encoded so that titles containing spaces or reserved
# characters (e.g. '&') still produce a valid query string.
page_param = urllib.parse.quote(wikipediapage, safe='')
url = f'https://en.wikipedia.org/w/api.php?action=parse&prop=images&page={page_param}&format=json'
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as api_response:
    response = api_response.read()
data = json.loads(response)
# Last expression of the cell: renders the response with the IPython JSON viewer.
JSON(data)
In [ ]:
# The filename list sits under "parse" -> "images" in the API response;
# keep it in `images` and show it.
parsed_page = data['parse']
images = parsed_page['images']
print(images)
In [ ]:
In [ ]:
#ctrl + ? => remove all
In [ ]:
In [ ]:
In [ ]:
# Download every file listed in `images` into the local "wikiimage/" folder.
# For each name we first ask Wikimedia Commons for the file's full URL, then
# fetch the bytes and save them under the name Commons reports.
# A failure on one image is reported and the loop moves on to the next one.
image_extensions = ('.jpg', '.gif', '.png', '.jpeg', '.JPG', '.JPEG')

# Without the target folder every open() below would fail and each image
# would be reported as skipped.
os.makedirs('wikiimage', exist_ok=True)

for filename in images:
    try:
        print('Downloading:', filename)
        # let's replace spaces again with _
        search_name = filename.replace(' ', '_')
        # and let's remove the file extension (only when it is the suffix)
        for extension in image_extensions:
            if search_name.endswith(extension):
                search_name = search_name[:-len(extension)]
                break

        # First we search for the full URL of the image. The name is
        # percent-encoded so characters such as '&' or non-ASCII letters
        # cannot break the query string.
        # NOTE(review): `aifrom` starts the listing at this name, so the
        # first hit may be a different file when there is no exact match.
        # Confirm against the MediaWiki API documentation.
        lookup_url = (
            'https://commons.wikimedia.org/w/api.php'
            '?action=query&list=allimages'
            f'&aifrom={urllib.parse.quote(search_name, safe="")}&format=json'
        )
        with urllib.request.urlopen(lookup_url) as lookup_response:
            # Kept in its own variable so the page listing stored in `data`
            # is not overwritten by this loop.
            lookup = json.loads(lookup_response.read())
        image = lookup['query']['allimages'][0]

        # Then we download the image ...
        image_url = image['url']
        image_filename = image['name']
        with urllib.request.urlopen(image_url) as image_response:
            image_bytes = image_response.read()

        # ... and we save it as a file.
        with open("wikiimage/" + image_filename, 'wb') as out:
            out.write(image_bytes)
    except Exception as error:
        # `Exception` rather than a bare `except:` so that interrupting the
        # notebook (KeyboardInterrupt) still stops the loop.
        # Report the name we were working on; `image` may not exist yet, or
        # may still hold the result of the previous iteration.
        print('Skipped:', filename)
        print('With the error:', repr(error))
In [ ]:
In [ ]:
# Build one HTML element per entry in `images`: names ending in ".pdf" go
# into an <iframe>, everything else into an <img>. Each element is followed
# by a newline and the result is stored in `html`.
html_parts = []
for imagelink in images:
    print(imagelink)
    # let's use the "safe" pagenames for the filenames
    # by replacing the ' ' with '_'
    filename = imagelink.replace(' ', '_')
    # Escape the name so a quote, '&' or '<' in it cannot break the attribute.
    src = escape(filename, quote=True)
    # Test the extension at the end of the name, whatever its case, instead
    # of looking for ".pdf" anywhere inside it.
    if filename.lower().endswith('.pdf'):
        a = f'<iframe src="{src}"></iframe>'
    else:
        a = f'<img src="{src}">'
    html_parts.append(a + '\n')
# Join once at the end instead of growing the string inside the loop.
html = ''.join(html_parts)
In [ ]:
# Print the generated markup so it can be checked by eye.
print(html)
In [ ]:
# Save the generated markup as wikiimage/imageimage.html.
# The context manager closes the file even if the write fails, and the
# explicit UTF-8 encoding keeps non-ASCII filenames from depending on the
# platform's default encoding.
with open('wikiimage/imageimage.html', 'w', encoding='utf-8') as output:
    output.write(html)
In [ ]:
#git pull #git status #git add FILENAME #git commit -m "write a msg" #git push