You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
prototyping-times/download_all_image_from_wik...

4.9 KiB

In [ ]:
import json
import os
import sys
import urllib
import urllib.parse
import urllib.request

from IPython.display import JSON # iPython JSON renderer

Download all the images from one Wikipedia page :)

In [ ]:
# Title of the Wikipedia article whose images we want, written the way it
# appears at the end of the article URL:
# https://en.wikipedia.org/wiki/Sculpture
wikipediapage = "Sculpture"
In [ ]:
# Ask the Wikipedia API which images are used on the page.
# urlencode() escapes the page title, so titles containing spaces, '&',
# '?' or non-ASCII letters produce a valid URL too.
params = urllib.parse.urlencode({
    'action': 'parse',
    'prop': 'images',
    'page': wikipediapage,
    'format': 'json',
})
url = f'https://en.wikipedia.org/w/api.php?{params}'

# The `with` block closes the HTTP connection as soon as the body is read.
with urllib.request.urlopen(url) as connection:
    response = connection.read()
data = json.loads(response)

# The API reports problems (for example a page that does not exist) inside
# the JSON answer; stop here with a readable message instead of failing
# later with a KeyError on data['parse'].
if 'error' in data:
    api_error = data['error']
    raise ValueError(
        f"Wikipedia API error for page {wikipediapage!r}: "
        f"{api_error.get('info', api_error)}"
    )

JSON(data)
In [ ]:
# Keep only the list of image file names from the API answer;
# the cells below work with this "images" variable.
images = data["parse"]["images"]

print(images)
In [ ]:
 
In [ ]:
#ctrl + ? => remove all
In [ ]:
 
In [ ]:
 
In [ ]:
# Let's loop through this list and download each image!
#
# For every file name we ask the Wikimedia Commons API for the full URL of
# the image, download it and save it inside OUTPUT_DIR under that same name.
# Files that cannot be found or downloaded are reported and skipped.
OUTPUT_DIR = 'wikiimage'

# Wikimedia asks API clients to identify themselves with a descriptive
# User-Agent; requests using a generic library default may be rejected.
# TODO: add your own contact details to this string.
REQUEST_HEADERS = {
    'User-Agent': 'wikipedia-image-downloader/0.1 (educational Jupyter notebook)',
}

# Make sure the output folder exists, otherwise every save would fail.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for filename in images:
    print('Downloading:', filename)

    filename = filename.replace(' ', '_') # let's replace spaces again with _

    try:
        # first we search for the full URL of the image
        # (urlencode() escapes special and non-ASCII characters in the name;
        # we keep the file extension so that the search starts at exactly
        # this file and not at another one that shares the same prefix)
        query = urllib.parse.urlencode({
            'action': 'query',
            'list': 'allimages',
            'aifrom': filename,
            'ailimit': 1,
            'format': 'json',
        })
        search_request = urllib.request.Request(
            f'https://commons.wikimedia.org/w/api.php?{query}',
            headers=REQUEST_HEADERS,
        )
        with urllib.request.urlopen(search_request) as search_connection:
            search_result = json.loads(search_connection.read())

        # "aifrom" returns the first file at or after the given name, so we
        # check that the hit really is the file we asked for.
        matches = search_result['query']['allimages']
        if not matches or matches[0]['name'].replace(' ', '_') != filename:
            raise LookupError(f'{filename} was not found on Wikimedia Commons')
        image = matches[0]

        # then we download the image
        image_request = urllib.request.Request(image['url'], headers=REQUEST_HEADERS)
        with urllib.request.urlopen(image_request) as image_connection:
            image_response = image_connection.read()

        # and we save it as a file (closed automatically, even on errors)
        with open(os.path.join(OUTPUT_DIR, filename), 'wb') as out:
            out.write(image_response)

    except Exception as error:
        # "Exception" (not a bare except) keeps Ctrl+C / "interrupt kernel"
        # working while the loop runs.
        print('Skipped:', filename)
        print('With the error:', repr(error))
In [ ]:
 
In [ ]:
# Build a small HTML gallery: one tag per image, one tag per line.
# PDF files are embedded with an <iframe>, everything else with an <img>.
tags = []

for imagelink in images:
    print(imagelink)

    # let's use the "safe" pagenames for the filenames
    # by replacing the ' ' with '_'
    filename = imagelink.replace(' ', '_')

    # Percent-encode the name for use as a relative URL. This also keeps
    # characters such as '"', '&', '?' or '%' from breaking the attribute
    # or the link; plain names (letters, digits, '_', '-', '.') stay as is.
    src = urllib.parse.quote(filename)

    # Look at the extension only, ignoring upper/lower case.
    if filename.lower().endswith('.pdf'):
        a = f'<iframe src="{src}"></iframe>'
    else:
        a = f'<img src="{src}">'

    tags.append(a)

html = ''.join(f'{tag}\n' for tag in tags)
In [ ]:
# Preview the generated HTML as plain text.
print(html)
In [ ]:
# Save the gallery next to the downloaded images.
# The folder is created if it is missing, and the file is written as UTF-8
# so the result does not depend on the operating system's default encoding.
os.makedirs('wikiimage', exist_ok=True)
with open('wikiimage/imageimage.html', 'w', encoding='utf-8') as output:
    output.write(html)
In [ ]:
#git pull
#git status
#git add FILENAME
#git commit -m "write a msg"
#git push