import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
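
# Harvest full-size image URLs from a Google Images search and mirror them into
# ./pics.json. The file is expected to exist already and contain a JSON object
# (e.g. {}); each link found is stored under an incrementing numeric key as soon
# as it is extracted.
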
def harvesting(query: str, max_links_to_fetch: int, wd: webdriver.Chrome, sleep_between_interactions: int = 1):
    """Collect up to `max_links_to_fetch` full-size image URLs for `query` from
    Google Images, saving each link into ./pics.json as it is found."""
    # pics.json is loaded first so previously harvested links can be extended
    with open('./pics.json') as dictionary:
        l = json.load(dictionary)
    cycles = 10

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    nPic = 0  # next free index in the pics.json dictionary

    while image_count < max_links_to_fetch:
        # scroll several times so more thumbnails get lazy-loaded
        for _ in range(cycles):
            scroll_to_end(wd)
            time.sleep(.1)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail to open the full-size preview
            try:
                img.click()
                time.sleep(.1)
            except Exception:
                continue

            # extract the image url from the preview pane
            actual_images = wd.find_elements(By.CLASS_NAME, 'n3VNCb')
            try:
                actual_image = actual_images[-1]
            except IndexError:
                continue

            if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                linkPic = actual_image.get_attribute('src')
                image_urls.add(linkPic)
                l[nPic] = linkPic
                time.sleep(.5)
                # persist the dictionary after every new link
                with open("./pics.json", "w") as outfile:
                    json.dump(l, outfile)
                nPic = nPic + 1
                time.sleep(.5)

            image_count = len(image_urls)
            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # the whole batch was processed without reaching the target
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(1)
            load_more_button = wd.find_elements(By.CLASS_NAME, "mye4qd")
            if load_more_button:
                # click "Show more results" and keep going
                wd.execute_script("document.querySelector('.mye4qd').click();")
            else:
                # nothing more to load: stop instead of scrolling forever
                break

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    # mark the end of the harvested links in pics.json
    with open("./pics.json", "w") as outfile:
        lastSentence = "The rest of the results might not be what you're looking for."
        l[nPic] = lastSentence
        json.dump(l, outfile)
    return image_urls
# start Chrome via webdriver-manager and run the harvest
wd = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
query = "VAAST COLSON"

wd.get('https://google.com')
time.sleep(1)
# dismiss Google's cookie-consent dialog if it appears
try:
    wd.find_element(By.ID, 'W0wltc').click()
except Exception:
    pass
time.sleep(.5)
# typing the query is optional; harvesting() navigates straight to the image-search URL
search_box = wd.find_element(By.CLASS_NAME, 'gLFyf')
search_box.send_keys(query)

links = harvesting(query, 300, wd)
wd.quit()
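
# Note: the selectors used above (Q4LuWd thumbnails, n3VNCb full-size images,
# mye4qd "Show more results", W0wltc consent button, gLFyf search box) are
# Google-generated class names/ids and tend to change over time; if the script
# stops finding elements, these are the first thing to update.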