from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import NoSuchElementException
import time
import json

DRIVER_PATH = '../geckodriver'
DRIVER_PATH2 = '../chromedriver'  # alternative driver path if Chrome is used instead of Firefox


def harvesting(query: str, max_links_to_fetch: int, wd: webdriver.Remote, sleep_between_interactions: int = 1):
    # pics.json is expected to exist and contain a JSON object; harvested links are written back into it
    with open('pics.json') as dictionary:
        l = json.load(dictionary)
    cycles = 10

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0

    while image_count < max_links_to_fetch:
        # scroll several times so more thumbnails are loaded before extracting
        for _ in range(cycles):
            scroll_to_end(wd)
            time.sleep(.1)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        nPic = 0
        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail to get the actual image
            try:
                img.click()
                time.sleep(.1)
            except Exception:
                continue

            # extract the full-size image url from the preview pane
            actual_images = wd.find_elements(By.CLASS_NAME, 'n3VNCb')
            if not actual_images:
                continue
            actual_image = actual_images[-1]
            src = actual_image.get_attribute('src')
            if src and 'http' in src:
                image_urls.add(src)
                # JSON object keys are strings, so index with str(nPic)
                l[str(nPic)] = src
                time.sleep(.2)
                # persist the links collected so far
                with open("pics.json", "w") as outfile:
                    json.dump(l, outfile)
                nPic = nPic + 1
                time.sleep(.2)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # the for loop finished without reaching the target: ask Google for more results
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(1)
            try:
                load_more_button = wd.find_element(By.CLASS_NAME, "mye4qd")
            except NoSuchElementException:
                # no "load more" button left, stop here and return what was found
                break
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls


if __name__ == '__main__':
    wd = webdriver.Firefox(service=Service(DRIVER_PATH))
    query = "VAAST COLSON"

    # open Google once to dismiss the cookie consent dialog before searching
    wd.get('https://google.com')
    time.sleep(.3)
    wd.find_element(By.ID, 'W0wltc').click()
    time.sleep(.5)
    search_box = wd.find_element(By.CLASS_NAME, 'gLFyf')
    search_box.send_keys(query)

    links = harvesting(query, 2000, wd)
    wd.quit()
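

# --- hedged sketch: downloading the harvested links ---
# The script above only collects image URLs into pics.json; it never downloads
# the files (the original imported requests/os/io but left them unused). The
# helper below is a minimal sketch of that missing step under stated
# assumptions, not part of the original script: the name download_images, the
# target_folder parameter, and the .jpg extension are illustrative choices.
# It assumes pics.json holds a JSON object mapping indices to image URLs, as
# written by harvesting() above.
import os
import requests


def download_images(links_file: str = "pics.json", target_folder: str = "downloads"):
    os.makedirs(target_folder, exist_ok=True)
    with open(links_file) as f:
        links = json.load(f)
    for key, url in links.items():
        try:
            # fetch the raw image bytes; a timeout keeps one dead link from stalling the run
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except Exception as exc:
            print(f"Skipping {url}: {exc}")
            continue
        # the .jpg extension is a simplifying assumption; real content types may differ
        with open(os.path.join(target_folder, f"{key}.jpg"), "wb") as out:
            out.write(response.content)

# example usage (uncomment to run after harvesting has filled pics.json):
# download_images("pics.json", "vaast_colson_images")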