You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

120 lines
3.4 KiB
Python

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import os
import io
import json
import random
import json
# Relative path to the geckodriver binary; passed to webdriver.Firefox in the
# __main__ block below.
DRIVER_PATH = '../geckodriver'
# Relative path to a chromedriver binary. Not referenced in the visible code —
# presumably kept for running with Chrome instead of Firefox (TODO confirm).
DRIVER_PATH2 = '../chromedriver'
def harvesting(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    """Collect image URLs from a Google image search (``tbm=isch``).

    The function loads the search page for ``query``, scrolls to the bottom
    several times, clicks each thumbnail and records the ``src`` of the last
    preview element found. Every accepted link is also written to
    ``pics.json`` immediately, so partial progress survives a crash.

    Args:
        query: Search terms substituted into the search URL.
        max_links_to_fetch: Stop once this many distinct URLs were collected.
        wd: A Selenium WebDriver instance used for all browser interaction.
        sleep_between_interactions: Seconds to wait after each scroll.

    Returns:
        The set of distinct image URLs collected. The set is returned on
        every path, including when the results run out before
        ``max_links_to_fetch`` is reached (previously ``None`` was returned
        in that case and the collected links were lost).

    Raises:
        FileNotFoundError: If ``pics.json`` does not exist when the function
            starts.
    """
    store_path = 'pics.json'
    # Use a context manager so the handle is closed (it used to be leaked).
    # NOTE(review): the content is assumed to be a JSON object or a list that
    # is indexable by picture number -- confirm against the real file.
    with open(store_path) as infile:
        stored_links = json.load(infile)
    scroll_cycles = 10

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    results_start = 0
    # Running index of stored links. Kept outside the loop so a later pass
    # does not overwrite the entries written by an earlier one.
    n_pic = 0

    while len(image_urls) < max_links_to_fetch:
        for _ in range(scroll_cycles):
            scroll_to_end(wd)
        time.sleep(.1)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        if number_results <= results_start:
            # Scrolling (and the previous "load more" click, if any) produced
            # no new thumbnails: stop instead of looping forever.
            break

        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail to get the actual image; a thumbnail that
            # cannot be clicked is skipped on purpose (best effort)
            try:
                img.click()
                time.sleep(.1)
            except Exception:
                continue

            # extract image url
            actual_images = wd.find_elements(By.CLASS_NAME, 'n3VNCb')
            if not actual_images:
                # No preview element appeared; indexing [-1] would raise.
                continue
            link_pic = actual_images[-1].get_attribute('src')
            if link_pic and 'http' in link_pic:
                image_urls.add(link_pic)
                # JSON object keys are always strings, so use a string key for
                # a dict; an int key would be dumped as a duplicate of the
                # already-loaded "<n>" key on a re-run.
                key = str(n_pic) if isinstance(stored_links, dict) else n_pic
                stored_links[key] = link_pic
                time.sleep(.2)
                with open(store_path, "w") as outfile:
                    json.dump(stored_links, outfile)
                n_pic += 1
                time.sleep(.2)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every thumbnail of this pass was handled without reaching the
            # target: ask the page for more results and try again.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(1)
            # find_elements returns an empty list instead of raising when the
            # button is missing. The lowercase class matches the selector used
            # for the click below (the original lookup used "Mye4qd").
            load_more_buttons = wd.find_elements(By.CLASS_NAME, "mye4qd")
            if load_more_buttons:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = number_results

    return image_urls
if __name__ == '__main__':
    # Script entry point: open Firefox, visit google.com, then harvest image
    # links for a fixed query.
    # NOTE(review): newer Selenium 4 releases replaced the `executable_path`
    # argument with a Service object -- verify against the installed version.
    wd = webdriver.Firefox(executable_path=DRIVER_PATH)
    try:
        query = "VAAST COLSON"
        wd.get('https://google.com')
        time.sleep(.3)
        # Presumably the cookie-consent button (TODO confirm). It is looked up
        # with find_elements so that a page without it does not abort the run.
        consent_buttons = wd.find_elements(By.ID, 'W0wltc')
        if consent_buttons:
            consent_buttons[0].click()
        time.sleep(.5)
        # The query is typed into the search box but not submitted;
        # harvesting() navigates to the search URL itself.
        search_box = wd.find_element(By.CLASS_NAME, 'gLFyf')
        search_box.send_keys(query)
        links = harvesting(query, 2000, wd)
        # harvesting() may yield nothing usable; report a count either way.
        print(f"Collected {len(links) if links else 0} image links")
    finally:
        # Always close the browser, even when a step above raised.
        wd.quit()