# Crawl a start page with headless Edge and save every image found on it and on each page it links to.
- from selenium import webdriver
- from selenium.webdriver.edge.service import Service as EdgeService
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.edge.options import Options
- import requests
- import os
- import time
- import re
# Destination directory for downloaded images.
save_dir = r"C:\Users\Administrator\Pictures\Captures"
# Create the directory before launching the browser so a filesystem error
# cannot leave an orphaned Edge process behind. exist_ok=True replaces the
# separate existence check, which was open to a check-then-create race.
os.makedirs(save_dir, exist_ok=True)

# Configure Edge.
edge_options = Options()
# Run without a visible browser window; comment this line out to watch the browser.
edge_options.add_argument('--headless')
edge_options.add_argument('--disable-gpu')

# Path to the Edge WebDriver binary; replace it with the location of your own msedgedriver.
webdriver_path = "D:/Drivers/msedgedriver.exe"
service = EdgeService(executable_path=webdriver_path)
driver = webdriver.Edge(service=service, options=edge_options)
wait = WebDriverWait(driver, 10)  # Maximum time to wait for elements to load.
def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in file names swapped for underscores."""
    # Characters rejected by common file systems: < > : " / \ | ? *
    forbidden = '<>:"/\\|?*'
    return ''.join('_' if ch in forbidden else ch for ch in filename)
def download_images_from_page(page_url, index_offset=0, timeout=10):
    """Download every ``<img>`` found on the page currently loaded in ``driver``.

    The images are read from the module-level ``driver`` (the page must
    already be loaded) and written into the module-level ``save_dir``.
    Failures are reported with ``print`` and never abort the remaining images.

    Args:
        page_url: URL of the page; used only to build the saved file names.
        index_offset: Value added to each image's position on the page when
            numbering the file names and log messages.
        timeout: Seconds to wait for each image request. ``requests`` applies
            no timeout by default, so without one a single stalled server
            would block the whole crawl indefinitely.
    """
    # The page part of the file name is identical for every image, so build it once.
    page_name = sanitize_filename(page_url.replace('https://', '').replace('http://', ''))
    images = driver.find_elements(By.TAG_NAME, 'img')
    for img_index, img in enumerate(images):
        image_number = img_index + index_offset
        # Broad catch is deliberate: one bad image must not stop the rest.
        try:
            img_url = img.get_attribute('src')
            if not img_url:
                print(f"跳过无src属性的图片 {image_number}")
                continue
            response = requests.get(img_url, timeout=timeout)
            if response.status_code != 200:
                print(f"无法下载图片 {image_number}, 状态码: {response.status_code}")
                continue
            # Page name plus image number keeps file names unique.
            # NOTE: the .png suffix is applied regardless of the actual image format.
            save_filename = f"{page_name}_image_{image_number}.png"
            save_path = os.path.join(save_dir, save_filename)
            with open(save_path, 'wb') as file:
                file.write(response.content)
            print(f"已保存: {save_path}")
        except Exception as e:
            print(f"处理图片 {image_number} 时出错: {e}")
# Crawl the start page, then every page it links to, saving all images found.
try:
    driver.get('https://tsl.uad360.com/')
    time.sleep(5)  # Give the page time to load.
    current_url = driver.current_url
    # Download all images on the start page.
    download_images_from_page(current_url)
    # Collect every link on the start page.
    links = driver.find_elements(By.TAG_NAME, 'a')
    total_links = len(links)
    # Remember the start tab so we can always return to it; the link
    # elements above are only valid while this tab is the active one.
    main_window = driver.current_window_handle
    for link_index, link in enumerate(links):
        try:
            link_url = link.get_attribute('href')
            if not link_url:
                print(f"跳过无效链接 {link_index}")
                continue
            # Open a new tab and switch to it.
            driver.switch_to.new_window('tab')
            try:
                # Visit the linked page.
                driver.get(link_url)
                time.sleep(5)  # Give the page time to load.
                # Download all images on the linked page.
                download_images_from_page(link_url, index_offset=(link_index + 1) * total_links)
            finally:
                # Always close the extra tab and return to the start tab, even
                # when loading or scraping failed; otherwise the next
                # iteration would run against the wrong window and the tab
                # would leak.
                driver.close()
                driver.switch_to.window(main_window)
        except Exception as e:
            # Best effort: report the failure and continue with the next link.
            print(f"处理链接 {link_index} 时出错: {e}")
finally:
    driver.quit()
|