Selenium
Selenium 是一个基于 WebDriver 协议的浏览器自动化框架。通过使用 Remote WebDriver,可以控制远程运行的真实浏览器。
它提供以下能力:
- 真实 Chromium 浏览器控制
- 页面加载和 JavaScript 执行
- DOM 查询和基本事件模拟
- 支持连接远程指纹浏览器集群
Selenium 不模拟浏览器 HTTP 请求,它通过 WebDriver 协议驱动真实浏览器执行实际页面逻辑。
连接远程指纹浏览器
Section titled “连接远程指纹浏览器”
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# Credentials for the remote browser, taken from the PROXY_AUTH variable.
auth = os.getenv("PROXY_AUTH")

# WebDriver endpoint: the ChromeHttp variable, falling back to the default
# host when that variable is unset or empty.
chrome_http = os.getenv("ChromeHttp") or "chrome-http-inner.coreclaw.com"
browser_url = f'http://{auth}@{chrome_http}'

# Chrome command-line switches passed to the remote session.
chrome_options = webdriver.ChromeOptions()
for _switch in (
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--window-size=1920,1080',
):
    chrome_options.add_argument(_switch)

# Open a session on the remote browser.
driver = webdriver.Remote(command_executor=browser_url, options=chrome_options)


def _document_is_complete(browser):
    """Return True once the page reports document.readyState == "complete"."""
    return browser.execute_script("return document.readyState") == "complete"


# Navigate, wait up to 180 seconds for the document to finish loading, then
# capture the page source.
# NOTE(review): `url` is not defined anywhere in this snippet; it has to be
# supplied by the surrounding code.
driver.get(url)
WebDriverWait(driver, 180).until(_document_is_complete)
html = driver.page_source

# Imports used by the run() example that follows.
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from sdk import CoreSDK
def run():
    """Fetch one page through the remote browser and push it as a result row.

    Reads ``url`` from the SDK input parameters, opens a session on the
    remote WebDriver endpoint, loads the page, waits (up to 180 seconds) for
    ``document.readyState`` to become ``"complete"`` and pushes a single row
    with the keys ``url``, ``html`` and ``resp_status``.

    ``resp_status`` is a success marker set by this function, not the HTTP
    status returned by the site: it is ``"200"`` unless an exception is
    raised while connecting or loading, in which case it is ``"500"`` and
    ``html`` stays empty.

    The browser session is always closed before the row is pushed, so a
    failed or timed-out load does not leave a remote browser running.

    Raises:
        KeyError: if the input parameters contain no ``url`` entry
            (assuming ``get_input_json_dict`` returns a plain dict).
    """
    CoreSDK.Log.info("启动 Selenium 演示...")

    # Declare the output columns before any row is pushed.
    headers = [
        {"label": "url", "key": "url", "format": "text"},
        {"label": "html", "key": "html", "format": "text"},
        {"label": "resp_status", "key": "resp_status", "format": "text"},
    ]
    CoreSDK.Result.set_table_header(headers)

    # Input parameters supplied to the script.
    input_json = CoreSDK.Parameter.get_input_json_dict()
    url = input_json['url']

    # Remote endpoint: credentials from PROXY_AUTH, host from ChromeHttp with
    # a default when that variable is unset or empty.
    auth = os.environ.get("PROXY_AUTH")
    chrome_http = os.environ.get("ChromeHttp") or "chrome-http-inner.coreclaw.com"
    browser_url = f'http://{auth}@{chrome_http}'

    result = {"url": url, "html": "", "resp_status": "200"}

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # Stays None when webdriver.Remote() itself fails, so the cleanup below
    # knows there is no session to close.
    driver = None
    try:
        driver = webdriver.Remote(
            command_executor=browser_url,
            options=chrome_options,
        )
        driver.get(url)
        WebDriverWait(driver, 180).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        result["html"] = driver.page_source
    except Exception as e:
        CoreSDK.Log.error(f"失败: {e}")
        result['resp_status'] = "500"
    finally:
        if driver is not None:
            # Best-effort cleanup: a failure to close the session is logged
            # but must not prevent the result row from being pushed.
            try:
                driver.quit()
            except Exception as e:
                CoreSDK.Log.error(f"关闭浏览器失败: {e}")

    CoreSDK.Result.push_data(result)
if __name__ == "__main__":
    run()

DOM 操作
Section titled “DOM 操作”
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Locators used by the lookups below, as (strategy, value) pairs.
title_locator = (By.CSS_SELECTOR, '.product-title')
main_locator = (By.ID, 'main-content')
container_locator = (By.XPATH, '//div[@class="container"]')
item_locator = (By.CSS_SELECTOR, '.product-item')

# CSS selector and ID lookups (CSS selectors are the recommended form).
element = driver.find_element(*title_locator)
element = driver.find_element(*main_locator)

# XPath lookup.
element = driver.find_element(*container_locator)

# Explicit wait: block for up to 10 seconds until the element is present.
title_is_present = EC.presence_of_element_located(title_locator)
element = WebDriverWait(driver, 10).until(title_is_present)

# Read the element's text and its outer HTML.
text = element.text
html = element.get_attribute('outerHTML')

# Collect every element matching the selector.
items = driver.find_elements(*item_locator)
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

# Build one {'name', 'price'} record for each element in `items`.
products = []
for item in items:
    try:
        name = item.find_element(By.CSS_SELECTOR, '.name').text
        price = item.find_element(By.CSS_SELECTOR, '.price').text
    except (NoSuchElementException, StaleElementReferenceException):
        # Skip items that lack a field or were detached from the DOM while
        # iterating. Catching only these lookup errors (instead of a bare
        # `except:`) lets KeyboardInterrupt, SystemExit and genuine bugs
        # propagate.
        continue
    products.append({'name': name, 'price': price})
# 基于 JavaScript 提取(更高性能)
products = driver.execute_script(''' const items = document.querySelectorAll('.product-item'); return Array.from(items).map(item => ({ name: item.querySelector('.name')?.textContent.trim(), price: item.querySelector('.price')?.textContent.trim() }));''')

❌ 不要使用 sleep 等待:
time.sleep(5) # 不可靠

❌ 不要使用 requests 模拟浏览器:
requests.get(url) # 内容不完整,容易被检测