Selenium

在现代 Web 数据采集场景下，为什么选择 Selenium 作为页面采集框架，以及官方推荐的使用方式。

一、Selenium 的定位与作用

Selenium 是一个基于 WebDriver 协议的浏览器自动化框架，

通过 Remote WebDriver 方式，可实现对远程真实浏览器的控制。

其主要能力包括：

真实 Chromium 浏览器控制
页面加载与 JavaScript 执行
DOM 查询与基础事件模拟
支持对接远程指纹浏览器集群

Selenium 并非模拟浏览器请求，

而是通过 WebDriver 协议 驱动真实浏览器执行网页逻辑。

二、官方推荐写法

1️⃣ 连接远程指纹浏览器

平台已提供 基于 HTTP WebDriver 协议的指纹浏览器服务，

通过 Remote 方式接入。

try:
    Auth = os.environ.get("PROXY_AUTH")
    CoreSDK.Log.info(f"当前获取的浏览器认证信息: {Auth}")
except Exception as e:
    # 捕获其他未知异常
    CoreSDK.Log.error(f"当前获取浏览器认证信息失败: {e}")
    Auth = None
    return

# 指纹浏览器的 WebDriver 连接地址（从环境变量读取，支持灵活部署）
chrome_http = os.environ.get("ChromeHttp") or "chrome-http-inner.coreclaw.com"
CoreSDK.Log.info(f"Chrome HTTP 地址: {chrome_http}")

browser_url = f'http://{Auth}@{chrome_http}'
rest_item = {"url": url, "html": "", "resp_status": "200"}

# 设置 Chrome 选项
chrome_options = webdriver.ChromeOptions()

# 添加一些常用选项
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')

CoreSDK.Log.info(f"请求的url：{url}")
try:
    driver = webdriver.Remote(
        command_executor=browser_url,
        options=chrome_options
    )
except Exception as e:
    CoreSDK.Log.info(f"[错误] 指纹浏览器连接失败: {e}")
    rest_item['resp_status'] = "403"
    return

2️⃣ 页面访问与内容获取

try:
    driver.get(url)
    WebDriverWait(driver,3 *60).until(lambda d: d.execute_script("return document.readyState") == "complete")
    html = driver.page_source
    rest_item["html"] = html
except Exceptionas e:
    CoreSDK.Log.info(f"[错误] 获取浏览器html失败: {e}")
    rest_item['resp_status'] ="500"

CoreSDK.Result.push_data(rest_item)

3️⃣ 连接 Firefox 指纹浏览器

Selenium 支持通过 Remote WebDriver 连接 Firefox 浏览器：

try:
    Auth = os.environ.get("PROXY_AUTH")
    CoreSDK.Log.info(f"当前获取的浏览器认证信息: {Auth}")
except Exception as e:
    # 捕获其他未知异常
    CoreSDK.Log.error(f"当前获取浏览器认证信息失败: {e}")
    Auth = None
    return

# Firefox WebDriver 连接地址（从环境变量读取，支持灵活部署）
firefox_http = os.environ.get("FirefoxHttp") or "firefox-http-inner.coreclaw.com"
CoreSDK.Log.info(f"Firefox HTTP 地址: {firefox_http}")

browser_url = f'http://{Auth}@{firefox_http}'
rest_item = {"url": url, "html": "", "resp_status": "200"}

# 设置 Firefox 选项
firefox_options = webdriver.FirefoxOptions()

# 添加一些常用选项
firefox_options.add_argument('--no-sandbox')
firefox_options.add_argument('--disable-dev-shm-usage')
firefox_options.add_argument('--width=1920')
firefox_options.add_argument('--height=1080')

CoreSDK.Log.info(f"请求的url：{url}")
try:
    driver = webdriver.Remote(
        command_executor=browser_url,
        options=firefox_options
    )
except Exception as e:
    CoreSDK.Log.info(f"[错误] Firefox 指纹浏览器连接失败: {e}")
    rest_item['resp_status'] = "403"
    return

三、完整平台脚本入口示例（推荐直接使用）

import asyncio
import os

from seleniumimport webdriver
from selenium.webdriver.support.uiimport WebDriverWait

from sdkimport CoreSDK

asyncdefrun():
    CoreSDK.Log.info("🚀 Init...")
    CoreSDK.Log.info("====================================================")
    CoreSDK.Log.info("🚀 CoreClaw Selenium Browser Scrape Demo")
    CoreSDK.Log.info("====================================================")

    headers = [
        {"label":"url","key":"url","format":"text"},
        {"label":"html","key":"html","format":"text"},
        {"label":"resp_status","key":"resp_status","format":"text"},
    ]
    CoreSDK.Result.set_table_header(headers)

    input_json_dict = CoreSDK.Parameter.get_input_json_dict()
    url = input_json_dict['url']

    Auth = os.environ.get("PROXY_AUTH")
    CoreSDK.Log.info(f"当前获取的浏览器认证信息: {Auth}")

    # 指纹浏览器的 WebDriver 连接地址（从环境变量读取，支持灵活部署）
    chrome_http = os.environ.get("ChromeHttp") or "chrome-http-inner.coreclaw.com"
    CoreSDK.Log.info(f"Chrome HTTP 地址: {chrome_http}")

    browser_url = f'http://{Auth}@{chrome_http}'
    rest_item = {"url": url, "html": "", "resp_status": "200"}

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')

try:
    driver = webdriver.Remote(
        command_executor=browser_url,
        options=chrome_options
    )
except Exceptionas e:
    CoreSDK.Log.info(f"[错误] 指纹浏览器连接失败: {e}")
    rest_item['resp_status'] ="403"
return

try:
    driver.get(url)
    WebDriverWait(driver,3 *60).until(lambda d: d.execute_script("return document.readyState") =="complete")
    rest_item["html"] = driver.page_source
except Exceptionas e:
    CoreSDK.Log.info(f"[错误] 获取浏览器html失败: {e}")
    rest_item['resp_status'] ="500"

    CoreSDK.Result.push_data(rest_item)

if __name__ =="__main__":
    asyncio.run(run())

四、动态内容与 DOM 操作

获取单个元素

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# 方法1：CSS选择器（推荐）
element = driver.find_element(By.CSS_SELECTOR, '.product-title')
element = driver.find_element(By.ID, 'main-content')  # ID选择器
element = driver.find_element(By.TAG_NAME, 'h1')

# 方法2：XPath
element = driver.find_element(By.XPATH, '//div[@class="container"]')
element = driver.find_element(By.XPATH, '//button[contains(text(), "提交")]')

# 方法3：其他定位方式
element = driver.find_element(By.CLASS_NAME, 'product-item')
element = driver.find_element(By.NAME, 'username')
element = driver.find_element(By.LINK_TEXT, '立即购买')
element = driver.find_element(By.PARTIAL_LINK_TEXT, '购买')

# 等待元素出现（推荐）
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.product-title'))
)

# 检查元素是否存在
try:
    element = driver.find_element(By.CSS_SELECTOR, '.product-title')
    text = element.text
    html = element.get_attribute('outerHTML')
    class_name = element.get_attribute('class')
    href = element.get_attribute('href')
    is_displayed = element.is_displayed()
except NoSuchElementException:
    print("元素不存在")

批量元素处理

# 获取所有匹配元素
product_items = driver.find_elements(By.CSS_SELECTOR, '.product-item')
print(f"找到 {len(product_items)} 个商品")

# 遍历处理
products_data = []
for item in product_items:
    try:
        name_elem = item.find_element(By.CSS_SELECTOR, '.name')
        name = name_elem.text
    except NoSuchElementException:
        name = ''

    try:
        price_elem = item.find_element(By.CSS_SELECTOR, '.price')
        price = price_elem.text
    except NoSuchElementException:
        price = ''

    try:
        link_elem = item.find_element(By.CSS_SELECTOR, '.link')
        link = link_elem.get_attribute('href')
    except NoSuchElementException:
        link = ''

    products_data.append({
        'name': name,
        'price': price,
        'link': link
    })

# 使用列表推导式（带异常处理）
names = []
for item in product_items:
    try:
        names.append(item.find_element(By.CSS_SELECTOR, '.name').text)
    except NoSuchElementException:
        continue

# 使用filter和map（更函数式）
def get_name(item):
    try:
        return item.find_element(By.CSS_SELECTOR, '.name').text
    except NoSuchElementException:
        return None

names = list(filter(None, map(get_name, product_items)))

# 使用JavaScript批量获取（性能更高）
products_data = driver.execute_script('''
    const items = document.querySelectorAll('.product-item');
    return Array.from(items).map(item => {
        const nameElem = item.querySelector('.name');
        const priceElem = item.querySelector('.price');
        const linkElem = item.querySelector('.link');
        return {
            name: nameElem ? nameElem.textContent.trim() : '',
            price: priceElem ? priceElem.textContent.trim() : '',
            link: linkElem ? linkElem.href : ''
        };
    });
''')

特点说明：

操作真实浏览器 DOM
可获取 JS 渲染后的内容
与前端展示逻辑一致

五、官方不推荐写法（反例）

❌ 使用 sleep 等待页面加载

time.sleep(5)

问题：

无法保证 JS 执行完成
页面慢时失败
页面快时浪费时间

❌ 使用 requests 模拟浏览器页面

requests.get(url)

问题：

页面内容不完整
容易触发反爬机制
成功率不可控