Selenium

This document explains why Selenium is chosen as the page-scraping framework in modern web data scraping scenarios, and describes the officially recommended usage pattern.

1. Positioning and Role of Selenium

Selenium is a browser automation framework based on the WebDriver protocol.

By using Remote WebDriver, it can control real browsers running remotely.

Its core capabilities include:

Control of real Chromium browsers
Page loading and JavaScript execution
DOM querying and basic event simulation
Support for connecting to remote fingerprint browser clusters

Selenium does not simulate browser HTTP requests.

Instead, it drives a real browser to execute actual page logic via the WebDriver protocol.

2. Officially Recommended Usage

1️⃣ Connecting to the Remote Fingerprint Browser

The platform provides a fingerprint browser service based on the HTTP WebDriver protocol,

which can be accessed in Remote mode.

try:
    Auth = os.environ.get("PROXY_AUTH")
    CoreSDK.Log.info(f"Current browser auth info: {Auth}")
except Exception as e:
    CoreSDK.Log.error(f"Failed to get browser auth info: {e}")
    Auth = None
    return

browser_url = f'http://{Auth}@chrome-http-inner.coreclaw.com'  # WebDriver endpoint of the fingerprint browser
rest_item = {"url": url, "html": "", "resp_status": "200"}

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()

# Common options
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')

CoreSDK.Log.info(f"Requested URL: {url}")
try:
    driver = webdriver.Remote(
        command_executor=browser_url,
        options=chrome_options
    )
except Exception as e:
    CoreSDK.Log.info(f"[Error] Failed to connect to fingerprint browser: {e}")
    rest_item['resp_status'] = "403"
    return

try:
    driver.get(url)
    WebDriverWait(driver, 3 * 60).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    html = driver.page_source
    rest_item["html"] = html
except Exception as e:
    CoreSDK.Log.info(f"[Error] Failed to get page HTML: {e}")
    rest_item['resp_status'] = "500"

CoreSDK.Result.push_data(rest_item)

3. Complete Platform Script Entry Example (Recommended)

import asyncio
import os

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

from sdk import CoreSDK

async def run():
    """Fetch one page through the remote fingerprint browser and push the result.

    Reads the required ``url`` entry from the platform input, opens it in a
    remote Chrome session, waits until ``document.readyState`` is
    ``"complete"`` and pushes one row with ``url``, ``html`` and
    ``resp_status``.

    ``resp_status`` is ``"200"`` on success, ``"403"`` when the remote browser
    session cannot be created, and ``"500"`` when loading the page or reading
    its HTML fails. If ``PROXY_AUTH`` is not set, an error is logged and
    nothing is pushed.
    """
    CoreSDK.Log.info("🚀 Init...")
    CoreSDK.Log.info("====================================================")
    CoreSDK.Log.info("🚀 CoreClaw Selenium Browser Scrape Demo")
    CoreSDK.Log.info("====================================================")

    # Column layout of the result table; keys match the pushed item below.
    headers = [
        {"label": "url", "key": "url", "format": "text"},
        {"label": "html", "key": "html", "format": "text"},
        {"label": "resp_status", "key": "resp_status", "format": "text"},
    ]
    CoreSDK.Result.set_table_header(headers)

    input_json_dict = CoreSDK.Parameter.get_input_json_dict()
    url = input_json_dict['url']

    # os.environ.get() returns None when the variable is unset; without this
    # check the endpoint would silently become "http://None@...".
    Auth = os.environ.get("PROXY_AUTH")
    if not Auth:
        CoreSDK.Log.error("Failed to get browser auth info: PROXY_AUTH is not set")
        return
    # Do not write the credential itself to the log.
    CoreSDK.Log.info("Current browser auth info: loaded from PROXY_AUTH")

    browser_url = f'http://{Auth}@chrome-http-inner.coreclaw.com'
    rest_item = {"url": url, "html": "", "resp_status": "200"}

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')

    try:
        driver = webdriver.Remote(
            command_executor=browser_url,
            options=chrome_options
        )
    except Exception as e:
        CoreSDK.Log.error(f"[Error] Failed to connect fingerprint browser: {e}")
        rest_item['resp_status'] = "403"
        # Push the failed item so the "403" status actually reaches the result table.
        CoreSDK.Result.push_data(rest_item)
        return

    try:
        driver.get(url)
        # Wait (up to 3 minutes) until the document reports it has finished loading.
        WebDriverWait(driver, 3 * 60).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        rest_item["html"] = driver.page_source
    except Exception as e:
        CoreSDK.Log.error(f"[Error] Failed to retrieve HTML: {e}")
        rest_item['resp_status'] = "500"
    finally:
        # Always end the remote session, otherwise the remote browser stays open.
        # A failure while closing must not prevent the result from being pushed.
        try:
            driver.quit()
        except Exception as e:
            CoreSDK.Log.info(f"Failed to close browser session: {e}")

    CoreSDK.Result.push_data(rest_item)

# Run the async entry point only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(run())

Dynamic Content and DOM Operations

Selecting a Single Element

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Each assignment below is an independent example of one locator strategy;
# `driver` is an already-connected WebDriver session.

# Method 1: CSS selectors (recommended)
element = driver.find_element(By.CSS_SELECTOR, '.product-title')
element = driver.find_element(By.ID, 'main-content')
element = driver.find_element(By.TAG_NAME, 'h1')

# Method 2: XPath
element = driver.find_element(By.XPATH, '//div[@class="container"]')
element = driver.find_element(By.XPATH, '//button[contains(text(), "Submit")]')

# Method 3: Other locators
element = driver.find_element(By.CLASS_NAME, 'product-item')
element = driver.find_element(By.NAME, 'username')
element = driver.find_element(By.LINK_TEXT, 'Buy Now')
element = driver.find_element(By.PARTIAL_LINK_TEXT, 'Buy')

# Wait for element to appear (recommended): polls for up to 10 seconds
# instead of failing immediately when the element is not yet in the DOM.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.product-title'))
)

# Check element existence and attributes.
# find_element raises NoSuchElementException when nothing matches, so the
# exception class must be imported (see the imports above) for this handler
# to work.
try:
    element = driver.find_element(By.CSS_SELECTOR, '.product-title')
    text = element.text
    html = element.get_attribute('outerHTML')
    class_name = element.get_attribute('class')
    href = element.get_attribute('href')
    is_displayed = element.is_displayed()
except NoSuchElementException:
    print("Element not found")

Batch Element Processing

# Collect every product card that matches the selector
product_items = driver.find_elements(By.CSS_SELECTOR, '.product-item')
print(f"Found {len(product_items)} products")


def _read_or_blank(card, selector, read):
    """Return read(child) for the child of `card` matching `selector`, or '' if none matches."""
    try:
        return read(card.find_element(By.CSS_SELECTOR, selector))
    except NoSuchElementException:
        return ''


# Walk the cards one by one; a missing sub-element yields an empty string
products_data = []
for card in product_items:
    record = {
        'name': _read_or_blank(card, '.name', lambda el: el.text),
        'price': _read_or_blank(card, '.price', lambda el: el.text),
        'link': _read_or_blank(card, '.link', lambda el: el.get_attribute('href')),
    }
    products_data.append(record)

# Same extraction done inside the page by a single script call (higher performance)
bulk_extract_js = '''
    const items = document.querySelectorAll('.product-item');
    return Array.from(items).map(item => {
        const nameElem = item.querySelector('.name');
        const priceElem = item.querySelector('.price');
        const linkElem = item.querySelector('.link');
        return {
            name: nameElem ? nameElem.textContent.trim() : '',
            price: priceElem ? priceElem.textContent.trim() : '',
            link: linkElem ? linkElem.href : ''
        };
    });
'''
products_data = driver.execute_script(bulk_extract_js)

Characteristics:

Operates on a real browser DOM
Retrieves fully JavaScript-rendered content
Fully consistent with frontend rendering logic

Officially Not Recommended Practices (Anti-Patterns)

❌ Using sleep to wait for page loading

time.sleep(5)  # anti-pattern: fixed 5-second pause, independent of the page's actual load state

Issues:

Cannot guarantee JavaScript execution is complete
Fails on slow pages
Wastes time on fast pages

❌ Using requests to simulate browser behavior

requests.get(url)  # anti-pattern: issues a bare HTTP request instead of driving a real browser