Skip to content

Selenium

This page explains why Selenium is chosen as the page-scraping framework in modern web data scraping scenarios, and describes the officially recommended usage pattern.

Selenium is a browser automation framework based on the WebDriver protocol.

By using Remote WebDriver, it can control real browsers running remotely.

Its core capabilities include:

  • Control of real Chromium browsers
  • Page loading and JavaScript execution
  • DOM querying and basic event simulation
  • Support for connecting to remote fingerprint browser clusters

Selenium does not simulate browser HTTP requests.

Instead, it drives a real browser to execute actual page logic via the WebDriver protocol.


1️⃣ Connecting to the Remote Fingerprint Browser
Section titled “1️⃣ Connecting to the Remote Fingerprint Browser”

The platform provides a fingerprint browser service based on the HTTP WebDriver protocol, which can be accessed in Remote mode.

try:
Auth = os.environ.get("PROXY_AUTH")
CoreSDK.Log.info(f"Current browser auth info: {Auth}")
except Exception as e:
CoreSDK.Log.error(f"Failed to get browser auth info: {e}")
Auth = None
return
browser_url = f'http://{Auth}@chrome-http-inner.coreclaw.com' # WebDriver endpoint of the fingerprint browser
rest_item = {"url": url, "html": "", "resp_status": "200"}
# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
# Common options
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')
CoreSDK.Log.info(f"Requested URL: {url}")
try:
driver = webdriver.Remote(
command_executor=browser_url,
options=chrome_options
)
except Exception as e:
CoreSDK.Log.info(f"[Error] Failed to connect to fingerprint browser: {e}")
rest_item['resp_status'] = "403"
return

2️⃣ Page Navigation and Content Retrieval
Section titled “2️⃣ Page Navigation and Content Retrieval”
# NOTE: fragment -- `driver`, `url` and `rest_item` come from the connection
# step shown in the previous section.
try:
    driver.get(url)
    # Wait (up to 3 minutes) until the browser reports the document as fully loaded.
    WebDriverWait(driver, 3 * 60).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    html = driver.page_source
    rest_item["html"] = html
except Exception as e:
    CoreSDK.Log.error(f"[Error] Failed to get page HTML: {e}")
    rest_item['resp_status'] = "500"
finally:
    # Always end the remote session, whether or not the page loaded.
    driver.quit()
CoreSDK.Result.push_data(rest_item)

3️⃣ Complete Platform Script Entry Example (Recommended)

Section titled “3️⃣ Complete Platform Script Entry Example (Recommended)”
import asyncio
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from sdk import CoreSDK
async def run():
    """Scrape a single page through the remote fingerprint browser.

    Reads the target address from the ``url`` key of the platform input,
    opens a Remote WebDriver session against the fingerprint browser
    endpoint, waits until ``document.readyState`` is ``"complete"`` and
    pushes one result row with the keys ``url``, ``html`` and
    ``resp_status``.

    ``resp_status`` is ``"200"`` on success, ``"403"`` when the browser
    session cannot be opened (including a missing ``PROXY_AUTH``) and
    ``"500"`` when loading the page fails.  A row is pushed in every one
    of these cases.

    NOTE: the Selenium calls below are synchronous and are not awaited, so
    they block the event loop while they run.
    """
    CoreSDK.Log.info("🚀 Init...")
    CoreSDK.Log.info("====================================================")
    CoreSDK.Log.info("🚀 CoreClaw Selenium Browser Scrape Demo")
    CoreSDK.Log.info("====================================================")

    headers = [
        {"label": "url", "key": "url", "format": "text"},
        {"label": "html", "key": "html", "format": "text"},
        {"label": "resp_status", "key": "resp_status", "format": "text"},
    ]
    CoreSDK.Result.set_table_header(headers)

    input_json_dict = CoreSDK.Parameter.get_input_json_dict()
    url = input_json_dict['url']
    rest_item = {"url": url, "html": "", "resp_status": "200"}

    auth = os.environ.get("PROXY_AUTH")
    if not auth:
        # Without credentials the endpoint URL would become "http://None@...".
        CoreSDK.Log.error("[Error] PROXY_AUTH is not set; cannot connect fingerprint browser")
        rest_item['resp_status'] = "403"
        CoreSDK.Result.push_data(rest_item)
        return
    # Log only that the credential is present -- never the credential itself.
    CoreSDK.Log.info("Browser auth info loaded from PROXY_AUTH")
    browser_url = f'http://{auth}@chrome-http-inner.coreclaw.com'

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')

    try:
        driver = webdriver.Remote(
            command_executor=browser_url,
            options=chrome_options,
        )
    except Exception as e:
        CoreSDK.Log.error(f"[Error] Failed to connect fingerprint browser: {e}")
        rest_item['resp_status'] = "403"
        # Push the failed row before leaving; otherwise the "403" status is lost.
        CoreSDK.Result.push_data(rest_item)
        return

    try:
        driver.get(url)
        # Wait (up to 3 minutes) until the browser reports the document as fully loaded.
        WebDriverWait(driver, 3 * 60).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        rest_item["html"] = driver.page_source
    except Exception as e:
        CoreSDK.Log.error(f"[Error] Failed to retrieve HTML: {e}")
        rest_item['resp_status'] = "500"
    finally:
        # Always end the remote session, whether or not the page loaded.
        driver.quit()

    CoreSDK.Result.push_data(rest_item)
if __name__ == "__main__":
    # Script entry point: run the async run() coroutine to completion.
    asyncio.run(run())

4️⃣ Dynamic Content and DOM Operations
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# NOTE: `driver` is an already-connected WebDriver session.

# Method 1: CSS selectors (recommended)
element = driver.find_element(By.CSS_SELECTOR, '.product-title')
element = driver.find_element(By.ID, 'main-content')
element = driver.find_element(By.TAG_NAME, 'h1')

# Method 2: XPath
element = driver.find_element(By.XPATH, '//div[@class="container"]')
element = driver.find_element(By.XPATH, '//button[contains(text(), "Submit")]')

# Method 3: Other locators
element = driver.find_element(By.CLASS_NAME, 'product-item')
element = driver.find_element(By.NAME, 'username')
element = driver.find_element(By.LINK_TEXT, 'Buy Now')
element = driver.find_element(By.PARTIAL_LINK_TEXT, 'Buy')

# Wait for element to appear (recommended)
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.product-title'))
)

# Check element existence and attributes
try:
    element = driver.find_element(By.CSS_SELECTOR, '.product-title')
    text = element.text
    html = element.get_attribute('outerHTML')
    class_name = element.get_attribute('class')
    href = element.get_attribute('href')
    is_displayed = element.is_displayed()
except NoSuchElementException:
    print("Element not found")

# Get all matching elements
product_items = driver.find_elements(By.CSS_SELECTOR, '.product-item')
print(f"Found {len(product_items)} products")

# Iterate and extract; a missing child element yields an empty string
products_data = []
for item in product_items:
    try:
        name = item.find_element(By.CSS_SELECTOR, '.name').text
    except NoSuchElementException:
        name = ''
    try:
        price = item.find_element(By.CSS_SELECTOR, '.price').text
    except NoSuchElementException:
        price = ''
    try:
        link = item.find_element(By.CSS_SELECTOR, '.link').get_attribute('href')
    except NoSuchElementException:
        link = ''
    products_data.append({
        'name': name,
        'price': price,
        'link': link,
    })

# JavaScript-based bulk extraction (higher performance):
# one script call builds the whole list inside the browser.
products_data = driver.execute_script('''
    const items = document.querySelectorAll('.product-item');
    return Array.from(items).map(item => {
        const nameElem = item.querySelector('.name');
        const priceElem = item.querySelector('.price');
        const linkElem = item.querySelector('.link');
        return {
            name: nameElem ? nameElem.textContent.trim() : '',
            price: priceElem ? priceElem.textContent.trim() : '',
            link: linkElem ? linkElem.href : ''
        };
    });
''')

Characteristics:

  • Operates on a real browser DOM
  • Retrieves fully JavaScript-rendered content
  • Fully consistent with frontend rendering logic

5️⃣ Officially Not Recommended Practices (Anti-Patterns)
❌ Using a fixed time.sleep() to wait for the page to load
# Anti-pattern: a fixed delay is unrelated to when the page actually finishes loading.
time.sleep(5)

Issues:

  • Cannot guarantee JavaScript execution is complete
  • Fails on slow pages
  • Wastes time on fast pages
❌ Using requests to simulate browser behavior
Section titled “❌ Using requests to simulate browser behavior”
# Anti-pattern: fetches the URL with a plain HTTP client; no browser is involved.
requests.get(url)

Issues:

  • Incomplete page content
  • Easily detected by anti-bot systems
  • Unstable success rate