Examples¶

Usage examples for the ScrapeDoClient and AsyncScrapeDoClient

Authentication

Every example assumes SCRAPE_DO_API_KEY is set in the environment so that the clients can pick it up automatically.
Alternatively, pass api_token=... to the constructor to wire it in explicitly.

Making A GET Request¶

GET Request Example

Sync / Async

from scrape_do import ScrapeDoClient

# Page to fetch through the client
product_url = "https://shop.example.com/products/headphones-42"

# The client is used as a context manager for the duration of the request
with ScrapeDoClient() as client:
    response = client.get(
        product_url,
        super=True,
        geo_code="us",
        render=True,
        wait_until="domcontentloaded",
    )

# Raise on a failed request, then print two fields of the response
response.raise_for_status()
print(response.target_status_code)
print(response.remaining_credits)

import asyncio
from scrape_do import AsyncScrapeDoClient

async def fetch(url: str) -> str:
    """
    Fetch ``url`` through an AsyncScrapeDoClient and return the response text

    ``raise_for_status()`` is called on the response before its text
    is returned.
    """
    request_options = {
        "super": True,
        "geo_code": "us",
        "render": True,
        "wait_until": "domcontentloaded",
    }
    async with AsyncScrapeDoClient() as client:
        response = await client.get(url, **request_options)
    response.raise_for_status()
    return response.text


html = asyncio.run(fetch("https://shop.example.com/products/headphones-42"))

What Happened

Your keyword arguments were validated through RequestParameters and turned into a properly formatted https://api.scrape.do/?token=...&url=...&render=true&... URL.
The client then executed the request through httpx and wrapped the result in a ScrapeDoResponse.

Making A POST Request With A JSON Payload¶

POST Request Example

from scrape_do import ScrapeDoClient

# POST a JSON search payload to the target through the client
with ScrapeDoClient() as client:
    response = client.post(
        "https://api.shop.example.com/v1/search",
        body={"query": "wireless headphones", "page": 1, "sort": "price_asc"},
        payload_type="json",
        super=True,
        geo_code="us",
        # custom_headers is passed together with the headers mapping below
        custom_headers=True,
        headers={
            "Accept": "application/json",
            "X-Trace-Id": "scraper-7d34",
        },
    )

# Raise on a failed request, then read the "results" list from the JSON body
response.raise_for_status()
products = response.json()["results"]
print(f"Got {len(products)} products on page 1")

Using Browser Actions¶

Browser Actions Example

from scrape_do import ScrapeDoClient
from scrape_do.models import (
    ClickAction,
    FillAction,
    ScreenShotAction,
    ScrollYAction,
    WaitAction,
    WaitSelectorAction
    )

# Browser actions, listed in the order they are passed to the request
browser_actions = [
    # 1. Wait for the "Accept Cookies" button and dismiss it
    WaitSelectorAction(wait_selector="button#cookie-accept", timeout=5_000),
    ClickAction(selector="button#cookie-accept"),

    # 2. Filter for "In Stock" via the sidebar
    ClickAction(selector="input#filter-in-stock"),

    # 3. Let the result grid re-render
    WaitAction(timeout=1_500),

    # 4. Scroll down to trigger lazy-loaded cards
    ScrollYAction(value=2_400),
    WaitAction(timeout=1_000),

    # 5. Snapshot for evidence
    ScreenShotAction(full_screenshot=True),
]

with ScrapeDoClient() as client:
    response = client.get(
        "https://shop.example.com/catalog?q=headphones",
        render=True,
        return_json=True,
        wait_until="domcontentloaded",
        play_with_browser=browser_actions,
    )

response.raise_for_status()

# Stop at the first browser action that did not succeed
for ar in response.action_results:
    if ar.success:
        continue
    raise RuntimeError(f"action {ar.index} ({ar.action}) failed: {ar.error}")

# Save the first screenshot on the response to disk
first_screenshot = response.screenshots[0]
first_screenshot.to_file("catalog_in_stock.png")

Authenticated Crawl With Rotation Detection¶

Rotation Detection Example

from scrape_do import ScrapeDoClient, RotatedSessionError


# Session id sent as `session_id` with the login request and every page request
SESSION_ID = 7777
# Value of the "session" cookie returned by login(); None until the first login
AUTH_COOKIE: str | None = None


def login(client: ScrapeDoClient) -> str:
    """
    Post the login credentials through ``client`` using SESSION_ID and
    return the value of the "session" cookie found on the response

    ``raise_for_status()`` is called on the response before the
    cookie is read
    """
    credentials = {"email": "scraper@example.com", "password": "***"}
    login_response = client.post(
        "https://shop.example.com/api/auth/login",
        body=credentials,
        payload_type="json",
        session_id=SESSION_ID,
    )
    login_response.raise_for_status()
    session_cookie = login_response.cookies["session"]
    return session_cookie


def authenticated(response) -> bool:
    """
    Report whether the account-name marker is present in the page.

    While the marker shows up in ``response.text`` the same proxy node
    is still carrying our login. If Scrape.do rotated to a fresh IP,
    the target sees no session cookie and the marker disappears.
    """
    account_marker = '<span class="account-name">'
    page_html = response.text
    return account_marker in page_html


# The ten order pages to crawl (page=1 through page=10)
pages = [f"https://shop.example.com/orders?page={i}" for i in range(1, 11)]

# `authenticated` is handed to the client as its session validator
# NOTE(review): presumably the client raises RotatedSessionError when the
# validator returns False - confirm against the library documentation
with ScrapeDoClient(session_validator=authenticated) as client:
    # Log in once up front and keep the cookie for the page requests
    AUTH_COOKIE = login(client)

    for url in pages:
        try:
            response = client.get(
                url,
                session_id=SESSION_ID,
                set_cookies=f"session={AUTH_COOKIE}"
                )
        except RotatedSessionError:
            # Proxy node rotated mid-crawl
            # log back in on the new node and retry the page from scratch
            # The retry is attempted once: a RotatedSessionError raised by
            # the calls below is not caught here and propagates
            AUTH_COOKIE = login(client)
            response = client.get(
                url,
                session_id=SESSION_ID,
                set_cookies=f"session={AUTH_COOKIE}"
                )

Using Native Event Hooks¶

Event Hooks Logger Example

import logging
from scrape_do import (
    ScrapeDoClient,
    ScrapeDoResponse,
    PreparedScrapeDoRequest
)
from typing import Optional

logger = logging.getLogger("scrape_do")

def on_request(prepared: PreparedScrapeDoRequest):
    """Log the method and the API parameter URL of a prepared request."""
    message = f"Making {prepared.method} Request To {prepared.api_params.url}"
    logger.info(message)


def on_response(response: ScrapeDoResponse):
    """
    Log the target URL of a response and dump the response as JSON
    into a file in the current working directory.

    The file name is derived from ``response.request.url`` with every
    run of characters outside ``A-Z a-z 0-9 . _ -`` replaced by ``_``,
    followed by ``_resp.json``.
    """
    import re  # local import keeps this example snippet self-contained

    logger.info(
        f"Got Response From {response.target_url}"
        )

    # A raw URL contains "://" and "/", which open() would interpret as
    # directory separators and fail on; flatten it into a safe file name.
    # NOTE(review): if the request URL carries the API token in its query
    # string, the token ends up in the file name - confirm and redact if so.
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", str(response.request.url))

    with open(f"{safe_name}_resp.json", "w", encoding="utf-8") as f:
        f.write(response.to_json())


def on_retry(
    attempt: int,
    prepared: PreparedScrapeDoRequest,
    response: Optional[ScrapeDoResponse],
    exc: Optional[Exception],
):
    """
    Log the outcome of a retry attempt.

    When ``response`` is not None, the attempt is logged and the response
    is dumped as JSON to ``retry_<attempt>.json``. When ``exc`` is not
    None, a warning with the exception is logged. ``prepared`` is
    accepted but not used here.
    """
    if response is not None:
        logger.info(f"Got Response For Retry #{attempt}")
        with open(f"retry_{attempt}.json", "w") as f:
            f.write(response.to_json())

    if exc is not None:
        logger.warning(f"Retry #{attempt} Failed With Exception: {exc}")



# Map each hook name to the list of callbacks registered for it
event_hooks = {
    "request": [on_request],
    "response": [on_response],
    "retry": [on_retry],
}

with ScrapeDoClient(event_hooks=event_hooks) as client:
    client.get("https://www.example.com", super=True)