Examples¶
Usage examples for the ScrapeDoClient and AsyncScrapeDoClient
Authentication
-
Every example assumes `SCRAPE_DO_API_KEY` is set in the environment so that the clients can pick it up automatically
-
Pass `api_token=...` to the constructor to wire it in explicitly
Making A GET Request¶
GET Request Example
from scrape_do import ScrapeDoClient
# No token is passed here: the client picks up SCRAPE_DO_API_KEY from the
# environment (see "Authentication" above).
with ScrapeDoClient() as client:
    response = client.get(
        "https://shop.example.com/products/headphones-42",
        super=True,
        geo_code="us",
        render=True,
        wait_until="domcontentloaded",
    )
    # Check the response before reading anything off it.
    response.raise_for_status()
    print(response.target_status_code)
    print(response.remaining_credits)
import asyncio
from scrape_do import AsyncScrapeDoClient
async def fetch(url: str) -> str:
    """Fetch ``url`` through ``AsyncScrapeDoClient`` and return the body text.

    ``raise_for_status()`` is called on the response before its text is
    returned, so whatever it raises propagates to the caller.
    """
    async with AsyncScrapeDoClient() as client:
        response = await client.get(
            url,
            super=True,
            geo_code="us",
            render=True,
            wait_until="domcontentloaded",
        )
        response.raise_for_status()
        return response.text


html = asyncio.run(fetch("https://shop.example.com/products/headphones-42"))
What Happened
-
Your keyword arguments were validated through `RequestParameters` and turned into a properly formatted `https://api.scrape.do/?token=...&url=...&render=true&...` URL
-
The client executed it through `httpx` and wrapped the result in `ScrapeDoResponse`
Making A POST Request With A JSON Payload¶
POST Request Example
from scrape_do import ScrapeDoClient
with ScrapeDoClient() as client:
    response = client.post(
        "https://api.shop.example.com/v1/search",
        body={"query": "wireless headphones", "page": 1, "sort": "price_asc"},
        payload_type="json",
        super=True,
        geo_code="us",
        # NOTE(review): custom_headers=True is passed alongside headers=...;
        # presumably it asks Scrape.do to forward them to the target - confirm.
        custom_headers=True,
        headers={
            "Accept": "application/json",
            "X-Trace-Id": "scraper-7d34",
        },
    )
    response.raise_for_status()
    products = response.json()["results"]
    print(f"Got {len(products)} products on page 1")
Using Browser Actions¶
Browser Actions Example
from scrape_do import ScrapeDoClient
from scrape_do.models import (
ClickAction,
FillAction,
ScreenShotAction,
ScrollYAction,
WaitAction,
WaitSelectorAction
)
with ScrapeDoClient() as client:
    response = client.get(
        "https://shop.example.com/catalog?q=headphones",
        render=True,
        return_json=True,
        wait_until="domcontentloaded",
        play_with_browser=[
            # 1. Wait for the "Accept Cookies" button and dismiss it
            WaitSelectorAction(wait_selector="button#cookie-accept", timeout=5_000),
            ClickAction(selector="button#cookie-accept"),
            # 2. Filter for "In Stock" via the sidebar
            ClickAction(selector="input#filter-in-stock"),
            # 3. Let the result grid re-render
            WaitAction(timeout=1_500),
            # 4. Scroll down to trigger lazy-loaded cards
            ScrollYAction(value=2_400),
            WaitAction(timeout=1_000),
            # 5. Snapshot for evidence
            ScreenShotAction(full_screenshot=True),
        ],
    )
    response.raise_for_status()

    # Stop at the first browser action that did not succeed.
    for ar in response.action_results:
        if not ar.success:
            raise RuntimeError(f"action {ar.index} ({ar.action}) failed: {ar.error}")

    # Save the first screenshot on the response to disk.
    response.screenshots[0].to_file("catalog_in_stock.png")
Authenticated Crawl With Rotation Detection¶
Rotation Detection Example
from scrape_do import ScrapeDoClient, RotatedSessionError
# Session id passed as session_id= on every request in this example
SESSION_ID = 7777
# Session cookie value returned by login(); None until the first login
AUTH_COOKIE: str | None = None
def login(client: ScrapeDoClient) -> str:
    """
    Authenticate against the site and return the session cookie
    the target hands back

    The login request is sent with ``session_id=SESSION_ID`` and
    ``raise_for_status()`` is called before the cookie is read.
    """
    response = client.post(
        "https://shop.example.com/api/auth/login",
        body={"email": "scraper@example.com", "password": "***"},
        payload_type="json",
        session_id=SESSION_ID,
    )
    response.raise_for_status()
    return response.cookies["session"]
def authenticated(response) -> bool:
    """
    As long as the logged-in account name shows up in the page,
    the same proxy node is still carrying our login. If Scrape.do
    rotated to a fresh IP, the target sees no session cookie and
    the marker disappears.

    Returns True when the account-name marker is present in
    ``response.text`` and False otherwise.
    """
    return '<span class="account-name">' in response.text
pages = [f"https://shop.example.com/orders?page={i}" for i in range(1, 11)]

with ScrapeDoClient(session_validator=authenticated) as client:
    AUTH_COOKIE = login(client)
    for url in pages:
        try:
            response = client.get(
                url,
                session_id=SESSION_ID,
                set_cookies=f"session={AUTH_COOKIE}",
            )
        except RotatedSessionError:
            # Proxy node rotated mid-crawl
            # log back in on the new node and retry the page from scratch
            AUTH_COOKIE = login(client)
            # Single retry: a second RotatedSessionError here is not caught.
            response = client.get(
                url,
                session_id=SESSION_ID,
                set_cookies=f"session={AUTH_COOKIE}",
            )
Using Native Event Hooks¶
Event Hooks Logger Example
import logging
from scrape_do import (
ScrapeDoClient,
ScrapeDoResponse,
PreparedScrapeDoRequest
)
from typing import Optional
# Logger used by the on_request / on_response / on_retry hooks below
logger = logging.getLogger("scrape_do")
def on_request(prepared: PreparedScrapeDoRequest):
    """Request hook: log the method and target URL of the prepared request."""
    logger.info(f"Making {prepared.method} Request To {prepared.api_params.url}")
def on_response(response: ScrapeDoResponse):
    """Response hook: log the target URL and dump the response to a JSON file.

    The file is written to the current working directory and is named after
    the request URL with every run of characters outside ``A-Za-z0-9._-``
    collapsed to a single underscore, followed by ``_resp.json``.
    """
    import re  # local import keeps this hook self-contained

    logger.info(
        f"Got Response From {response.target_url}"
    )
    # A raw URL contains "://" and "/", which open() treats as directory
    # separators pointing at folders that do not exist. Flatten it first.
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", str(response.request.url))
    with open(f"{safe_name}_resp.json", "w") as f:
        f.write(response.to_json())
def on_retry(
    attempt: int,
    prepared: PreparedScrapeDoRequest,
    response: Optional[ScrapeDoResponse],
    exc: Optional[Exception],
):
    """Retry hook: record what the retry attempt produced.

    Args:
        attempt: Retry number, used in the log messages and the dump filename.
        prepared: The prepared request being retried (not used here).
        response: Response for this attempt, or None.
        exc: Exception for this attempt, or None.
    """
    if response is not None:
        logger.info(f"Got Response For Retry #{attempt}")
        # Keep a copy of the response for later inspection.
        with open(f"retry_{attempt}.json", "w") as f:
            f.write(response.to_json())
    if exc is not None:
        logger.warning(f"Retry #{attempt} Failed With Exception: {exc}")
# Register the callbacks defined above; each event name maps to a list of hooks.
with ScrapeDoClient(
    event_hooks={
        "request": [on_request],
        "response": [on_response],
        "retry": [on_retry],
    }
) as client:
    client.get("https://www.example.com", super=True)