Quick start
pip install olostep
"""
The quickstart uses the async/await interface as it's the default and generally preferred.
* If you need a blocking interface scroll to the end of this codeblock.
* If you want to see the full interfaces scroll to the next section.
"""
from olostep import OlostepClient
# Provide the API key either by passing the 'api_key' parameter or
# by setting the OLOSTEP_API_KEY environment variable
client = OlostepClient(api_key="YOUR_REAL_KEY")
# MINIMAL SCRAPE EXAMPLE
scrape_result = await client.scrape("https://example.com")
# -> ScrapeResult(id='scrape_123', available=['html_content', 'markdown_content'])
# MINIMAL BATCH EXAMPLE
batch = await client.batch(["https://site1.com", "https://site2.com"])
# -> Batch(id='batch_123', urls=2)
# waits for all the batch jobs to finish, then starts fetching the results in batches
async for item in batch.items():
    content = await item.retrieve(["html"])
    print(f"{item.url}: {len(content.html_content)} bytes")
# MINIMAL CRAWL EXAMPLE
crawl = await client.crawl("https://example.com", max_pages=100)
# -> Crawl(id='crawl_123', urls=100)
async for page in crawl.pages():
    content = await page.retrieve(["html"])
    print(f"{page.url}: {len(content.html_content)} bytes")
# SYNC (FACADE) CLIENT
# This client is just a wrapper around the async client.
# The interface is the same, just without await.
# Prefer the async OlostepClient whenever you can.
from olostep import SyncOlostepClient
client = SyncOlostepClient(api_key="YOUR_REAL_KEY")
scrape_result = client.scrape("https://example.com")
# -> ScrapeResult(id='scrape_123', available=['html_content', 'markdown_content'])
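The examples above use top-level await for brevity. In a plain script you need to drive the async client from an event loop yourself; below is a minimal sketch using the standard library's asyncio (the main() wrapper is ours, not part of the SDK).
import asyncio

from olostep import OlostepClient

async def main():
    # Same minimal scrape as in the quickstart, just wrapped in a coroutine
    client = OlostepClient(api_key="YOUR_REAL_KEY")
    result = await client.scrape("https://example.com")
    print(result)

asyncio.run(main())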
API Design
The SDK provides a clean, Pythonic interface organized into logical namespaces. Each operation returns stateful objects with ergonomic methods for follow-up operations.
Scraping
from olostep import OlostepClient
from olostep import Country, FillInputAction, Format, LLMExtract, LinksOnPage, ScreenSize, Transformer, WaitAction
client = OlostepClient(api_key="YOUR_REAL_KEY")
# Minimal: Just scrape a URL
result = await client.scrape("https://example.com")
# ScrapeResult(id='scrape_123', available=['html_content', 'markdown_content'])
# Maximal: Full control over scraping behavior
result = await client.scrape(
"https://example.com",
wait_before_scraping=3000,
formats=[Format.HTML, Format.MARKDOWN],
remove_css_selectors=["script", ".popup"],
actions=[
WaitAction(milliseconds=1500),
FillInputAction(selector="searchbox", value="olostep")
],
country=Country.US,
transformer=Transformer("postlight"),
remove_images=True,
remove_class_names=["ad"],
parser="VALID_PARSER", # check website for valid parsers
llm_extract=LLMExtract(schema="YOUR_SCHEMA"),
links_on_page=LinksOnPage(
absolute_links=False,
query_to_order_links_by='cars',
include_links=["/events/**", "/offers/**"],
exclude_links=[".pdf"]
),
screen_size=ScreenSize(screen_width=1920, screen_height=1080),
metadata={"custom": "sidecart_data"} # Not supported yet
)
Batch Processing
from olostep import OlostepClient
from olostep import BatchItem, Country
client = OlostepClient(api_key="YOUR_REAL_KEY")
# Minimal: Process a list of URLs
batch = await client.batch(["https://site1.com", "https://site2.com"])
# Batch(id='batch_123', urls=2)
# Maximal: Advanced batch with custom IDs and options
batch = await client.batch(
[
BatchItem(url="https://www.google.com/search?q=olostep"),
BatchItem(url="https://www.google.com/search?q=olostep+api", custom_id="news_2")
],
country=Country.US,
parser_id="@olostep/google-search"
)
# This is optional, but you can check on the progress of your batch at any time with:
info = await batch.info()
# -> BatchInfo(id='batch_123', status='in_progress', completed=1/2, age=2h ago)
# Also optional: Wait for completion.
# Pass `check_every_n_secs=` to change the polling interval (default: 10)
await batch.wait_till_done()
# Note: batch.items() automatically checks if the batch is completed before starting to return elements (can be disabled by passing in `wait_for_completion=False`)
async for item in batch.items(batch_size=10):
    content = await item.retrieve(["html", "json"])  # json from the parser
    print(f"{item.custom_id}: {len(content.html_content)} bytes")
# Alternative: Direct API access (stateless)
async for item in client.batch.items(batch_id='a_batch_id', batch_size=10):
    content = await item.retrieve(["html", "json"])
    print(f"{item.custom_id}: {len(content.html_content)} bytes")
Web Crawling
# Minimal: Crawl a site with default settings
crawl = await client.crawl("https://example.com", max_pages=100)
# Crawl(id='crawl_123', urls=100)
# Maximal: Advanced crawling with filters and limits
crawl = await client.crawl(
"https://example.com",
max_pages=1000,
max_depth=3,
include_urls=["/articles/**", "/news/**"],
exclude_urls=["/ads/**", "/tracking/**"],
include_external=False,
include_subdomain=True,
search_query="hot shingles",
top_n=50
)
# This is optional, but you can check on the progress of your crawl at any time with:
info = await crawl.info() # CrawlInfo(id='crawl_123', status='in_progress', pages_count=42, age=15m ago)
# Also optional: Wait for completion.
# Pass `check_every_n_secs=` to change the polling interval (default: 10)
await crawl.wait_till_done()
# Note: crawl.pages() automatically checks if the crawl is completed before starting to return elements (can be disabled by passing `wait_for_completion=False`)
async for page in crawl.pages():
    content = await page.retrieve(["html"])
    print(f"{page.url}: {len(content.html_content)} bytes")
# Alternative: Direct API access (stateless)
async for page in client.crawl.pages(crawl_id='a_crawl_id'):
    content = await page.retrieve(["html"])
    print(f"{page.url}: {len(content.html_content)} bytes")
Site Mapping
# Minimal: Extract all links from a site
sitemap = await client.sitemap("https://example.com")
# Sitemap(id='map_123', urls_count=150, has_more=True)
# Maximal: Advanced link extraction with filters
sitemap = await client.sitemap(
"https://example.com",
search_query="documentation",
top_n=500,
include_subdomain=True,
include_urls=["/docs/**", "/api/**"],
exclude_urls=["/admin/**", "/private/**"]
)
# Seamless iteration over all URLs (auto-pagination)
all_urls = []
async for url in sitemap.urls():  # async generator
    print(f"Found URL: {url}")
    all_urls.append(url)
# Note: This can yield tens of thousands of URLs. If you can, avoid building a list
# and consume the generator directly.
Data Retrieval
# Notes:
# * You should generally not need this endpoint, as the other endpoints return stateful objects that can retrieve content themselves.
# * Not all formats are available all the time
# Minimal: Get content by retrieve ID
result = await client.retrieve("ret_123")
# ScrapeResult(id='ret_123', available=[...])
# Maximal: Get multiple formats
result = await client.retrieve("ret_123", ["html", "markdown", "text", "json"])
# ScrapeResult(id='ret_123', available=['html_content', 'markdown_content', 'text_content', 'json_content'])
Method Shorthands
The SDK provides convenient shorthand methods for common operations:
# These are equivalent:
await client.scrape("https://example.com") # shorthand
await client.scrape.create("https://example.com") # explicit method
await client.batch(["url1", "url2"]) # shorthand
await client.batch.start(["url1", "url2"]) # explicit method
await client.crawl("https://example.com") # shorthand
await client.crawl.start("https://example.com") # explicit method
await client.sitemap("https://example.com") # shorthand
await client.sitemap.create("https://example.com") # explicit method
await client.retrieve("ret_123") # shorthand
await client.retrieve.get("ret_123") # explicit method
Smart Input Coercion
The SDK intelligently handles various input formats for maximum convenience:
# Formats: string, list, or enum
await client.scrape("https://example.com", formats="html")
await client.scrape("https://example.com", formats=["html", "markdown"])
# Countries: case-insensitive strings or enums
await client.scrape("https://example.com", country="us")
await client.scrape("https://example.com", country=Country.US)
# Lists: single values or lists
await client.batch("https://example.com") # Single URL
await client.batch(["https://a.com", "https://b.com"]) # Multiple URLs
Logging & Error Handling
The SDK root logger is predictably called `olostep`; granular sub-loggers exist beneath it.
The recommended log level is `INFO` or higher. `DEBUG` really means debug and is very detailed.
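Since the logger name is fixed, you can tune the SDK's verbosity with the standard library's logging module; a minimal sketch (nothing here is olostep-specific beyond the logger name documented above):
import logging

logging.basicConfig(level=logging.WARNING)           # default for everything else
logging.getLogger("olostep").setLevel(logging.INFO)  # recommended level for the SDK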
Handling the error behavior of the API can be complex. The SDK handles error detection for you and raises the exceptions below.
Our exception hierarchy:
* Olostep_BaseError -------------------------------------- <- Catch base class for all errors
x Olostep_APIConnectionError --------------------------- <- No connection to the API
x OlostepServerError_BaseError ------------------------- <- Server-issued errors (still detected in the client, of course)
+ OlostepServerError_TemporaryIssue
- OlostepServerError_NetworkBusy
- OlostepServerError_InternalNetworkIssue
+ OlostepServerError_RequestUnprocessable
- OlostepServerError_ParserNotFound
- OlostepServerError_OutOfResources
+ OlostepServerError_BlacklistedDomain
+ OlostepServerError_FeatureApprovalRequired
+ OlostepServerError_AuthFailed
+ OlostepServerError_CreditsExhausted
+ OlostepServerError_InvalidEndpointCalled
+ OlostepServerError_ResourceNotFound
+ OlostepServerError_NoResultInResponse
+ OlostepServerError_UnknownIssue
x OlostepClientError_BaseError ------------------------- <- Client-issued errors
+ OlostepClientError_RequestValidationFailed
+ OlostepClientError_ResponseValidationFailed
+ OlostepClientError_NoAPIKey
+ OlostepClientError_AsyncContext
+ OlostepClientError_BetaFeatureAccessRequired
+ OlostepClientError_Timeout
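A sketch of the intended usage pattern, assuming the exception classes are importable from the top-level olostep package (the exact import path may differ):
# Catch specific errors first, then fall back to the base class.
from olostep import (
    Olostep_BaseError,
    Olostep_APIConnectionError,
    OlostepServerError_CreditsExhausted,
)

try:
    result = await client.scrape("https://example.com")
except OlostepServerError_CreditsExhausted:
    ...  # top up credits or slow down before retrying
except Olostep_APIConnectionError:
    ...  # the API could not be reached; retry with backoff
except Olostep_BaseError:
    ...  # catch-all for anything else the SDK raises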