The crawls endpoint allows you to autonomously crawl and scrape all relevant pages of a website, such as documentation sites, blogs, and more.

Initiating a Crawl

To start a new crawl, provide the start URL, the URL patterns to include, and the maximum number of pages to crawl.

Other optional parameters are also available, such as the maximum depth of the crawl, URL patterns to exclude, and whether to include external URLs.
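For instance, a request body combining these options might look like the sketch below (the values are illustrative; adjust the patterns and limits to your target site):

# Illustrative request body for POST /v1/crawls (example values only)
payload = {
    "start_url": "https://example.com/docs",   # where the crawl begins
    "include_urls": ["/docs/**"],              # only follow URLs matching this pattern
    "max_pages": 50,                           # stop after 50 pages
    "exclude_urls": ["/docs/changelog/**"],    # optional: skip matching URLs
    "max_depth": 3,                            # optional: limit how many links deep to crawl
    "include_external": False                  # optional: stay on the starting domain
}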

Example

In this example, we initiate a crawl on https://sugarbooandco.com and specify that we want to crawl up to 100 pages, including all URLs except those under /collections.

import json
import requests
import time

# Define the API URL and your API key
API_URL = 'https://api.olostep.com'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/v1/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages, optionally requesting specific formats
def get_crawled_pages(crawl_id, formats=None):
    params = {'formats': json.dumps(formats)} if formats else {}
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}/pages', headers=HEADERS, params=params)
    return response.json()

# Function to retrieve content
def retrieve_content(retrieve_id, formats):
    api_url = f"{API_URL}/v1/retrieve"
    params = {
        "retrieve_id": retrieve_id,
        "formats": json.dumps(formats)
    }
    response = requests.get(api_url, headers=HEADERS, params=params)
    return response.json()

# Data for initiating the crawl
data = {
    "start_url": "https://sugarbooandco.com",
    "max_pages": 100,
    "include_urls": ["/**"],
    "exclude_urls": ["/collections/**"], # Optional
    "max_depth": None, # Default
    "include_external": False # Default
}

# Initiate the crawl
crawl = initiate_crawl(data)
crawl_id = crawl['id']

# Wait for the crawl to complete
while True:
    info = get_crawl_info(crawl_id)
    if info['status'] == 'completed':
        break
    time.sleep(5)

# Get the list of crawled pages with their retrieve_id
crawl_pages = get_crawled_pages(crawl_id, formats=None)

# Print the content of the crawled pages
for page in crawl_pages['pages']:
    print(f"URL: {page['url']}")
    print(f"Retrieve ID: {page['retrieve_id']}")

    retrieve_id = page['retrieve_id']
    formats = ["html", "markdown"]
    retrieved_data = retrieve_content(retrieve_id, formats)
    
    if retrieved_data:
        html_content = retrieved_data.get("html_content")
        markdown_content = retrieved_data.get("markdown_content")
        print(f"HTML Content:\n{html_content}")
        print(f"Markdown Content:\n{markdown_content}")
    else:
        print("Failed to retrieve content")

For more detailed information, please refer to the API reference.