We use a cursor- and limit-based mechanism to help you paginate or stream results as the crawl continues.

The endpoint returns a cursor value that you pass back to fetch the next set of pages. A cursor is also returned when you paginate data by specifying a limit.
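For reference, each call to the pages endpoint returns a batch of pages along with the cursor for the next batch. The sketch below is illustrative only: the field names are taken from the example that follows, the values are placeholders, and the actual response may include additional fields.

example_pages_response = {
    "pages": [
        {"url": "https://sugarbooandco.com/", "retrieve_id": "<retrieve_id>"},
        # ... up to `limit` pages per call
    ],
    # `cursor` is present only while there are more pages to fetch;
    # pass it back on the next request to continue where you left off
    "cursor": "<next_cursor>",
}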

import requests
import time

# Define the API URL and your API key
API_URL = 'https://api.olostep.com'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/v1/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information (useful for checking the crawl's status)
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages with cursor pagination
def get_crawled_pages(crawl_id, cursor=None, limit=None):
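    # Note: requests omits query parameters whose value is None, so both are optional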
    params = {
        'cursor': cursor,
        'limit': limit
    }
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}/pages', headers=HEADERS, params=params)
    return response.json()

def crawl_with_cursor():
    data = {
        "start_url": "https://sugarbooandco.com",
        "max_pages": 100,
        "include_urls": ["/**"]
    }
    crawl = initiate_crawl(data)
    crawl_id = crawl['id']
    cursor = 0  # start from the first page of results

    # Pagination works both while the crawl is `in_progress` and after it is `completed`
    while True:
        crawl_pages = get_crawled_pages(crawl_id, cursor=cursor, limit=10)
        for page in crawl_pages['pages']:
            print(f"URL: {page['url']}")
            print(f"Retrieve ID: {page['retrieve_id']}")

        # Process the pages as they are received

        # Stop when the response no longer includes a cursor (no more pages to fetch)
        if 'cursor' not in crawl_pages:
            break
        cursor = crawl_pages['cursor']
        time.sleep(5)  # brief pause before requesting the next batch

# Run the example
crawl_with_cursor()
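
If you prefer to consume results as a stream, the same pagination loop can be wrapped in a generator. This is a minimal sketch that reuses the helpers defined above; stream_crawled_pages and its poll_interval parameter are illustrative names, not part of the API.

# Sketch: yield pages one by one as the crawl produces them
def stream_crawled_pages(crawl_id, limit=10, poll_interval=5):
    cursor = 0
    while True:
        crawl_pages = get_crawled_pages(crawl_id, cursor=cursor, limit=limit)
        for page in crawl_pages['pages']:
            yield page

        # No cursor in the response means there is nothing left to fetch
        if 'cursor' not in crawl_pages:
            break
        cursor = crawl_pages['cursor']
        time.sleep(poll_interval)

# Example usage:
# for page in stream_crawled_pages(crawl_id, limit=10):
#     print(page['url'], page['retrieve_id'])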