We use a cursor- and limit-based mechanism to help you paginate or stream results while the crawl is still running.
The endpoint returns a cursor value that you pass back to fetch the next set of pages. The cursor is also returned whenever you paginate results by specifying a limit.
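For illustration, a single request to the pages endpoint and the response shape the full example below relies on might look like the sketch that follows. The field names (`pages`, `url`, `retrieve_id`, `cursor`) are taken from that example; treat the exact response shape as an assumption.

```python
import requests

API_URL = 'https://api.olostep.com'
HEADERS = {'Authorization': 'Bearer <your_token>'}

# One paginated request: pass the cursor returned by the previous response.
# requests drops params whose value is None, so omitting the cursor
# fetches the first page.
resp = requests.get(
    f'{API_URL}/v1/crawls/<crawl_id>/pages',
    headers=HEADERS,
    params={'cursor': 0, 'limit': 10},
).json()

# Assumed response shape, inferred from the full example below:
# {
#   "pages": [{"url": "https://...", "retrieve_id": "..."}],
#   "cursor": 10   # omitted once there are no further pages to fetch
# }
```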
```python
import requests
import time

# Define the API URL and your API key
API_URL = 'https://api.olostep.com'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/v1/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages with cursor pagination
def get_crawled_pages(crawl_id, cursor=None, limit=None):
    params = {'cursor': cursor, 'limit': limit}
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}/pages', headers=HEADERS, params=params)
    return response.json()

def crawl_with_cursor():
    data = {
        "start_url": "https://sugarbooandco.com",
        "max_pages": 100,
        "include_urls": ["/**"]
    }
    crawl = initiate_crawl(data)
    crawl_id = crawl['id']
    cursor = 0

    # Works both while the crawl is `in_progress` and after it is `completed`
    while True:
        crawl_pages = get_crawled_pages(crawl_id, cursor=cursor, limit=10)
        for page in crawl_pages['pages']:
            print(f"URL: {page['url']}")
            print(f"Retrieve ID: {page['retrieve_id']}")
            # Process the pages as they are received
        if 'cursor' not in crawl_pages:
            break
        cursor = crawl_pages['cursor']
        time.sleep(5)

# Run the example
crawl_with_cursor()
```
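Two details of the loop are worth noting: the absence of a `cursor` key in the response is what signals that there are no more pages and the loop should stop, and the `time.sleep(5)` call throttles polling so that, while the crawl is still `in_progress`, newly crawled pages have time to arrive between requests.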