Crawl
Paginate or Stream using Cursor
We use a cursor- and limit-based mechanism to help you paginate or stream results as the crawl continues. The endpoint returns a cursor value that you pass back to fetch the next set of pages; a cursor is also returned when you paginate data by specifying a limit.
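Based on the example below, a paginated response looks roughly like this (a sketch, not the full schema; consult the API reference for any additional fields):

# Illustrative response from GET /v1/crawls/{crawl_id}/pages.
# Only the fields used in the example below are shown; treat anything
# beyond `pages`, `url`, `retrieve_id`, and `cursor` as unspecified here.
{
    "pages": [
        {"url": "<crawled_page_url>", "retrieve_id": "<retrieve_id>"}
    ],
    "cursor": 10  # omitted when there are no more pages to fetch
}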
import requests
import time

# Define the API URL and your API key
API_URL = 'https://api.olostep.com'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/v1/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages with cursor pagination
def get_crawled_pages(crawl_id, cursor=None, limit=None):
    params = {
        'cursor': cursor,
        'limit': limit
    }
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}/pages', headers=HEADERS, params=params)
    return response.json()

def crawl_with_cursor():
    data = {
        "start_url": "https://sugarbooandco.com",
        "max_pages": 100,
        "include_urls": ["/**"]
    }
    crawl = initiate_crawl(data)
    crawl_id = crawl['id']
    cursor = 0

    # Works both while the crawl is `in_progress` and after it is `completed`
    while True:
        crawl_pages = get_crawled_pages(crawl_id, cursor=cursor, limit=10)
        for page in crawl_pages['pages']:
            print(f"URL: {page['url']}")
            print(f"Retrieve ID: {page['retrieve_id']}")
            # Process the pages as they are received
        if 'cursor' not in crawl_pages:
            break
        cursor = crawl_pages['cursor']
        time.sleep(5)

# Run the example
crawl_with_cursor()
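The loop polls every five seconds and exits once a response no longer contains a cursor, so the same code streams pages while the crawl is in_progress and paginates them after it has completed. You can also call get_crawl_info(crawl_id) at any point to inspect the crawl's progress.

Each retrieve_id identifies the stored content for one crawled page. As a minimal sketch, assuming a GET /v1/retrieve endpoint that accepts retrieve_id as a query parameter (check the API reference for the exact route and response fields), you could fetch a page's content like this:

# Hypothetical helper: exchange a retrieve_id for the page's content.
# The /v1/retrieve route and its parameters are assumptions here; verify
# them against the API reference before relying on this.
def retrieve_content(retrieve_id):
    response = requests.get(
        f'{API_URL}/v1/retrieve',
        headers=HEADERS,
        params={'retrieve_id': retrieve_id}
    )
    return response.json()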