In this example, we initiate a crawl on https://sugarbooandco.com and specify that we want to crawl up to 100 pages, including all URLs except those under /collections.
```python
import requests
import json
import time

# Define the API URL and your API key
API_URL = 'https://api.olostep.com'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/v1/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages
def get_crawled_pages(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}/pages', headers=HEADERS)
    return response.json()

# Function to retrieve content in the requested formats
def retrieve_content(retrieve_id, formats):
    api_url = f"{API_URL}/v1/retrieve"
    params = {"retrieve_id": retrieve_id, "formats": json.dumps(formats)}
    response = requests.get(api_url, headers=HEADERS, params=params)
    return response.json()

# Data for initiating the crawl
data = {
    "start_url": "https://sugarbooandco.com",
    "max_pages": 100,
    "include_urls": ["/**"],
    "exclude_urls": ["/collections/**"],  # Optional
    "max_depth": None,                    # Default
    "include_external": False             # Default
}

# Initiate the crawl
crawl = initiate_crawl(data)
crawl_id = crawl['id']

# Wait for the crawl to complete
while True:
    info = get_crawl_info(crawl_id)
    if info['status'] == 'completed':
        break
    time.sleep(5)

# Get the list of crawled pages with their retrieve_id
crawl_pages = get_crawled_pages(crawl_id)

# Print the content of the crawled pages
for page in crawl_pages['pages']:
    print(f"URL: {page['url']}")
    print(f"Retrieve ID: {page['retrieve_id']}")
    retrieve_id = page['retrieve_id']
    formats = ["html", "markdown"]
    retrieved_data = retrieve_content(retrieve_id, formats)
    if retrieved_data:
        html_content = retrieved_data.get("html_content")
        markdown_content = retrieved_data.get("markdown_content")
        print(f"HTML Content:\n{html_content}")
        print(f"Markdown Content:\n{markdown_content}")
    else:
        print("Failed to retrieve content")
```
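The polling loop above waits indefinitely. For long-running crawls you may prefer to bound the wait and bail out on a terminal failure. Below is a minimal sketch of a more defensive loop, reusing the `get_crawl_info` helper from the example; the `'failed'` status check and the timeout values are assumptions for illustration, not part of the documented API.

```python
# A minimal sketch of a bounded polling loop (reuses get_crawl_info and
# the `time` import from the example above).
def wait_for_crawl(crawl_id, timeout_seconds=600, poll_interval=5):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        info = get_crawl_info(crawl_id)
        status = info.get('status')
        if status == 'completed':
            return info
        if status == 'failed':  # assumed terminal status, for illustration
            raise RuntimeError(f"Crawl {crawl_id} failed: {info}")
        time.sleep(poll_interval)
    raise TimeoutError(f"Crawl {crawl_id} did not complete within {timeout_seconds}s")
```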
For more detailed information, please refer to the API reference.