To start a new crawl, provide the starting point, the URLs to include, and the maximum number of pages. Other optional parameters are also available, such as the crawl depth, URL patterns to exclude, and whether to include external URLs.
In this example, we initiate a crawl on https://sugarbooandco.com and specify that we want to crawl up to 100 pages, matching all URLs except those under /collections.
```python
import requests
import time

# Define the API URL and your API key
API_URL = 'https://api.olostep.com'
API_KEY = '<YOUR_API_KEY>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/v1/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages
def get_crawled_pages(crawl_id):
    response = requests.get(f'{API_URL}/v1/crawls/{crawl_id}/pages', headers=HEADERS)
    return response.json()

# Function to retrieve the content of a crawled page
def retrieve_content(retrieve_id):
    params = {"retrieve_id": retrieve_id}
    response = requests.get(f'{API_URL}/v1/retrieve', headers=HEADERS, params=params)
    return response.json()

# Data for initiating the crawl
data = {
    "start_url": "https://sugarbooandco.com",
    "max_pages": 100,
    "include_urls": ["/**"],
    "exclude_urls": ["/collections/**"],  # Optional
    "max_depth": None,                    # Default
    "include_external": False             # Default
}

# Initiate the crawl
crawl = initiate_crawl(data)
crawl_id = crawl['id']

# Wait for the crawl to complete
while True:
    info = get_crawl_info(crawl_id)
    if info['status'] == 'completed':
        break
    time.sleep(5)

# Get the list of crawled pages with their retrieve_id
crawl_pages = get_crawled_pages(crawl_id)

# Print the content of each crawled page
for page in crawl_pages['pages']:
    print(f"URL: {page['url']}")
    print(f"Retrieve ID: {page['retrieve_id']}")
    retrieved_data = retrieve_content(page['retrieve_id'])
    if retrieved_data:
        html_content = retrieved_data.get("html_content")
        markdown_content = retrieved_data.get("markdown_content")
        print(f"HTML Content:\n{html_content}")
        print(f"Markdown Content:\n{markdown_content}")
    else:
        print("Failed to retrieve content")
```
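The `while True` loop above polls indefinitely. In practice you may want a timeout and handling for a terminal failure state. Here is a minimal sketch, assuming a hypothetical `failed` status value (check the API reference for the exact status values the crawl endpoint returns):

```python
# Minimal polling helper with a timeout; reuses get_crawl_info from above.
# The 'failed' status is an assumption for illustration, not a confirmed API value.
def wait_for_crawl(crawl_id, timeout_seconds=600, poll_interval=5):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        info = get_crawl_info(crawl_id)
        status = info.get('status')
        if status == 'completed':
            return info
        if status == 'failed':  # hypothetical terminal state
            raise RuntimeError(f"Crawl {crawl_id} failed: {info}")
        time.sleep(poll_interval)
    raise TimeoutError(f"Crawl {crawl_id} did not complete within {timeout_seconds}s")
```

With this helper, `info = wait_for_crawl(crawl_id)` replaces the bare polling loop.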
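If you want to persist the results rather than print them, a sketch like the following writes each page's markdown to a local file. It reuses `retrieve_content` and the `markdown_content` field from the example above; the file-naming scheme is an illustrative choice, not part of the API:

```python
from pathlib import Path
from urllib.parse import urlparse

# Save each crawled page's markdown to out_dir, deriving a filename from the URL path.
def save_markdown(pages, out_dir="crawl_output"):
    Path(out_dir).mkdir(exist_ok=True)
    for page in pages:
        retrieved = retrieve_content(page['retrieve_id'])
        markdown = retrieved.get("markdown_content")
        if not markdown:
            continue
        # Turn the URL path into a flat filename, e.g. /pages/about -> pages_about.md
        slug = urlparse(page['url']).path.strip('/').replace('/', '_') or 'index'
        Path(out_dir, f"{slug}.md").write_text(markdown, encoding="utf-8")

save_markdown(crawl_pages['pages'])
```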
For more detailed information, please refer to the API reference.