You can also use a search query to refine the crawl and limit the results to the top N most relevant pages.
In this example, we initiate a crawl with the search query “contact us” and limit crawling to the top 5 most relevant URLs on each page.
import requests
import time

# Define the API URL and your API key
API_URL = 'https://api.olostep.com/v1'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages matching a search query
def get_crawled_pages(crawl_id, search_query=None):
    params = {'search_query': search_query}
    response = requests.get(f'{API_URL}/crawls/{crawl_id}/list', headers=HEADERS, params=params)
    return response.json()

def crawl_with_search_query():
    data = {
        "start_url": "https://sugarbooandco.com",
        "max_pages": 100,
        "include_urls": ["/**"],
        "search_query": "contact us",  # look for URLs matching the query
        "top_n": 5                     # only visit the top N matching URLs per page
    }
    crawl = initiate_crawl(data)
    crawl_id = crawl['id']
    print(crawl_id)

    # Poll until the crawl completes
    while True:
        info = get_crawl_info(crawl_id)
        if info['status'] == 'completed':
            break
        time.sleep(5)

    crawled_pages = get_crawled_pages(crawl_id, search_query="contact us")
    print(crawled_pages['pages'])

# Run the example
crawl_with_search_query()
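In production you may not want to poll indefinitely. Below is a minimal sketch that wraps the polling step from the example above in a helper with a client-side timeout; the wait_for_crawl name and the 300-second timeout and 5-second interval defaults are illustrative choices, and only the 'completed' status shown above is assumed to be reported by the API.

# Hypothetical polling helper with a client-side timeout (a sketch, not part
# of the Olostep API). Reuses get_crawl_info from the example above; only the
# 'completed' status is assumed, and the timeout/interval values are illustrative.
def wait_for_crawl(crawl_id, timeout=300, interval=5):
    deadline = time.time() + timeout
    while time.time() < deadline:
        info = get_crawl_info(crawl_id)
        if info['status'] == 'completed':
            return info
        time.sleep(interval)
    raise TimeoutError(f'Crawl {crawl_id} did not complete within {timeout} seconds')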