Crawl
Search for Relevant Pages
You can also pass a search query to refine a crawl and limit the results to the top N most relevant pages.
In this example, we initiate a crawl with the search query "contact us" and restrict it to the top 5 most relevant URLs on each page.
import requests
import time

# Define the API base URL and your API key
API_URL = 'https://api.olostep.com/v1'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages, optionally filtered by a search query
def get_crawled_pages(crawl_id, search_query=None):
    params = {
        'search_query': search_query
    }
    response = requests.get(f'{API_URL}/crawls/{crawl_id}/list', headers=HEADERS, params=params)
    return response.json()

def crawl_with_search_query():
    data = {
        "start_url": "https://sugarbooandco.com",
        "max_pages": 100,
        "include_urls": ["/**"],
        "search_query": "contact us",  # look for URLs matching the query
        "top_n": 5  # only visit the top n matching URLs per page
    }
    crawl = initiate_crawl(data)
    crawl_id = crawl['id']
    print(crawl_id)

    # Poll until the crawl has finished
    while True:
        info = get_crawl_info(crawl_id)
        if info['status'] == 'completed':
            break
        time.sleep(5)

    crawled_pages = get_crawled_pages(crawl_id, search_query="contact us")
    print(crawled_pages['pages'])

# Run the example
crawl_with_search_query()
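The polling loop above waits indefinitely if the crawl never reaches the completed status. Below is a minimal sketch of a bounded variant that reuses the get_crawl_info helper defined above; note that the 'failed' terminal status checked here is an assumption for illustration, not a status confirmed by this example, so adjust it to whatever statuses your crawls actually report.

# Sketch of bounded polling, reusing get_crawl_info() and time from above.
# The 'failed' status string is an assumption and may differ from what the
# API actually returns.
def wait_for_crawl(crawl_id, timeout_seconds=300, poll_interval=5):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        info = get_crawl_info(crawl_id)
        status = info['status']
        if status == 'completed':
            return info
        if status == 'failed':  # assumed terminal status; adjust as needed
            raise RuntimeError(f'Crawl {crawl_id} failed')
        time.sleep(poll_interval)
    raise TimeoutError(f'Crawl {crawl_id} did not complete within {timeout_seconds}s')

With a helper like this, the while True loop in crawl_with_search_query() can be replaced by a single wait_for_crawl(crawl_id) call.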