Crawl
Search for Relevant Pages
You can also pass a search query to refine a crawl and limit the results to the top N most relevant pages.
In this example, we initiate a crawl with the search query "contact us" and restrict it to the top 5 most relevant URLs on each page.
import requests
import time

# Define the API base URL and your API key
API_URL = 'https://api.olostep.com/v1'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Function to initiate a crawl
def initiate_crawl(data):
    response = requests.post(f'{API_URL}/crawls', headers=HEADERS, json=data)
    return response.json()

# Function to get crawl information
def get_crawl_info(crawl_id):
    response = requests.get(f'{API_URL}/crawls/{crawl_id}', headers=HEADERS)
    return response.json()

# Function to get the list of crawled pages, optionally filtered by a search query
def get_crawled_pages(crawl_id, search_query=None):
    params = {
        'search_query': search_query
    }
    response = requests.get(f'{API_URL}/crawls/{crawl_id}/list', headers=HEADERS, params=params)
    return response.json()

def crawl_with_search_query():
    data = {
        "start_url": "https://sugarbooandco.com",
        "max_pages": 100,
        "include_urls": ["/**"],
        "search_query": "contact us",  # look for URLs matching the query
        "top_n": 5  # only visit the top n matching URLs per page
    }
    crawl = initiate_crawl(data)
    crawl_id = crawl['id']
    print(crawl_id)

    # Poll until the crawl has finished
    while True:
        info = get_crawl_info(crawl_id)
        if info['status'] == 'completed':
            break
        time.sleep(5)

    crawled_pages = get_crawled_pages(crawl_id, search_query="contact us")
    print(crawled_pages['pages'])

# Run the example
crawl_with_search_query()
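The polling loop above waits indefinitely if the crawl never reaches the completed status. Below is a minimal sketch of a bounded variant that reuses the get_crawl_info helper defined above; note that the 'failed' terminal status checked here is an assumption for illustration, not a status confirmed by this example, so adjust it to whatever statuses your crawls actually report.

# Sketch of bounded polling, reusing get_crawl_info() and time from above.
# The 'failed' status string is an assumption and may differ from what the
# API actually returns.
def wait_for_crawl(crawl_id, timeout_seconds=300, poll_interval=5):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        info = get_crawl_info(crawl_id)
        status = info['status']
        if status == 'completed':
            return info
        if status == 'failed':  # assumed terminal status; adjust as needed
            raise RuntimeError(f'Crawl {crawl_id} failed')
        time.sleep(poll_interval)
    raise TimeoutError(f'Crawl {crawl_id} did not complete within {timeout_seconds}s')

With a helper like this, the while True loop in crawl_with_search_query() can be replaced by a single wait_for_crawl(crawl_id) call.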