The Batches endpoint allows you to process multiple URLs in a single batch, making it easier to manage large sets of data. You can start a batch of up to 10,000 URLs and retrieve the content after 5-7 minutes. You can also start many batches in parallel to process up to 1 million requests in around 30 minutes.

Initiating a Batch

To initiate a new batch, specify the items to be processed, each with its `custom_id` and `url`. You can also provide optional parameters, such as the country and the parser to be used for the batch.

import json
import time

import requests

# Base URL of the API and the token used to authenticate against it
API_URL = "https://api.olostep.com"
API_KEY = "<your_token>"
# Headers sent with every request: JSON content type plus bearer-token auth
HEADERS = {"Content-Type": "application/json", "Authorization": f"Bearer {API_KEY}"}

# Function to initiate a batch
def initiate_batch(data, timeout=30):
    """Create a new batch and return the decoded JSON response.

    Args:
        data: JSON-serializable payload sent as the request body.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        f'{API_URL}/v1/batches',
        headers=HEADERS,
        json=data,
        timeout=timeout,  # without a timeout the call can block indefinitely
    )
    # Surface HTTP errors here instead of returning an error payload that the
    # caller would then index as if it were a successful response.
    response.raise_for_status()
    return response.json()

# Function to get batch information
def get_batch_info(batch_id, timeout=30):
    """Fetch the record of a single batch and return the decoded JSON response.

    Args:
        batch_id: Identifier of the batch, interpolated into the request path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(
        f'{API_URL}/v1/batches/{batch_id}',
        headers=HEADERS,
        timeout=timeout,  # without a timeout the call can block indefinitely
    )
    # Fail loudly on HTTP errors rather than handing back an error payload.
    response.raise_for_status()
    return response.json()

# Function to get the list of URLs in the batch
def get_batch_items(batch_id, status="completed", timeout=30):
    """Fetch the items of a batch, filtered by status, as decoded JSON.

    Args:
        batch_id: Identifier of the batch, interpolated into the request path.
        status: Value sent as the ``status`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(
        f'{API_URL}/v1/batches/{batch_id}/items',
        headers=HEADERS,
        params={'status': status},
        timeout=timeout,  # without a timeout the call can block indefinitely
    )
    # Fail loudly on HTTP errors rather than handing back an error payload.
    response.raise_for_status()
    return response.json()

# Function to retrieve content
def retrieve_content(retrieve_id, formats, timeout=30):
    """Fetch previously processed content and return the decoded JSON response.

    Args:
        retrieve_id: Value sent as the ``retrieve_id`` query parameter.
        formats: JSON-serializable list of requested formats; it is sent
            JSON-encoded in the ``formats`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "retrieve_id": retrieve_id,
        # Relies on the module-level ``import json``; the original script used
        # json.dumps without importing json and failed with NameError.
        "formats": json.dumps(formats),
    }
    response = requests.get(
        f"{API_URL}/v1/retrieve",
        headers=HEADERS,
        params=params,
        timeout=timeout,  # without a timeout the call can block indefinitely
    )
    # Fail loudly on HTTP errors rather than handing back an error payload.
    response.raise_for_status()
    return response.json()

# Request payload for the new batch: one entry per URL, each tagged with a
# caller-chosen custom_id
urls_by_custom_id = {
    "1": "https://github.com",
    "2": "https://github.com/features",
}
data = {
    "items": [
        {"custom_id": custom_id, "url": url}
        for custom_id, url in urls_by_custom_id.items()
    ]
}

# Initiate the batch and remember its id for the follow-up requests
batch = initiate_batch(data)
batch_id = batch['id']
print(batch_id)

# Wait for the batch to complete. Polling stops with an error once the
# deadline passes, so a batch that never reports 'completed' cannot keep the
# script looping forever.
POLL_INTERVAL_SECONDS = 5
MAX_WAIT_SECONDS = 60 * 60
deadline = time.monotonic() + MAX_WAIT_SECONDS  # monotonic: immune to clock changes
while True:
    info = get_batch_info(batch_id)
    if info['status'] == 'completed':
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Batch {batch_id} did not complete within {MAX_WAIT_SECONDS} seconds "
            f"(last status: {info['status']})"
        )
    time.sleep(POLL_INTERVAL_SECONDS)

# Ask the API for the items of the batch whose status is "completed"
batch_items = get_batch_items(batch_id, "completed")

# The same formats are requested for every item
formats = ["html", "markdown"]

# Show each item's identifiers, then fetch and print its content
for item in batch_items['items']:
    print(f"CustomID: {item['custom_id']}")
    print(f"URL: {item['url']}")
    retrieve_id = item['retrieve_id']
    print(f"Retrieve ID: {item['retrieve_id']}")

    retrieved_data = retrieve_content(retrieve_id, formats)

    # An empty/falsy response means there is nothing to show for this item
    if not retrieved_data:
        print("Failed to retrieve content")
        continue

    html_content = retrieved_data.get("html_content")
    markdown_content = retrieved_data.get("markdown_content")
    print(f"HTML Content:\n{html_content}")
    print(f"Markdown Content:\n{markdown_content}")