The Batches endpoint allows you to process multiple URLs in a single batch, making it easier to manage large sets of data.
You can start a batch of up to 10k URLs and retrieve the content after 5-7 minutes. You can start many batches in parallel to process up to 1 million requests in around 30 minutes.
To initiate a new batch, specify the items to be processed, each with its custom_id and url. You can also provide optional parameters such as the country and parser to be used for the batch.
import json
import time

import requests

# Define the API URL and your API key
API_URL = 'https://api.olostep.com'
API_KEY = '<your_token>'

# Set the headers for the requests
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}',
}


# Function to initiate a batch
def initiate_batch(data):
    """POST ``data`` as JSON to ``/v1/batches`` and return the decoded JSON response.

    ``data`` is the request payload; the example below passes a dict with an
    "items" list whose entries carry "custom_id" and "url".
    """
    response = requests.post(f'{API_URL}/v1/batches', headers=HEADERS, json=data)
    return response.json()


# Function to get batch information
def get_batch_info(batch_id):
    """GET ``/v1/batches/{batch_id}`` and return the decoded JSON response."""
    response = requests.get(f'{API_URL}/v1/batches/{batch_id}', headers=HEADERS)
    return response.json()


# Function to get the list of URLs in the batch
def get_batch_items(batch_id, status="completed"):
    """GET ``/v1/batches/{batch_id}/items`` and return the decoded JSON response.

    ``status`` is sent as the ``status`` query parameter.
    """
    params = {'status': status}
    response = requests.get(
        f'{API_URL}/v1/batches/{batch_id}/items',
        headers=HEADERS,
        params=params,
    )
    return response.json()


# Function to retrieve content
def retrieve_content(retrieve_id, formats):
    """GET ``/v1/retrieve`` for ``retrieve_id`` and return the decoded JSON response.

    ``formats`` is serialized with ``json.dumps`` so the whole list travels as
    a single JSON-encoded query-string value.
    """
    api_url = f"{API_URL}/v1/retrieve"
    params = {
        "retrieve_id": retrieve_id,
        "formats": json.dumps(formats),
    }
    response = requests.get(api_url, headers=HEADERS, params=params)
    return response.json()


# Data for initiating the batch
data = {
    "items": [
        {"custom_id": "1", "url": "https://github.com"},
        {"custom_id": "2", "url": "https://github.com/features"},
    ]
}

# Initiate the batch
batch = initiate_batch(data)
batch_id = batch['id']
print(batch_id)

# Wait for the batch to complete
# NOTE: polls every 5 seconds and only exits once status == 'completed';
# there is no timeout, so interrupt manually if the batch never completes.
while True:
    info = get_batch_info(batch_id)
    if info['status'] == 'completed':
        break
    time.sleep(5)

# Get the list of completed URLs
batch_items = get_batch_items(batch_id, "completed")

# Print the content of the URLs in the batch
for item in batch_items['items']:
    print(f"CustomID: {item['custom_id']}")
    print(f"URL: {item['url']}")
    print(f"Retrieve ID: {item['retrieve_id']}")

    retrieve_id = item['retrieve_id']
    formats = ["html", "markdown"]
    retrieved_data = retrieve_content(retrieve_id, formats)

    if retrieved_data:
        # .get() yields None (printed as "None") when a key is absent.
        html_content = retrieved_data.get("html_content")
        markdown_content = retrieved_data.get("markdown_content")
        print(f"HTML Content:\n{html_content}")
        print(f"Markdown Content:\n{markdown_content}")
    else:
        print("Failed to retrieve content")