import requests
import time
import json
from datetime import datetime
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
# Configuration
API_URL = 'https://api.olostep.com/v1'  # Olostep REST API base URL
API_KEY = '<your_olostep_api_key>'  # NOTE(review): placeholder — inject a real key, do not commit secrets
# Headers sent with every API request below (bearer-token auth).
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}
# Function to retrieve content with markdown format
def retrieve_content(retrieve_id, formats):
    """Fetch the stored content of one crawled page from the API.

    Args:
        retrieve_id: The retrieve ID returned by the crawl's pages listing.
        formats: List of requested format names, e.g. ``["markdown"]``.

    Returns:
        dict: The decoded JSON response body.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        requests.Timeout: If the request exceeds the 30-second timeout.
    """
    params = {
        "retrieve_id": retrieve_id,
        # The API expects the formats list JSON-encoded into the query string.
        "formats": json.dumps(formats)
    }
    # timeout= keeps a stalled connection from hanging a thread-pool worker
    # forever; raise_for_status() surfaces HTTP errors explicitly instead of
    # letting .json() fail opaquely (or return an error payload the caller
    # would misread as content).
    response = requests.get(
        f"{API_URL}/retrieve",
        headers=HEADERS,
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
# Continuing from the previous crawl example: `status_data` and `crawl_id`
# come from the crawl-status polling loop above — TODO confirm against the
# preceding section of this script.
if status_data['status'] == 'completed':
    print(f"\nCrawl completed! Retrieved {status_data['pages_count']} pages.")

    # List the crawled pages; each entry carries a retrieve_id used to fetch
    # that page's stored content.
    pages_response = requests.get(f'{API_URL}/crawls/{crawl_id}/pages', headers=HEADERS)
    pages_data = pages_response.json()

    # Create output directory if it doesn't exist
    os.makedirs("output", exist_ok=True)

    # Prepare to collect markdown content
    markdown_pages = []
    total_pages = len(pages_data['pages'])

    # Retrieve markdown for all pages in parallel — the work is network-bound,
    # so threads overlap the waits.
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its page so results can be labelled.
        future_to_page = {
            executor.submit(retrieve_content, page['retrieve_id'], ["markdown"]): page
            for page in pages_data['pages']
        }
        # Process results as they complete
        for i, future in enumerate(as_completed(future_to_page), 1):
            page = future_to_page[future]
            url = page['url']
            print(f"Processing {i}/{total_pages}: {url}")
            try:
                content_data = future.result()
                if content_data and "markdown_content" in content_data:
                    markdown_pages.append({
                        'url': url,
                        # Some pages may lack a title; fall back to the URL so
                        # the header written below is never empty and a missing
                        # key cannot abort the whole run.
                        'title': page.get('title') or url,
                        'markdown_content': content_data['markdown_content']
                    })
                    print(f"✓ Markdown content retrieved for {url}")
                else:
                    print(f"⚠ No markdown content for {url}")
            except Exception as e:
                # One failed page should not stop the rest of the batch.
                print(f"❌ Error retrieving content for {url}: {str(e)}")

    # Save all markdown content to a single file
    output_file = "output/stripe_blog_markdown.md"
    with open(output_file, "w", encoding="utf-8") as f:
        for page in markdown_pages:
            # Write page header with title and URL (the title was collected
            # above but previously never written out).
            f.write(f"# {page['title']}\n\n")
            f.write(f"URL: {page['url']}\n\n")
            # Write the markdown content
            f.write(f"{page['markdown_content']}\n\n")
            # Add separator between pages
            f.write("---\n\n")
            print(f"✓ Added markdown content from {page['url']}")

    print(f"\n✅ Process complete! All markdown content has been saved to '{output_file}'")
    print(f"Total pages processed: {len(markdown_pages)}")
else:
    print(f"Crawl failed with status: {status_data['status']}")