跳转到主要内容
对于结构变化的网站或一次性提取需求,Olostep 提供了基于 LLM 的提取方法。此方法:
  • 将内容输入到大型语言模型
  • 指导模型解析并仅返回指定的数据
  • 返回一个包含所需内容的干净 JSON 结构
import requests
import json

def extract_with_llm():
    """Scrape an event page and extract structured fields via a JSON schema.

    Sends a POST request to the Olostep scrapes endpoint with an
    ``llm_extract.schema`` describing the wanted ``event`` object, prints the
    decoded response, and returns it.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    url = "https://api.olostep.com/v1/scrapes"

    headers = {
        "Authorization": "Bearer <API_KEY>",
        "Content-Type": "application/json"
    }

    data = {
        "url_to_scrape": "https://www.berklee.edu/events/stefano-marchese-friends",
        # Request both the markdown rendering and the JSON extraction.
        "formats": [
            "markdown",
            "json"
        ],
        "llm_extract": {
            # Shape of the object the LLM is asked to fill in.
            "schema": {
                "event": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "date": {"type": "string"},
                        "description": {"type": "string"},
                        "venue": {"type": "string"},
                        "address": {"type": "string"},
                        "start_time": {"type": "string"}
                    }
                }
            }
        },
    }

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection; the value is generous because the scrape may be slow.
    response = requests.post(url, headers=headers, json=data, timeout=120)
    # Surface auth/validation/server errors instead of decoding an error body
    # as if it were a successful scrape.
    response.raise_for_status()
    result = response.json()

    # The LLM extraction output is available inside `result`.
    print(json.dumps(result, indent=2))

    return result

# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    extract_with_llm()
你可以向 LLM 传递 schema 或 prompt(二选一)。

import requests
import json

def extract_with_llm():
    """Scrape an event page and extract data via a natural-language prompt.

    Sends a POST request to the Olostep scrapes endpoint with an
    ``llm_extract.prompt`` instead of a schema, prints the decoded response,
    and returns it.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    url = "https://api.olostep.com/v1/scrapes"

    headers = {
        "Authorization": "Bearer <API_KEY>",
        "Content-Type": "application/json"
    }

    data = {
        "url_to_scrape": "https://www.berklee.edu/events/stefano-marchese-friends",
        # Request both the markdown rendering and the JSON extraction.
        "formats": [
            "markdown",
            "json"
        ],
        "llm_extract": {
            # Free-form instruction used in place of a schema.
            "prompt": "从活动页面提取活动标题、日期、描述、场地、地址和开始时间。"
        },
    }

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection; the value is generous because the scrape may be slow.
    response = requests.post(url, headers=headers, json=data, timeout=120)
    # Surface auth/validation/server errors instead of decoding an error body
    # as if it were a successful scrape.
    response.raise_for_status()
    result = response.json()

    # The LLM extraction output is available inside `result`.
    print(json.dumps(result, indent=2))

    return result

# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    extract_with_llm()
prompt 是一个自然语言字符串,传递给 LLM 用于提取数据。LLM 根据提示决定如何提取数据。当你不想使用 schema 时,可以使用此方法。 示例响应:
{
    "id": "scrape_94iqy385ty",
    ...
    "result": {
        "json_content":  "{\"event\":{\"title\":\"Stefano Marchese and Friends\",\"date\":\"Wednesday / January 22, 2025\",\"description\":\"Join acclaimed Italian singer-songwriter and educator Stefano Marchese for an unforgettable evening of musical magic as he takes the stage alongside a constellation of extraordinary talent in a concert titled Concerto di Duetti.\",\"venue\":\"David Friend Recital Hall (DFRH)\",\"address\":\"921 Boylston Street Boston MA 02115 United States\",\"start_time\":\"7:30 p.m. (EST)\"}}"
    }
}
json_content(位于响应的 result 字段下)是事件的字符串化 JSON 内容。你可以通过解析该字符串将其作为 JSON 对象访问。
import json

# Assumes `result` is the full API response (e.g. the value returned by
# extract_with_llm()). The stringified payload is nested under the response's
# "result" field, so it has to be reached through that key before parsing.
event = json.loads(result["result"]["json_content"])
print(event["event"]["title"])