创建抓取

启动网页抓取

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

url = "https://api.olostep.com/v1/scrapes"

payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [
        {
            "type": "wait",
            "milliseconds": 1
        }
    ],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": { "schema": {} },
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"]
    },
    "screen_size": {
        "screen_width": 123,
        "screen_height": 123
    },
    "screenshot": { "full_page": True },
    "metadata": {},
    "max_age": 0
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    url_to_scrape: '<string>',
    wait_before_scraping: 123,
    formats: [],
    actions: [{type: 'wait', milliseconds: 1}],
    country: '<string>',
    remove_images: false,
    remove_class_names: ['<string>'],
    llm_extract: {schema: {}},
    links_on_page: {
      query_to_order_links_by: '<string>',
      include_links: ['<string>'],
      exclude_links: ['<string>']
    },
    screen_size: {screen_width: 123, screen_height: 123},
    screenshot: {full_page: true},
    metadata: {},
    max_age: 0
  })
};

fetch('https://api.olostep.com/v1/scrapes', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    'formats' => [
        
    ],
    'actions' => [
        [
                'type' => 'wait',
                'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        'schema' => [
                
        ]
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
                '<string>'
        ],
        'exclude_links' => [
                '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    'metadata' => [
        
    ],
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request to the Olostep scrapes endpoint with a sample
// JSON payload and prints the raw response body to standard output.
//
// Every step that can fail is checked: on failure a short message is printed
// and the program returns instead of continuing with a nil request/response.
func main() {
	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be touched below.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.olostep.com/v1/scrapes")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

response = http.request(request)
puts response.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

POST

scrapes

启动网页抓取

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

url = "https://api.olostep.com/v1/scrapes"

payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [
        {
            "type": "wait",
            "milliseconds": 1
        }
    ],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": { "schema": {} },
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"]
    },
    "screen_size": {
        "screen_width": 123,
        "screen_height": 123
    },
    "screenshot": { "full_page": True },
    "metadata": {},
    "max_age": 0
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    url_to_scrape: '<string>',
    wait_before_scraping: 123,
    formats: [],
    actions: [{type: 'wait', milliseconds: 1}],
    country: '<string>',
    remove_images: false,
    remove_class_names: ['<string>'],
    llm_extract: {schema: {}},
    links_on_page: {
      query_to_order_links_by: '<string>',
      include_links: ['<string>'],
      exclude_links: ['<string>']
    },
    screen_size: {screen_width: 123, screen_height: 123},
    screenshot: {full_page: true},
    metadata: {},
    max_age: 0
  })
};

fetch('https://api.olostep.com/v1/scrapes', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    'formats' => [
        
    ],
    'actions' => [
        [
                'type' => 'wait',
                'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        'schema' => [
                
        ]
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
                '<string>'
        ],
        'exclude_links' => [
                '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    'metadata' => [
        
    ],
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request to the Olostep scrapes endpoint with a sample
// JSON payload and prints the raw response body to standard output.
//
// Every step that can fail is checked: on failure a short message is printed
// and the program returns instead of continuing with a nil request/response.
func main() {
	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be touched below.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.olostep.com/v1/scrapes")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

response = http.request(request)
puts response.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

可选缓存： 传递 max_age（以秒为单位）以重用具有相同参数的最近抓取，而不是再次获取页面。默认值为 0（始终最新）。在仪表板游乐场中，默认值为24小时。详情请参见缓存。

授权

Authorization

string

header

必填

Bearer 认证头，格式为 Bearer <token>，其中 <token> 是你的认证令牌。

请求体

application/json

url_to_scrape

string<uri>

必填

开始抓取的URL。

wait_before_scraping

integer

在开始抓取之前等待的时间，以毫秒为单位。

formats

enum<string>[]

你想要内容的格式。

可用选项:

html,

markdown,

text,

json,

raw_pdf,

screenshot

remove_css_selectors

enum<string>

选择从内容中移除某些CSS选择器。你也可以传递一个JSON字符串化的数组，指定你想要移除的特定选择器。当此选项设置为默认时，移除的CSS选择器为 ['nav','footer','script','style','noscript','svg',[role=alert],[role=banner],[role=dialog],[role=alertdialog],[role=region][aria-label*=skip i],[aria-modal=true]]

可用选项:

default,

none,

array

actions

(等待 · object | 点击 · object | 填写输入 · object | 滚动 · object)[]

在获取内容之前对页面执行的操作。

等待
点击
填写输入
滚动

Show child attributes

country

string

加载请求的住宅国家。支持的值有： - US (United States) - CA (Canada) - IT (Italy) - IN (India) - GB (England) - JP (Japan) - MX (Mexico) - AU (Australia) - ID (Indonesia) - UA (UAE) - RU (Russia) - RANDOM 一些操作，如抓取Google搜索和Google新闻，支持所有国家。

transformer

enum<string>

指定要使用的HTML转换器（如果有）。使用Postlight的Mercury Parser库来移除广告和其他不需要的内容。

可用选项:

postlight,

none

remove_images

boolean

默认值:false

选择从抓取的内容中移除图像。默认为false。

remove_class_names

string[]

要从内容中移除的类名列表。

parser

object

当定义json作为格式时，你可以使用此参数指定要使用的解析器。解析器对于从网页中提取结构化内容非常有用。Olostep为大多数常见网页内置了一些解析器，你也可以创建自己的解析器。

Show child attributes

llm_extract

object

Show child attributes

links_on_page

object

使用此选项，你可以获取你抓取的页面上存在的所有链接。链接始终以绝对URL返回。

Show child attributes

screen_size

object

屏幕尺寸配置。通过 screen_type 可用的预设尺寸有：desktop (1920x1080)、mobile (414x896) 或 default (768x1024)。

Show child attributes

screenshot

object

Show child attributes

metadata

object

用户定义的元数据。尚不支持。

max_age

integer

默认值:0

缓存内容的最大可接受年龄，以秒为单位。当已存在的抓取匹配并且比 max_age 秒更新时，Olostep 返回存储的结果，而不是启动新的浏览器抓取。默认值为 0（始终抓取最新）。在仪表板游乐场中，默认值为 86400（24 小时）。允许的最大值为 604800（7 天）。有关详细信息，请参阅抓取功能文档中的缓存部分。

必填范围: x >= 0

响应

成功响应，包含抓取启动的详细信息。

id

string

抓取 ID

object

string

对象的种类。此端点为 "scrape"。

created

number

创建的纪元时间

metadata

object

用户定义的元数据。

url_to_scrape

string

被抓取的 URL。

result

object

Show child attributes

credits_consumed

integer | null

此请求消耗的积分数量。在执行完成后填充。计费以积分为准。

cost_usd

number | null

此请求的估计成本（以美元计）。在执行完成后填充。根据消耗的积分和你的计划费率计算——99% 准确，但 credits_consumed 是权威值。

Webhooks 获取抓取

常用

抓取

批次

抓取

地图

答案

搜索

显示器

文件

日程安排

检索

余额与账单

授权

请求体

响应