抓取页面 - Olostep Docs

获取已爬取页面的列表，可选择包含内容

# List the pages of a crawl. {crawl_id} and <token> are placeholders to be
# replaced with a real crawl ID and API token.
curl --request GET \
  --url https://api.olostep.com/v1/crawls/{crawl_id}/pages \
  --header 'Authorization: Bearer <token>'

// GET request options carrying the bearer token in the Authorization header.
const options = {method: 'GET', headers: {Authorization: 'Bearer <token>'}};

// Request the crawl's page list, parse the JSON body and log it. Any failure
// along the way (rejected fetch, unparsable body) is reported via console.error.
(async () => {
  try {
    const response = await fetch('https://api.olostep.com/v1/crawls/{crawl_id}/pages', options);
    const payload = await response.json();
    console.log(payload);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Endpoint listing the pages of a crawl, and the bearer-token header sent with it.
$endpoint = "https://api.olostep.com/v1/crawls/{crawl_id}/pages";
$headers = [
  "Authorization: Bearer <token>"
];

$handle = curl_init();

// Configure a GET request whose response body is returned as a string
// instead of being written straight to the output.
curl_setopt_array($handle, [
  CURLOPT_URL => $endpoint,
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "GET",
  CURLOPT_HTTPHEADER => $headers,
]);

$body = curl_exec($handle);
$failure = curl_error($handle);

curl_close($handle);

// Print the cURL error message when there is one, otherwise the response body.
echo $failure ? "cURL Error #:" . $failure : $body;

package main

import (
	"fmt"
	"net/http"
	"io"
)

// main requests the list of pages for a crawl and prints the raw response
// body to standard output.
//
// {crawl_id} in the URL and <token> in the Authorization header are
// placeholders. Any error while building the request, sending it, or reading
// the response is printed and the program returns without touching the
// (possibly nil) response.
func main() {
	url := "https://api.olostep.com/v1/crawls/{crawl_id}/pages"

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}
	req.Header.Add("Authorization", "Bearer <token>")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so deferring res.Body.Close() before this
		// check would panic with a nil-pointer dereference.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

require 'uri'
require 'net/http'

# Build the GET request for the crawl's page list and attach the bearer token.
endpoint = URI("https://api.olostep.com/v1/crawls/{crawl_id}/pages")
get = Net::HTTP::Get.new(endpoint)
get["Authorization"] = 'Bearer <token>'

# Open the connection to the endpoint's host and port with TLS enabled.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

# Send the request and print the raw response body.
puts client.request(get).read_body

{
  "crawl_id": "<string>",
  "object": "<string>",
  "status": "<string>",
  "search_query": "<string>",
  "pages_count": 123,
  "pages": [
    {
      "id": "<string>",
      "retrieve_id": "<string>",
      "url": "<string>",
      "is_external": true,
      "html_content": "<string>",
      "markdown_content": "<string>"
    }
  ],
  "metadata": {
    "external_urls": [
      "<string>"
    ],
    "failed_urls": [
      "<string>"
    ]
  },
  "cursor": 123
}

GET /v1/crawls/{crawl_id}/pages

获取已爬取页面的列表，可选择包含内容

# List the pages of a crawl. {crawl_id} and <token> are placeholders to be
# replaced with a real crawl ID and API token.
curl --request GET \
  --url https://api.olostep.com/v1/crawls/{crawl_id}/pages \
  --header 'Authorization: Bearer <token>'

// GET request options carrying the bearer token in the Authorization header.
const options = {method: 'GET', headers: {Authorization: 'Bearer <token>'}};

// Request the crawl's page list, parse the JSON body and log it. Any failure
// along the way (rejected fetch, unparsable body) is reported via console.error.
(async () => {
  try {
    const response = await fetch('https://api.olostep.com/v1/crawls/{crawl_id}/pages', options);
    const payload = await response.json();
    console.log(payload);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Endpoint listing the pages of a crawl, and the bearer-token header sent with it.
$endpoint = "https://api.olostep.com/v1/crawls/{crawl_id}/pages";
$headers = [
  "Authorization: Bearer <token>"
];

$handle = curl_init();

// Configure a GET request whose response body is returned as a string
// instead of being written straight to the output.
curl_setopt_array($handle, [
  CURLOPT_URL => $endpoint,
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "GET",
  CURLOPT_HTTPHEADER => $headers,
]);

$body = curl_exec($handle);
$failure = curl_error($handle);

curl_close($handle);

// Print the cURL error message when there is one, otherwise the response body.
echo $failure ? "cURL Error #:" . $failure : $body;

package main

import (
	"fmt"
	"net/http"
	"io"
)

// main requests the list of pages for a crawl and prints the raw response
// body to standard output.
//
// {crawl_id} in the URL and <token> in the Authorization header are
// placeholders. Any error while building the request, sending it, or reading
// the response is printed and the program returns without touching the
// (possibly nil) response.
func main() {
	url := "https://api.olostep.com/v1/crawls/{crawl_id}/pages"

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}
	req.Header.Add("Authorization", "Bearer <token>")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so deferring res.Body.Close() before this
		// check would panic with a nil-pointer dereference.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

require 'uri'
require 'net/http'

# Build the GET request for the crawl's page list and attach the bearer token.
endpoint = URI("https://api.olostep.com/v1/crawls/{crawl_id}/pages")
get = Net::HTTP::Get.new(endpoint)
get["Authorization"] = 'Bearer <token>'

# Open the connection to the endpoint's host and port with TLS enabled.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

# Send the request and print the raw response body.
puts client.request(get).read_body

{
  "crawl_id": "<string>",
  "object": "<string>",
  "status": "<string>",
  "search_query": "<string>",
  "pages_count": 123,
  "pages": [
    {
      "id": "<string>",
      "retrieve_id": "<string>",
      "url": "<string>",
      "is_external": true,
      "html_content": "<string>",
      "markdown_content": "<string>"
    }
  ],
  "metadata": {
    "external_urls": [
      "<string>"
    ],
    "failed_urls": [
      "<string>"
    ]
  },
  "cursor": 123
}

授权

Authorization

string

header

必填

Bearer 认证头，格式为 Bearer <token>，其中 <token> 是你的认证令牌。

路径参数

crawl_id

string

必填

要获取URL列表的爬取ID。

查询参数

cursor

integer

可选整数，表示开始获取内容的索引。用于分页，直到所有URL都被获取。从0开始，然后提供上次请求的 response['cursor'] 值。

limit

integer

可选整数，用于限制返回结果的数量。建议每次10-50个结果。使用cursor分页。单个请求最多可获取10MB的内容。

search_query

string

可选的搜索查询，用于按相关性排序结果。如果未指定，默认使用创建爬取时提供的原始 search_query（如有）。

formats

enum<string>[]

已弃用：使用带有 retrieve_id 的 /retrieve 端点。要获取的格式数组（例如，["html", "markdown"]）。

可用选项:

html,

markdown

响应

成功响应，包含URL列表。

crawl_id

string

爬取 ID

object

string

对象的类型。此端点为 "crawl"。

status

string

in_progress 或 completed

search_query

string

pages_count

number

pages

object[]

Show child attributes

metadata

object

Show child attributes

cursor

integer

在下一次请求的查询参数中传入该值，以获取下一批结果。

抓取信息创建地图