クロールの作成

新しいクローラーを開始

# Start a new crawl: POST the crawl configuration as JSON, authenticated with a Bearer token.
curl -X POST 'https://api.olostep.com/v1/crawls' \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "start_url": "<string>",
  "max_pages": 123,
  "include_urls": [
    "<string>"
  ],
  "exclude_urls": [
    "<string>"
  ],
  "max_depth": 123,
  "include_external": true,
  "include_subdomain": true,
  "search_query": "<string>",
  "top_n": 123,
  "webhook": "<string>",
  "timeout": 123,
  "follow_robots_txt": true,
  "scrape_options": {
    "formats": [
      "markdown",
      "screenshot"
    ],
    "parser": "@olostep/extract-emails"
  }
}
'

import requests

# Endpoint that starts a new crawl.
url = "https://api.olostep.com/v1/crawls"

# Crawl configuration sent as the JSON request body (placeholder values).
payload = {
    "start_url": "<string>",
    "max_pages": 123,
    "include_urls": ["<string>"],
    "exclude_urls": ["<string>"],
    "max_depth": 123,
    "include_external": True,
    "include_subdomain": True,
    "search_query": "<string>",
    "top_n": 123,
    "webhook": "<string>",
    "timeout": 123,
    "follow_robots_txt": True,
    "scrape_options": {
        "formats": ["markdown", "screenshot"],
        "parser": "@olostep/extract-emails"
    }
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout unless one is given, so bound the wait (in
# seconds) to keep the script from hanging on an unresponsive server.
response = requests.post(url, json=payload, headers=headers, timeout=30)

# Print the raw response body.
print(response.text)

// Crawl configuration sent as the JSON request body (placeholder values).
const payload = {
  start_url: '<string>',
  max_pages: 123,
  include_urls: ['<string>'],
  exclude_urls: ['<string>'],
  max_depth: 123,
  include_external: true,
  include_subdomain: true,
  search_query: '<string>',
  top_n: 123,
  webhook: '<string>',
  timeout: 123,
  follow_robots_txt: true,
  scrape_options: {formats: ['markdown', 'screenshot'], parser: '@olostep/extract-emails'}
};

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify(payload)
};

// Start the crawl and log the parsed JSON response, or the error if any step fails.
(async () => {
  try {
    const res = await fetch('https://api.olostep.com/v1/crawls', options);
    const data = await res.json();
    console.log(data);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Crawl configuration sent as the JSON request body (placeholder values).
$payload = [
  'start_url' => '<string>',
  'max_pages' => 123,
  'include_urls' => ['<string>'],
  'exclude_urls' => ['<string>'],
  'max_depth' => 123,
  'include_external' => true,
  'include_subdomain' => true,
  'search_query' => '<string>',
  'top_n' => 123,
  'webhook' => '<string>',
  'timeout' => 123,
  'follow_robots_txt' => true,
  'scrape_options' => [
    'formats' => ['markdown', 'screenshot'],
    'parser' => '@olostep/extract-emails'
  ]
];

$ch = curl_init();

// Configure a POST to the crawls endpoint that returns the body as a string.
curl_setopt_array($ch, [
  CURLOPT_URL => "https://api.olostep.com/v1/crawls",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode($payload),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$result = curl_exec($ch);
$error = curl_error($ch);

curl_close($ch);

// Print the transport error if there was one, otherwise the response body.
echo $error ? "cURL Error #:" . $error : $result;

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
	"time"
)

// main starts a new crawl and prints the raw response body. Any failure is
// logged and the program exits with a non-zero status.
func main() {
	if err := createCrawl(); err != nil {
		log.Fatal(err)
	}
}

// createCrawl POSTs the crawl configuration to the crawls endpoint and prints
// the response body to stdout. It returns an error if the request cannot be
// built or sent, if the body cannot be read, or if the server answers with a
// non-2xx status. It lives outside main so the deferred Body.Close still runs
// before main calls log.Fatal.
func createCrawl() error {
	url := "https://api.olostep.com/v1/crawls"

	// Crawl configuration sent as the JSON request body (placeholder values).
	payload := strings.NewReader("{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		return fmt.Errorf("building request: %w", err)
	}

	req.Header.Set("Authorization", "Bearer <token>")
	req.Header.Set("Content-Type", "application/json")

	// http.DefaultClient has no timeout; bound the call so it cannot hang forever.
	client := &http.Client{Timeout: 30 * time.Second}

	res, err := client.Do(req)
	if err != nil {
		// res is nil here, so it must not be touched.
		return fmt.Errorf("sending request: %w", err)
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("reading response body: %w", err)
	}

	// Print the body even on an error status: it carries the server's response.
	fmt.Println(string(body))

	if res.StatusCode < 200 || res.StatusCode > 299 {
		return fmt.Errorf("unexpected status %s", res.Status)
	}
	return nil
}

// Crawl configuration sent as the JSON request body (placeholder values).
String payload = "{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}";

// Start a new crawl and read the response body as a string.
HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/crawls")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(payload)
  .asString();

require 'uri'
require 'net/http'

# Endpoint that starts a new crawl.
endpoint = URI("https://api.olostep.com/v1/crawls")

# Build the POST request: Bearer auth, JSON content type, crawl configuration as the body.
post = Net::HTTP::Post.new(endpoint)
post["Authorization"] = 'Bearer <token>'
post["Content-Type"] = 'application/json'
post.body = "{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}"

# Send the request over TLS and print the response body.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

reply = client.request(post)
puts reply.read_body

{
  "id": "<string>",
  "object": "<string>",
  "status": "<string>",
  "created": 123,
  "start_date": "<string>",
  "start_url": "<string>",
  "max_pages": 123,
  "max_depth": 123,
  "exclude_urls": [
    "<string>"
  ],
  "include_urls": [
    "<string>"
  ],
  "include_external": true,
  "search_query": "<string>",
  "top_n": 123,
  "current_depth": 123,
  "pages_count": 123,
  "webhook": "<string>",
  "follow_robots_txt": true,
  "credits_consumed": 123,
  "cost_usd": 123
}

POST

crawls

新しいクローラーを開始

# Start a new crawl: POST the crawl configuration as JSON, authenticated with a Bearer token.
curl -X POST 'https://api.olostep.com/v1/crawls' \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "start_url": "<string>",
  "max_pages": 123,
  "include_urls": [
    "<string>"
  ],
  "exclude_urls": [
    "<string>"
  ],
  "max_depth": 123,
  "include_external": true,
  "include_subdomain": true,
  "search_query": "<string>",
  "top_n": 123,
  "webhook": "<string>",
  "timeout": 123,
  "follow_robots_txt": true,
  "scrape_options": {
    "formats": [
      "markdown",
      "screenshot"
    ],
    "parser": "@olostep/extract-emails"
  }
}
'

import requests

# Endpoint that starts a new crawl.
url = "https://api.olostep.com/v1/crawls"

# Crawl configuration sent as the JSON request body (placeholder values).
payload = {
    "start_url": "<string>",
    "max_pages": 123,
    "include_urls": ["<string>"],
    "exclude_urls": ["<string>"],
    "max_depth": 123,
    "include_external": True,
    "include_subdomain": True,
    "search_query": "<string>",
    "top_n": 123,
    "webhook": "<string>",
    "timeout": 123,
    "follow_robots_txt": True,
    "scrape_options": {
        "formats": ["markdown", "screenshot"],
        "parser": "@olostep/extract-emails"
    }
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout unless one is given, so bound the wait (in
# seconds) to keep the script from hanging on an unresponsive server.
response = requests.post(url, json=payload, headers=headers, timeout=30)

# Print the raw response body.
print(response.text)

// Crawl configuration sent as the JSON request body (placeholder values).
const payload = {
  start_url: '<string>',
  max_pages: 123,
  include_urls: ['<string>'],
  exclude_urls: ['<string>'],
  max_depth: 123,
  include_external: true,
  include_subdomain: true,
  search_query: '<string>',
  top_n: 123,
  webhook: '<string>',
  timeout: 123,
  follow_robots_txt: true,
  scrape_options: {formats: ['markdown', 'screenshot'], parser: '@olostep/extract-emails'}
};

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify(payload)
};

// Start the crawl and log the parsed JSON response, or the error if any step fails.
(async () => {
  try {
    const res = await fetch('https://api.olostep.com/v1/crawls', options);
    const data = await res.json();
    console.log(data);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Crawl configuration sent as the JSON request body (placeholder values).
$payload = [
  'start_url' => '<string>',
  'max_pages' => 123,
  'include_urls' => ['<string>'],
  'exclude_urls' => ['<string>'],
  'max_depth' => 123,
  'include_external' => true,
  'include_subdomain' => true,
  'search_query' => '<string>',
  'top_n' => 123,
  'webhook' => '<string>',
  'timeout' => 123,
  'follow_robots_txt' => true,
  'scrape_options' => [
    'formats' => ['markdown', 'screenshot'],
    'parser' => '@olostep/extract-emails'
  ]
];

$ch = curl_init();

// Configure a POST to the crawls endpoint that returns the body as a string.
curl_setopt_array($ch, [
  CURLOPT_URL => "https://api.olostep.com/v1/crawls",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode($payload),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$result = curl_exec($ch);
$error = curl_error($ch);

curl_close($ch);

// Print the transport error if there was one, otherwise the response body.
echo $error ? "cURL Error #:" . $error : $result;

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
	"time"
)

// main starts a new crawl and prints the raw response body. Any failure is
// logged and the program exits with a non-zero status.
func main() {
	if err := createCrawl(); err != nil {
		log.Fatal(err)
	}
}

// createCrawl POSTs the crawl configuration to the crawls endpoint and prints
// the response body to stdout. It returns an error if the request cannot be
// built or sent, if the body cannot be read, or if the server answers with a
// non-2xx status. It lives outside main so the deferred Body.Close still runs
// before main calls log.Fatal.
func createCrawl() error {
	url := "https://api.olostep.com/v1/crawls"

	// Crawl configuration sent as the JSON request body (placeholder values).
	payload := strings.NewReader("{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		return fmt.Errorf("building request: %w", err)
	}

	req.Header.Set("Authorization", "Bearer <token>")
	req.Header.Set("Content-Type", "application/json")

	// http.DefaultClient has no timeout; bound the call so it cannot hang forever.
	client := &http.Client{Timeout: 30 * time.Second}

	res, err := client.Do(req)
	if err != nil {
		// res is nil here, so it must not be touched.
		return fmt.Errorf("sending request: %w", err)
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("reading response body: %w", err)
	}

	// Print the body even on an error status: it carries the server's response.
	fmt.Println(string(body))

	if res.StatusCode < 200 || res.StatusCode > 299 {
		return fmt.Errorf("unexpected status %s", res.Status)
	}
	return nil
}

// Crawl configuration sent as the JSON request body (placeholder values).
String payload = "{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}";

// Start a new crawl and read the response body as a string.
HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/crawls")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(payload)
  .asString();

require 'uri'
require 'net/http'

# Endpoint that starts a new crawl.
endpoint = URI("https://api.olostep.com/v1/crawls")

# Build the POST request: Bearer auth, JSON content type, crawl configuration as the body.
post = Net::HTTP::Post.new(endpoint)
post["Authorization"] = 'Bearer <token>'
post["Content-Type"] = 'application/json'
post.body = "{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}"

# Send the request over TLS and print the response body.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

reply = client.request(post)
puts reply.read_body

{
  "id": "<string>",
  "object": "<string>",
  "status": "<string>",
  "created": 123,
  "start_date": "<string>",
  "start_url": "<string>",
  "max_pages": 123,
  "max_depth": 123,
  "exclude_urls": [
    "<string>"
  ],
  "include_urls": [
    "<string>"
  ],
  "include_external": true,
  "search_query": "<string>",
  "top_n": 123,
  "current_depth": 123,
  "pages_count": 123,
  "webhook": "<string>",
  "follow_robots_txt": true,
  "credits_consumed": 123,
  "cost_usd": 123
}

完了時に通知を受け取る: webhook パラメータにあなたのエンドポイントURLを指定すると、クロールが完了したときにHTTP POSTを受け取れます。詳細はWebhooksを参照してください。

認証

Authorization

string

header

必須

Bearer <token> 形式のBearer認証ヘッダー。<token> はあなたの認証トークンです。

ボディ

application/json

start_url

string

必須

クローラーの開始地点。

max_pages

number

必須

クロールするページの最大数。ウェブサイト全体をクロールするようなほとんどのユースケースに推奨されます。

include_urls

string[]

glob構文で指定する、クロール対象に含めるURLパスのパターン。デフォルトは/**で、すべてのURLが対象になります。特定のセクションだけをクロールするには、/blog/**（ブログページのみ）や/products/*.html（商品ページ）のようなパターンを指定するか、複数のパターンを組み合わせて異なるセクションを対象にします。*（任意の文字）や**（再帰的マッチング）などの標準的なglob機能をサポートしています。

exclude_urls

string[]

globパターンで除外するURLパス名。例：/careers/**。除外されたURLは含まれるURLより優先されます。

max_depth

number

クロールの最大深度。n次のリンクまでのみ抽出するのに便利です。

include_external

boolean

一次外部リンクをクロールします。

include_subdomain

boolean

ウェブサイトのサブドメインを含めます。デフォルトはfalseです。

search_query

string

特定のリンクを見つけるためのオプションの検索クエリで、結果を関連性でソートします。

top_n

number

オプション。検索クエリに基づいて、各ページで最も関連性の高い上位N件のリンクのみをクロールします。

webhook

string<uri>

クロールが完了したときにPOSTリクエストを受け取るためのHTTPS URL。http://またはhttps://プロトコルを使用して公開アクセス可能なURLでなければなりません。localhostやプライベートIPアドレスを指すことはできません。ペイロード形式と再試行の動作についてはWebhooksを参照してください。

timeout

number

n秒後にクロールを終了し、その時点までに完了したページを取得します。指定されたタイムアウトから約10秒余分にかかることがあります。

follow_robots_txt

boolean

デフォルト:true

robots.txtのルールを尊重するかどうか。falseに設定すると、クローラーはrobots.txtの禁止指令に関係なくウェブサイトをスクレイプします。デフォルトはtrueです。

scrape_options

object

Olostep APIから各個別ページのスクレイプリクエストを制御します。すべてのフィールドはオプションです。

Show child attributes

レスポンス

クロールが正常に開始されました。

id

string

クロールID

object

string

オブジェクトの種類。このエンドポイントでは "crawl"。

status

string

in_progress または completed

created

number

エポックでの作成時間

start_date

string

日付での作成時間

start_url

string

max_pages

number

max_depth

number

exclude_urls

string[]

include_urls

string[]

include_external

boolean

search_query

string

top_n

number

current_depth

number

クローリングプロセスの現在の深さ。

pages_count

number

クロールされたページの数

webhook

string

follow_robots_txt

boolean

credits_consumed

integer | null

このリクエストで消費されたクレジットの数。実行完了後に設定されます。クレジットが請求の基準となります。

cost_usd

number | null

このリクエストのUSDでの推定コスト。実行完了後に設定されます。消費されたクレジットとプランのレートから計算されます — 精度は99%ですが、正確な値はcredits_consumedです。

バッチアイテム

クロール情報

共通

スクレイプ

バッチ

クロール

マップ

回答

検索

モニター

ファイル

スケジュール

取得

残高と請求書

認証

ボディ

レスポンス