Crawl Aanmaken

Start een nieuwe crawl

curl --request POST \
  --url https://api.olostep.com/v1/crawls \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "start_url": "<string>",
  "max_pages": 123,
  "include_urls": [
    "<string>"
  ],
  "exclude_urls": [
    "<string>"
  ],
  "max_depth": 123,
  "include_external": true,
  "include_subdomain": true,
  "search_query": "<string>",
  "top_n": 123,
  "webhook": "<string>",
  "timeout": 123,
  "follow_robots_txt": true,
  "scrape_options": {
    "formats": [
      "markdown",
      "screenshot"
    ],
    "parser": "@olostep/extract-emails"
  }
}
'

import requests

url = "https://api.olostep.com/v1/crawls"

payload = {
    "start_url": "<string>",
    "max_pages": 123,
    "include_urls": ["<string>"],
    "exclude_urls": ["<string>"],
    "max_depth": 123,
    "include_external": True,
    "include_subdomain": True,
    "search_query": "<string>",
    "top_n": 123,
    "webhook": "<string>",
    "timeout": 123,
    "follow_robots_txt": True,
    "scrape_options": {
        "formats": ["markdown", "screenshot"],
        "parser": "@olostep/extract-emails"
    }
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    start_url: '<string>',
    max_pages: 123,
    include_urls: ['<string>'],
    exclude_urls: ['<string>'],
    max_depth: 123,
    include_external: true,
    include_subdomain: true,
    search_query: '<string>',
    top_n: 123,
    webhook: '<string>',
    timeout: 123,
    follow_robots_txt: true,
    scrape_options: {formats: ['markdown', 'screenshot'], parser: '@olostep/extract-emails'}
  })
};

fetch('https://api.olostep.com/v1/crawls', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/crawls",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'start_url' => '<string>',
    'max_pages' => 123,
    'include_urls' => [
        '<string>'
    ],
    'exclude_urls' => [
        '<string>'
    ],
    'max_depth' => 123,
    'include_external' => true,
    'include_subdomain' => true,
    'search_query' => '<string>',
    'top_n' => 123,
    'webhook' => '<string>',
    'timeout' => 123,
    'follow_robots_txt' => true,
    'scrape_options' => [
        'formats' => [
                'markdown',
                'screenshot'
        ],
        'parser' => '@olostep/extract-emails'
    ]
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request with a sample JSON body to the crawls endpoint
// and prints the raw response body. Any failure while building, sending or
// reading the request is printed and the program returns early.
func main() {
	url := "https://api.olostep.com/v1/crawls"

	payload := strings.NewReader("{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so the body must not be touched here.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/crawls")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.olostep.com/v1/crawls")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}"

response = http.request(request)
puts response.read_body

{
  "id": "<string>",
  "object": "<string>",
  "status": "<string>",
  "created": 123,
  "start_date": "<string>",
  "start_url": "<string>",
  "max_pages": 123,
  "max_depth": 123,
  "exclude_urls": [
    "<string>"
  ],
  "include_urls": [
    "<string>"
  ],
  "include_external": true,
  "search_query": "<string>",
  "top_n": 123,
  "current_depth": 123,
  "pages_count": 123,
  "webhook": "<string>",
  "follow_robots_txt": true,
  "credits_consumed": 123,
  "cost_usd": 123
}

POST

crawls

Start een nieuwe crawl

curl --request POST \
  --url https://api.olostep.com/v1/crawls \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "start_url": "<string>",
  "max_pages": 123,
  "include_urls": [
    "<string>"
  ],
  "exclude_urls": [
    "<string>"
  ],
  "max_depth": 123,
  "include_external": true,
  "include_subdomain": true,
  "search_query": "<string>",
  "top_n": 123,
  "webhook": "<string>",
  "timeout": 123,
  "follow_robots_txt": true,
  "scrape_options": {
    "formats": [
      "markdown",
      "screenshot"
    ],
    "parser": "@olostep/extract-emails"
  }
}
'

import requests

url = "https://api.olostep.com/v1/crawls"

payload = {
    "start_url": "<string>",
    "max_pages": 123,
    "include_urls": ["<string>"],
    "exclude_urls": ["<string>"],
    "max_depth": 123,
    "include_external": True,
    "include_subdomain": True,
    "search_query": "<string>",
    "top_n": 123,
    "webhook": "<string>",
    "timeout": 123,
    "follow_robots_txt": True,
    "scrape_options": {
        "formats": ["markdown", "screenshot"],
        "parser": "@olostep/extract-emails"
    }
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    start_url: '<string>',
    max_pages: 123,
    include_urls: ['<string>'],
    exclude_urls: ['<string>'],
    max_depth: 123,
    include_external: true,
    include_subdomain: true,
    search_query: '<string>',
    top_n: 123,
    webhook: '<string>',
    timeout: 123,
    follow_robots_txt: true,
    scrape_options: {formats: ['markdown', 'screenshot'], parser: '@olostep/extract-emails'}
  })
};

fetch('https://api.olostep.com/v1/crawls', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/crawls",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'start_url' => '<string>',
    'max_pages' => 123,
    'include_urls' => [
        '<string>'
    ],
    'exclude_urls' => [
        '<string>'
    ],
    'max_depth' => 123,
    'include_external' => true,
    'include_subdomain' => true,
    'search_query' => '<string>',
    'top_n' => 123,
    'webhook' => '<string>',
    'timeout' => 123,
    'follow_robots_txt' => true,
    'scrape_options' => [
        'formats' => [
                'markdown',
                'screenshot'
        ],
        'parser' => '@olostep/extract-emails'
    ]
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request with a sample JSON body to the crawls endpoint
// and prints the raw response body. Any failure while building, sending or
// reading the request is printed and the program returns early.
func main() {
	url := "https://api.olostep.com/v1/crawls"

	payload := strings.NewReader("{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so the body must not be touched here.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))
}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/crawls")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.olostep.com/v1/crawls")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"start_url\": \"<string>\",\n  \"max_pages\": 123,\n  \"include_urls\": [\n    \"<string>\"\n  ],\n  \"exclude_urls\": [\n    \"<string>\"\n  ],\n  \"max_depth\": 123,\n  \"include_external\": true,\n  \"include_subdomain\": true,\n  \"search_query\": \"<string>\",\n  \"top_n\": 123,\n  \"webhook\": \"<string>\",\n  \"timeout\": 123,\n  \"follow_robots_txt\": true,\n  \"scrape_options\": {\n    \"formats\": [\n      \"markdown\",\n      \"screenshot\"\n    ],\n    \"parser\": \"@olostep/extract-emails\"\n  }\n}"

response = http.request(request)
puts response.read_body

{
  "id": "<string>",
  "object": "<string>",
  "status": "<string>",
  "created": 123,
  "start_date": "<string>",
  "start_url": "<string>",
  "max_pages": 123,
  "max_depth": 123,
  "exclude_urls": [
    "<string>"
  ],
  "include_urls": [
    "<string>"
  ],
  "include_external": true,
  "search_query": "<string>",
  "top_n": 123,
  "current_depth": 123,
  "pages_count": 123,
  "webhook": "<string>",
  "follow_robots_txt": true,
  "credits_consumed": 123,
  "cost_usd": 123
}

Word op de hoogte gebracht bij voltooiing: Geef de webhook parameter door met jouw endpoint URL om een HTTP POST te ontvangen wanneer de crawl voltooid is. Zie Webhooks voor details.

Autorisaties

Authorization

string

header

vereist

Bearer-authenticatieheader in de vorm Bearer <token>, waarbij <token> jouw auth-token is.

Body

application/json

start_url

string

vereist

Het startpunt van de crawl.

max_pages

number

vereist

Maximum aantal pagina's om te crawlen. Aanbevolen voor de meeste gebruikssituaties zoals het crawlen van een hele website.

include_urls

string[]

URL-padpatronen om op te nemen in de crawl met behulp van glob-syntaxis. Standaard ingesteld op /** wat alle URLs omvat. Gebruik patronen zoals /blog/** om specifieke secties te crawlen (bijv. alleen blogpagina's), /products/*.html voor productpagina's, of meerdere patronen voor verschillende secties. Ondersteunt standaard glob-functies zoals * (willekeurige tekens) en ** (recursieve matching).

exclude_urls

string[]

URL-padnamen als glob-patroon om uit te sluiten. Bijvoorbeeld: /careers/**. Uitgesloten URL's hebben voorrang op opgenomen URL's.

max_depth

number

Maximale diepte van de crawl. Handig om alleen links tot en met de n-de graad te extraheren.

include_external

boolean

Crawl eerste-graads externe links.

include_subdomain

boolean

Inclusief subdomeinen van de website. Standaard false.

search_query

string

Een optionele zoekopdracht om specifieke links te vinden en ook de resultaten te sorteren op relevantie.

top_n

number

Een optioneel aantal om alleen de top N meest relevante links op elke pagina te crawlen volgens de zoekopdracht.

webhook

string<uri>

URL om een POST-verzoek te ontvangen wanneer de crawl voltooid is. Moet een openbaar toegankelijke URL zijn die het http://- of https://-protocol gebruikt. Mag niet verwijzen naar localhost of privé-IP-adressen. Zie Webhooks voor het payloadformaat en het retry-gedrag.

timeout

number

Beëindig de crawl na n seconden met de tot dan toe voltooide pagina's. Kan ~10s extra duren vanaf de opgegeven timeout.

follow_robots_txt

boolean

standaard:true

Of de robots.txt-regels gerespecteerd moeten worden. Als ingesteld op false, zal de crawler de website scrapen ongeacht robots.txt disallow-richtlijnen. Standaard true.

scrape_options

object

Bepaalt wat elke afzonderlijke pagina-scrape opvraagt bij de Olostep API. Alle velden zijn optioneel.

Onderliggende attributen tonen

Respons

Crawl succesvol gestart.

id

string

Crawl ID

object

string

Het type object. "crawl" voor dit endpoint.

status

string

in_progress of completed

created

number

Aanmaaktijd als epoch-tijdstempel

start_date

string

Aanmaaktijd als datum

start_url

string

max_pages

number

max_depth

number

exclude_urls

string[]

include_urls

string[]

include_external

boolean

search_query

string

top_n

number

current_depth

number

De huidige diepte van het crawlproces.

pages_count

number

Aantal gecrawlde pagina's

webhook

string

follow_robots_txt

boolean

credits_consumed

integer | null

Aantal credits verbruikt door dit verzoek. Wordt ingevuld nadat de uitvoering is voltooid. Credits zijn de bron van waarheid voor facturering.

cost_usd

number | null

Geschatte kosten in USD voor dit verzoek. Wordt ingevuld nadat de uitvoering is voltooid. Berekend op basis van verbruikte credits en je tariefplan — 99% nauwkeurig, maar credits_consumed is de gezaghebbende waarde.

Batch-items Crawl Info

Algemeen

Scrapes

Batches

Crawls

Maps

Antwoorden

Zoeken

Monitoren

Bestanden

Schema's

Ophalen

Saldo & Facturering

Autorisaties

Body

Respons