Maak Scrape

Start een webpagina scrape

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

# Endpoint that accepts new scrape requests.
url = "https://api.olostep.com/v1/scrapes"

# Bearer token and JSON content type sent with the request.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# Page interactions to run before the content is captured.
actions = [{"type": "wait", "milliseconds": 1}]

# Settings for collecting the links found on the page.
links_on_page = {
    "query_to_order_links_by": "<string>",
    "include_links": ["<string>"],
    "exclude_links": ["<string>"]
}

# Viewport dimensions used for the scrape.
screen_size = {"screen_width": 123, "screen_height": 123}

# Full request body; key order matches the API's request example.
payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": actions,
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": {"schema": {}},
    "links_on_page": links_on_page,
    "screen_size": screen_size,
    "screenshot": {"full_page": True},
    "metadata": {},
    "max_age": 0
}

# Send the payload as JSON and print the raw response body.
response = requests.post(url, headers=headers, json=payload)

print(response.text)

// Request body for the scrape; field names follow the API's request example.
const scrapeRequest = {
  url_to_scrape: '<string>',
  wait_before_scraping: 123,
  formats: [],
  actions: [{type: 'wait', milliseconds: 1}],
  country: '<string>',
  remove_images: false,
  remove_class_names: ['<string>'],
  llm_extract: {schema: {}},
  links_on_page: {
    query_to_order_links_by: '<string>',
    include_links: ['<string>'],
    exclude_links: ['<string>']
  },
  screen_size: {screen_width: 123, screen_height: 123},
  screenshot: {full_page: true},
  metadata: {},
  max_age: 0
};

// POST options: bearer auth header plus the JSON-encoded body.
const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify(scrapeRequest)
};

// Send the request and log the parsed JSON reply; any failure is logged as an error.
(async () => {
  try {
    const res = await fetch('https://api.olostep.com/v1/scrapes', options);
    console.log(await res.json());
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Sends a POST request to the Olostep scrapes endpoint with a sample JSON
// payload, then echoes the raw response body or the cURL error message.

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    'formats' => [],
    'actions' => [
        [
            'type' => 'wait',
            'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        // json_encode() turns an empty PHP array into the JSON array [];
        // an empty stdClass is needed to send the JSON object {}.
        'schema' => new stdClass()
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
            '<string>'
        ],
        'exclude_links' => [
            '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    // Same reason as 'schema': must serialize as {} rather than [].
    'metadata' => new stdClass(),
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request to the Olostep scrapes endpoint with a sample
// JSON payload and prints the raw response body to stdout.
//
// An error from building the request, performing it, or reading the body is
// printed and main returns early instead of continuing with a nil response.
func main() {

	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so res.Body must not be touched.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

# Endpoint that accepts new scrape requests.
endpoint = URI("https://api.olostep.com/v1/scrapes")

# HTTPS client bound to the endpoint's host and port.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

# Headers carrying the bearer token and the JSON content type.
headers = {
  "Authorization" => 'Bearer <token>',
  "Content-Type" => 'application/json'
}

# POST request with the headers applied and the sample JSON document as body.
post = Net::HTTP::Post.new(endpoint)
headers.each { |name, value| post[name] = value }
post.body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

# Perform the request and print the response body.
reply = client.request(post)
puts reply.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

POST

scrapes

Start een webpagina scrape

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

# Endpoint that accepts new scrape requests.
url = "https://api.olostep.com/v1/scrapes"

# Bearer token and JSON content type sent with the request.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# Page interactions to run before the content is captured.
actions = [{"type": "wait", "milliseconds": 1}]

# Settings for collecting the links found on the page.
links_on_page = {
    "query_to_order_links_by": "<string>",
    "include_links": ["<string>"],
    "exclude_links": ["<string>"]
}

# Viewport dimensions used for the scrape.
screen_size = {"screen_width": 123, "screen_height": 123}

# Full request body; key order matches the API's request example.
payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": actions,
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": {"schema": {}},
    "links_on_page": links_on_page,
    "screen_size": screen_size,
    "screenshot": {"full_page": True},
    "metadata": {},
    "max_age": 0
}

# Send the payload as JSON and print the raw response body.
response = requests.post(url, headers=headers, json=payload)

print(response.text)

// Request body for the scrape; field names follow the API's request example.
const scrapeRequest = {
  url_to_scrape: '<string>',
  wait_before_scraping: 123,
  formats: [],
  actions: [{type: 'wait', milliseconds: 1}],
  country: '<string>',
  remove_images: false,
  remove_class_names: ['<string>'],
  llm_extract: {schema: {}},
  links_on_page: {
    query_to_order_links_by: '<string>',
    include_links: ['<string>'],
    exclude_links: ['<string>']
  },
  screen_size: {screen_width: 123, screen_height: 123},
  screenshot: {full_page: true},
  metadata: {},
  max_age: 0
};

// POST options: bearer auth header plus the JSON-encoded body.
const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify(scrapeRequest)
};

// Send the request and log the parsed JSON reply; any failure is logged as an error.
(async () => {
  try {
    const res = await fetch('https://api.olostep.com/v1/scrapes', options);
    console.log(await res.json());
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Sends a POST request to the Olostep scrapes endpoint with a sample JSON
// payload, then echoes the raw response body or the cURL error message.

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    'formats' => [],
    'actions' => [
        [
            'type' => 'wait',
            'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        // json_encode() turns an empty PHP array into the JSON array [];
        // an empty stdClass is needed to send the JSON object {}.
        'schema' => new stdClass()
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
            '<string>'
        ],
        'exclude_links' => [
            '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    // Same reason as 'schema': must serialize as {} rather than [].
    'metadata' => new stdClass(),
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request to the Olostep scrapes endpoint with a sample
// JSON payload and prints the raw response body to stdout.
//
// An error from building the request, performing it, or reading the body is
// printed and main returns early instead of continuing with a nil response.
func main() {

	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so res.Body must not be touched.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response body:", err)
		return
	}

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

# Endpoint that accepts new scrape requests.
endpoint = URI("https://api.olostep.com/v1/scrapes")

# HTTPS client bound to the endpoint's host and port.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

# Headers carrying the bearer token and the JSON content type.
headers = {
  "Authorization" => 'Bearer <token>',
  "Content-Type" => 'application/json'
}

# POST request with the headers applied and the sample JSON document as body.
post = Net::HTTP::Post.new(endpoint)
headers.each { |name, value| post[name] = value }
post.body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

# Perform the request and print the response body.
reply = client.request(post)
puts reply.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

Optionele caching: Geef max_age (in seconden) door om een recente scrape met dezelfde parameters opnieuw te gebruiken in plaats van de pagina opnieuw op te halen. Standaard is 0 (altijd vers). In de dashboard playground is de standaard 24 uur. Zie Caching voor details.

Autorisaties

Authorization

string

header

vereist

Bearer-authenticatieheader in de vorm Bearer <token>, waarbij <token> jouw auth-token is.

Body

application/json

url_to_scrape

string<uri>

vereist

De URL om te beginnen met scrapen.

wait_before_scraping

integer

Tijd om te wachten in milliseconden voordat de scraping begint.

formats

enum<string>[]

Formaten waarin je de inhoud wilt.

Beschikbare opties:

html,

markdown,

text,

json,

raw_pdf,

screenshot

remove_css_selectors

enum<string>

Optie om bepaalde CSS-selectors uit de inhoud te verwijderen. Optioneel kun je ook een JSON-stringified array van specifieke selectors doorgeven die je wilt verwijderen. Wanneer deze optie is ingesteld op default, worden de volgende CSS-selectors verwijderd: ['nav','footer','script','style','noscript','svg',[role=alert],[role=banner],[role=dialog],[role=alertdialog],[role=region][aria-label*=skip i],[aria-modal=true]]

Beschikbare opties:

default,

none,

array

actions

(Wachten · object | Klikken · object | Vul Invoer In · object | Scroll · object)[]

Acties om uit te voeren op de pagina voordat je de inhoud krijgt.

Wachten
Klikken
Vul Invoer In
Scroll

Show child attributes

country

string

Land van waaruit het verzoek moet worden geladen. Ondersteunde waarden zijn: - US (Verenigde Staten) - CA (Canada) - IT (Italië) - IN (India) - GB (Verenigd Koninkrijk) - JP (Japan) - MX (Mexico) - AU (Australië) - ID (Indonesië) - UA (VAE) - RU (Rusland) - RANDOM. Sommige operaties, zoals het scrapen van Google Search en Google News, ondersteunen alle landen.

transformer

enum<string>

Specificeer de HTML-transformator die je wilt gebruiken, indien van toepassing. De Mercury Parser-bibliotheek van Postlight wordt gebruikt om advertenties en andere ongewenste inhoud uit de gescrapete inhoud te verwijderen.

Beschikbare opties:

postlight,

none

remove_images

boolean

standaard:false

Optie om afbeeldingen uit de gescrapete inhoud te verwijderen. Standaard is false.

remove_class_names

string[]

Lijst van class-namen om uit de inhoud te verwijderen.

parser

object

Wanneer je json als formaat definieert, kun je deze parameter gebruiken om de parser te specificeren die je wilt gebruiken. Parsers zijn nuttig om gestructureerde inhoud uit webpagina's te halen. Olostep heeft een paar ingebouwde parsers voor de meest voorkomende webpagina's, en je kunt ook je eigen parsers maken.

Show child attributes

llm_extract

object

Show child attributes

links_on_page

object

Met deze optie kun je alle links krijgen die aanwezig zijn op de pagina die je scrapt. Links worden altijd geretourneerd als absolute URLs.

Show child attributes

screen_size

object

Configuratie voor schermgrootte. Vooraf ingestelde afmetingen zijn beschikbaar via screen_type: desktop (1920x1080), mobile (414x896), of default (768x1024).

Show child attributes

screenshot

object

Show child attributes

metadata

object

Door de gebruiker gedefinieerde metadata. Nog niet ondersteund.

max_age

integer

standaard:0

Maximale acceptabele leeftijd van gecachte inhoud, in seconden. Wanneer een overeenkomende scrape al bestaat en nieuwer is dan max_age seconden, retourneert Olostep het opgeslagen resultaat in plaats van een nieuwe browser scrape te starten. Standaard is 0 (altijd vers scrapen). In de dashboard playground is de standaard 86400 (24 uur). De maximaal toegestane waarde is 604800 (7 dagen). Zie de sectie Caching in de Scrapes feature docs voor details.

Vereist bereik: x >= 0

Respons

Succesvolle respons met de details van de scrape initiatie.

id

string

Scrape ID

object

string

Het soort object. "scrape" voor dit endpoint.

created

number

Tijdstip van aanmaken, als Unix-epoch.

metadata

object

Door de gebruiker gedefinieerde metadata.

url_to_scrape

string

De URL die is gescraped.

result

object

Show child attributes

credits_consumed

integer | null

Aantal credits verbruikt door dit verzoek. Wordt ingevuld nadat de uitvoering is voltooid. Credits zijn de bron van waarheid voor facturering.

cost_usd

number | null

Geschatte kosten in USD voor dit verzoek. Wordt ingevuld nadat de uitvoering is voltooid. Berekend op basis van verbruikte credits en je tariefplan — 99% nauwkeurig, maar credits_consumed is de gezaghebbende waarde.

Webhooks Scrape Ophalen

Algemeen

Scrapes

Batches

Crawls

Maps

Antwoorden

Zoeken

Monitoren

Bestanden

Roosters

Ophalen

Saldo & Facturering

Autorisaties

Body

Respons