Crea Scrape

Inizia uno scraping di una pagina web

# Start a scrape: POST the JSON options below to the Olostep scrapes endpoint.
curl https://api.olostep.com/v1/scrapes \
  -X POST \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

# Sample request: start a scrape through the Olostep API and print the raw reply.
endpoint = "https://api.olostep.com/v1/scrapes"

# Credentials and content type sent with the request.
request_headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# JSON body describing what to scrape and how.
scrape_options = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [{"type": "wait", "milliseconds": 1}],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": {"schema": {}},
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"],
    },
    "screen_size": {"screen_width": 123, "screen_height": 123},
    "screenshot": {"full_page": True},
    "metadata": {},
    "max_age": 0,
}

reply = requests.post(endpoint, json=scrape_options, headers=request_headers)

print(reply.text)

// Scrape options that are serialized into the JSON request body.
const scrapeRequest = {
  url_to_scrape: '<string>',
  wait_before_scraping: 123,
  formats: [],
  actions: [{type: 'wait', milliseconds: 1}],
  country: '<string>',
  remove_images: false,
  remove_class_names: ['<string>'],
  llm_extract: {schema: {}},
  links_on_page: {
    query_to_order_links_by: '<string>',
    include_links: ['<string>'],
    exclude_links: ['<string>']
  },
  screen_size: {screen_width: 123, screen_height: 123},
  screenshot: {full_page: true},
  metadata: {},
  max_age: 0
};

const options = {
  method: 'POST',
  headers: {
    Authorization: 'Bearer <token>',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify(scrapeRequest)
};

// Send the request and log the parsed JSON reply; any failure is logged as an error.
(async () => {
  try {
    const res = await fetch('https://api.olostep.com/v1/scrapes', options);
    const parsed = await res.json();
    console.log(parsed);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Sends a sample POST request to the Olostep scrapes endpoint and echoes the
// raw response body, or the cURL error message when the transfer fails.

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    // A genuine JSON array: an empty PHP array encodes as [].
    'formats' => [],
    'actions' => [
        [
            'type' => 'wait',
            'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        // Must be a JSON object: an empty PHP array would encode as [],
        // so an empty stdClass is used to produce {}.
        'schema' => new stdClass()
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
            '<string>'
        ],
        'exclude_links' => [
            '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    // Same as 'schema': an empty object ({}), not an empty array ([]).
    'metadata' => new stdClass(),
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request with a sample JSON payload to the Olostep scrapes
// endpoint and prints the response body. Each fallible step is checked, so a
// failed request is reported instead of dereferencing a nil response.
func main() {

	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do returns an error, so it must not be touched here.
		fmt.Println("request failed:", err)
		return
	}
	// Registered only after the error check, once res is known to be non-nil.
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}

	fmt.Println(string(body))

}

// JSON document sent as the request body.
String requestJson = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}";

// POST the scrape options to the Olostep API and keep the reply body as a string.
HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(requestJson)
  .asString();

require 'uri'
require 'net/http'

# Sample request: POST the scrape options to the Olostep API and print the reply body.
endpoint = URI("https://api.olostep.com/v1/scrapes")

# JSON document sent as the request body.
json_body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

# Build the POST request with its authentication and content-type headers.
post = Net::HTTP::Post.new(endpoint)
post["Authorization"] = 'Bearer <token>'
post["Content-Type"] = 'application/json'
post.body = json_body

# HTTPS client for the endpoint's host and port.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

reply = client.request(post)
puts reply.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

POST

scrapes

Inizia uno scraping di una pagina web

# Start a scrape: POST the JSON options below to the Olostep scrapes endpoint.
curl https://api.olostep.com/v1/scrapes \
  -X POST \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

# Sample request: start a scrape through the Olostep API and print the raw reply.
endpoint = "https://api.olostep.com/v1/scrapes"

# Credentials and content type sent with the request.
request_headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# JSON body describing what to scrape and how.
scrape_options = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [{"type": "wait", "milliseconds": 1}],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": {"schema": {}},
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"],
    },
    "screen_size": {"screen_width": 123, "screen_height": 123},
    "screenshot": {"full_page": True},
    "metadata": {},
    "max_age": 0,
}

reply = requests.post(endpoint, json=scrape_options, headers=request_headers)

print(reply.text)

// Scrape options that are serialized into the JSON request body.
const scrapeRequest = {
  url_to_scrape: '<string>',
  wait_before_scraping: 123,
  formats: [],
  actions: [{type: 'wait', milliseconds: 1}],
  country: '<string>',
  remove_images: false,
  remove_class_names: ['<string>'],
  llm_extract: {schema: {}},
  links_on_page: {
    query_to_order_links_by: '<string>',
    include_links: ['<string>'],
    exclude_links: ['<string>']
  },
  screen_size: {screen_width: 123, screen_height: 123},
  screenshot: {full_page: true},
  metadata: {},
  max_age: 0
};

const options = {
  method: 'POST',
  headers: {
    Authorization: 'Bearer <token>',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify(scrapeRequest)
};

// Send the request and log the parsed JSON reply; any failure is logged as an error.
(async () => {
  try {
    const res = await fetch('https://api.olostep.com/v1/scrapes', options);
    const parsed = await res.json();
    console.log(parsed);
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Sends a sample POST request to the Olostep scrapes endpoint and echoes the
// raw response body, or the cURL error message when the transfer fails.

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    // A genuine JSON array: an empty PHP array encodes as [].
    'formats' => [],
    'actions' => [
        [
            'type' => 'wait',
            'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        // Must be a JSON object: an empty PHP array would encode as [],
        // so an empty stdClass is used to produce {}.
        'schema' => new stdClass()
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
            '<string>'
        ],
        'exclude_links' => [
            '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    // Same as 'schema': an empty object ({}), not an empty array ([]).
    'metadata' => new stdClass(),
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main sends a POST request with a sample JSON payload to the Olostep scrapes
// endpoint and prints the response body. Each fallible step is checked, so a
// failed request is reported instead of dereferencing a nil response.
func main() {

	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do returns an error, so it must not be touched here.
		fmt.Println("request failed:", err)
		return
	}
	// Registered only after the error check, once res is known to be non-nil.
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}

	fmt.Println(string(body))

}

// JSON document sent as the request body.
String requestJson = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}";

// POST the scrape options to the Olostep API and keep the reply body as a string.
HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body(requestJson)
  .asString();

require 'uri'
require 'net/http'

# Sample request: POST the scrape options to the Olostep API and print the reply body.
endpoint = URI("https://api.olostep.com/v1/scrapes")

# JSON document sent as the request body.
json_body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

# Build the POST request with its authentication and content-type headers.
post = Net::HTTP::Post.new(endpoint)
post["Authorization"] = 'Bearer <token>'
post["Content-Type"] = 'application/json'
post.body = json_body

# HTTPS client for the endpoint's host and port.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

reply = client.request(post)
puts reply.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

Caching opzionale: Passa max_age (in secondi) per riutilizzare uno scrape recente con gli stessi parametri invece di recuperare nuovamente la pagina. Il valore predefinito è 0 (sempre fresco). Nel playground della dashboard, il valore predefinito è 24 ore. Vedi Caching per i dettagli.

Autorizzazioni

Authorization

string

header

obbligatorio

Intestazione di autenticazione Bearer nel formato Bearer <token>, dove <token> è il tuo token di autenticazione.

Corpo

application/json

url_to_scrape

string<uri>

obbligatorio

L'URL da cui iniziare lo scraping.

wait_before_scraping

integer

Tempo di attesa in millisecondi prima di iniziare lo scraping.

formats

enum<string>[]

Formati nei quali vuoi il contenuto.

Opzioni disponibili:

html,

markdown,

text,

json,

raw_pdf,

screenshot

remove_css_selectors

enum<string>

Opzione per rimuovere determinati selettori CSS dal contenuto. Facoltativamente, puoi anche passare un array JSON stringificato di selettori specifici che vuoi rimuovere. I selettori CSS rimossi quando questa opzione è impostata su default sono ['nav','footer','script','style','noscript','svg',[role=alert],[role=banner],[role=dialog],[role=alertdialog],[role=region][aria-label*=skip i],[aria-modal=true]]

Opzioni disponibili:

default,

none,

array

actions

(Attendere · object | Cliccare · object | Compila Input · object | Scorri · object)[]

Azioni da eseguire sulla pagina prima di ottenere il contenuto.

Attendere
Cliccare
Compila Input
Scorri

Show child attributes

country

string

Paese residenziale da cui caricare la richiesta. I valori supportati sono: - US (Stati Uniti) - CA (Canada) - IT (Italia) - IN (India) - GB (Regno Unito) - JP (Giappone) - MX (Messico) - AU (Australia) - ID (Indonesia) - UA (UAE) - RU (Russia) - RANDOM. Alcune operazioni, come lo scraping di Google Search e Google News, supportano tutti i paesi.

transformer

enum<string>

Specifica il trasformatore HTML da utilizzare, se presente. La libreria Mercury Parser di Postlight viene utilizzata per rimuovere annunci e altri contenuti indesiderati dal contenuto estratto.

Opzioni disponibili:

postlight,

none

remove_images

boolean

predefinito:false

Opzione per rimuovere le immagini dal contenuto estratto. Di default è false.

remove_class_names

string[]

Elenco dei nomi di classe da rimuovere dal contenuto.

parser

object

Quando definisci json come formato, puoi usare questo parametro per specificare il parser da utilizzare. I parser sono utili per estrarre contenuti strutturati dalle pagine web. Olostep ha alcuni parser integrati per le pagine web più comuni, e puoi anche creare i tuoi parser.

Show child attributes

llm_extract

object

Show child attributes

links_on_page

object

Con questa opzione puoi ottenere tutti i link presenti nella pagina di cui stai eseguendo lo scraping. I link sono sempre restituiti come URL assoluti.

Show child attributes

screen_size

object

Configurazione per la dimensione dello schermo. Le dimensioni preimpostate sono disponibili tramite screen_type: desktop (1920x1080), mobile (414x896) o default (768x1024).

Show child attributes

screenshot

object

Show child attributes

metadata

object

Metadati definiti dall'utente. Non ancora supportati.

max_age

integer

predefinito:0

Età massima accettabile del contenuto memorizzato nella cache, in secondi. Quando esiste già uno scrape corrispondente ed è più recente di max_age secondi, Olostep restituisce il risultato memorizzato invece di avviare un nuovo scrape del browser. Il valore predefinito è 0 (sempre scrape fresco). Nel playground della dashboard, il valore predefinito è 86400 (24 ore). Il valore massimo consentito è 604800 (7 giorni). Vedi la sezione Caching nei documenti della funzione Scrapes per i dettagli.

Intervallo richiesto: x >= 0

Risposta

Risposta riuscita con i dettagli dell'inizio dello scrape.

id

string

Scrape ID

object

string

Il tipo di oggetto. "scrape" per questo endpoint.

created

number

Timestamp di creazione (epoch Unix).

metadata

object

Metadati definiti dall'utente.

url_to_scrape

string

L'URL di cui è stato eseguito lo scraping.

result

object

Show child attributes

credits_consumed

integer | null

Numero di crediti consumati da questa richiesta. Popolato dopo il completamento dell'esecuzione. I crediti sono la fonte di verità per la fatturazione.

cost_usd

number | null

Costo stimato in USD per questa richiesta. Popolato dopo il completamento dell'esecuzione. Calcolato dai crediti consumati e dal tuo piano tariffario: accurato al 99%, ma credits_consumed è il valore autorevole.

Webhooks Ottieni Scrape

Comune

Scrape

Lotti

Scansioni

Mappe

Risposte

Cerca

Monitor

File

Programmi

Recupera

Saldo e Fatturazione

Autorizzazioni

Corpo

Risposta