Créer un Scrape

Initier un scraping de page web

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

url = "https://api.olostep.com/v1/scrapes"

payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [
        {
            "type": "wait",
            "milliseconds": 1
        }
    ],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": { "schema": {} },
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"]
    },
    "screen_size": {
        "screen_width": 123,
        "screen_height": 123
    },
    "screenshot": { "full_page": True },
    "metadata": {},
    "max_age": 0
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    url_to_scrape: '<string>',
    wait_before_scraping: 123,
    formats: [],
    actions: [{type: 'wait', milliseconds: 1}],
    country: '<string>',
    remove_images: false,
    remove_class_names: ['<string>'],
    llm_extract: {schema: {}},
    links_on_page: {
      query_to_order_links_by: '<string>',
      include_links: ['<string>'],
      exclude_links: ['<string>']
    },
    screen_size: {screen_width: 123, screen_height: 123},
    screenshot: {full_page: true},
    metadata: {},
    max_age: 0
  })
};

fetch('https://api.olostep.com/v1/scrapes', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    'formats' => [
        
    ],
    'actions' => [
        [
                'type' => 'wait',
                'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        'schema' => [
                
        ]
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
                '<string>'
        ],
        'exclude_links' => [
                '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    'metadata' => [
        
    ],
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

func main() {

	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, _ := http.NewRequest("POST", url, payload)

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, _ := http.DefaultClient.Do(req)

	defer res.Body.Close()
	body, _ := io.ReadAll(res.Body)

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.olostep.com/v1/scrapes")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

response = http.request(request)
puts response.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

POST

scrapes

Initier un scraping de page web

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

url = "https://api.olostep.com/v1/scrapes"

payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [
        {
            "type": "wait",
            "milliseconds": 1
        }
    ],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": { "schema": {} },
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"]
    },
    "screen_size": {
        "screen_width": 123,
        "screen_height": 123
    },
    "screenshot": { "full_page": True },
    "metadata": {},
    "max_age": 0
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify({
    url_to_scrape: '<string>',
    wait_before_scraping: 123,
    formats: [],
    actions: [{type: 'wait', milliseconds: 1}],
    country: '<string>',
    remove_images: false,
    remove_class_names: ['<string>'],
    llm_extract: {schema: {}},
    links_on_page: {
      query_to_order_links_by: '<string>',
      include_links: ['<string>'],
      exclude_links: ['<string>']
    },
    screen_size: {screen_width: 123, screen_height: 123},
    screenshot: {full_page: true},
    metadata: {},
    max_age: 0
  })
};

fetch('https://api.olostep.com/v1/scrapes', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'url_to_scrape' => '<string>',
    'wait_before_scraping' => 123,
    'formats' => [
        
    ],
    'actions' => [
        [
                'type' => 'wait',
                'milliseconds' => 1
        ]
    ],
    'country' => '<string>',
    'remove_images' => false,
    'remove_class_names' => [
        '<string>'
    ],
    'llm_extract' => [
        'schema' => [
                
        ]
    ],
    'links_on_page' => [
        'query_to_order_links_by' => '<string>',
        'include_links' => [
                '<string>'
        ],
        'exclude_links' => [
                '<string>'
        ]
    ],
    'screen_size' => [
        'screen_width' => 123,
        'screen_height' => 123
    ],
    'screenshot' => [
        'full_page' => true
    ],
    'metadata' => [
        
    ],
    'max_age' => 0
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

func main() {

	url := "https://api.olostep.com/v1/scrapes"

	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, _ := http.NewRequest("POST", url, payload)

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, _ := http.DefaultClient.Do(req)

	defer res.Body.Close()
	body, _ := io.ReadAll(res.Body)

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://api.olostep.com/v1/scrapes")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = 'Bearer <token>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

response = http.request(request)
puts response.read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

Mise en cache optionnelle : passe max_age (en secondes) pour réutiliser un scrape récent ayant les mêmes paramètres au lieu de récupérer à nouveau la page. La valeur par défaut est 0 (toujours effectuer un nouveau scrape). Dans le playground du tableau de bord, la valeur par défaut est de 24 heures. Voir Mise en cache pour plus de détails.

Autorisations

Authorization

string

header

requis

En-tête d'authentification Bearer sous la forme Bearer <token>, où <token> est ton jeton d'authentification.

Corps

application/json

url_to_scrape

string<uri>

requis

L'URL à partir de laquelle commencer le scraping.

wait_before_scraping

integer

Temps d'attente en millisecondes avant de commencer le scraping.

formats

enum<string>[]

Formats dans lesquels tu veux le contenu.

Options disponibles:

html,

markdown,

text,

json,

raw_pdf,

screenshot

remove_css_selectors

enum<string>

Option pour supprimer certains sélecteurs CSS du contenu. Tu peux également passer, sous forme de chaîne, un tableau JSON des sélecteurs spécifiques que tu veux supprimer. Lorsque cette option vaut default, les sélecteurs CSS supprimés sont ['nav','footer','script','style','noscript','svg',[role=alert],[role=banner],[role=dialog],[role=alertdialog],[role=region][aria-label*=skip i],[aria-modal=true]].

Options disponibles:

default,

none,

array

actions

(Attendre · object | Cliquer · object | Remplir l'entrée · object | Faire défiler · object)[]

Actions à effectuer sur la page avant d'obtenir le contenu.

Attendre
Cliquer
Remplir l'entrée
Faire défiler

Show child attributes

country

string

Pays résidentiel à partir duquel charger la requête. Valeurs prises en charge : - US (États-Unis) - CA (Canada) - IT (Italie) - IN (Inde) - GB (Royaume-Uni) - JP (Japon) - MX (Mexique) - AU (Australie) - ID (Indonésie) - UA (Émirats Arabes Unis) - RU (Russie) - RANDOM. Certaines opérations, comme le scraping de Google Search et Google News, prennent en charge tous les pays.

transformer

enum<string>

Spécifie le transformateur HTML à utiliser, si nécessaire. La bibliothèque Mercury Parser de Postlight est utilisée pour supprimer les publicités et autres contenus indésirables du contenu scrapé.

Options disponibles:

postlight,

none

remove_images

boolean

défaut:false

Option pour supprimer les images du contenu scrapé. La valeur par défaut est false.

remove_class_names

string[]

Liste des noms de classes à supprimer du contenu.

parser

object

Lorsque tu définis json comme format, tu peux utiliser ce paramètre pour spécifier le parseur à utiliser. Les parseurs sont utiles pour extraire du contenu structuré des pages web. Olostep propose quelques parseurs intégrés pour les pages web les plus courantes, et tu peux aussi créer tes propres parseurs.

Show child attributes

llm_extract

object

Show child attributes

links_on_page

object

Avec cette option, tu peux obtenir tous les liens présents sur la page que tu scrapes. Les liens sont toujours retournés sous forme d'URLs absolues.

Show child attributes

screen_size

object

Configuration pour la taille de l'écran. Des dimensions prédéfinies sont disponibles via screen_type : desktop (1920x1080), mobile (414x896), ou default (768x1024).

Show child attributes

screenshot

object

Show child attributes

metadata

object

Métadonnées définies par l'utilisateur. Pas encore supporté.

max_age

integer

défaut:0

Âge maximum acceptable du contenu mis en cache, en secondes. Lorsqu'un scrape correspondant existe déjà et date de moins de max_age secondes, Olostep renvoie le résultat stocké au lieu de lancer un nouveau scrape via le navigateur. La valeur par défaut est 0 (toujours effectuer un nouveau scrape). Dans le playground du tableau de bord, la valeur par défaut est 86400 (24 heures). La valeur maximale autorisée est 604800 (7 jours). Voir la section Caching dans la documentation des fonctionnalités Scrapes pour plus de détails.

Plage requise: x >= 0

Réponse

Réponse réussie avec les détails de l'initiation du scrape.

id

string

ID du scrape

object

string

Le type d'objet. "scrape" pour ce point de terminaison.

created

number

Horodatage de création (epoch Unix).

metadata

object

Métadonnées définies par l'utilisateur.

url_to_scrape

string

L'URL qui a été scrapée.

result

object

Show child attributes

credits_consumed

integer | null

Nombre de crédits consommés par cette requête. Renseigné une fois l'exécution terminée. Les crédits sont la source de vérité pour la facturation.

cost_usd

number | null

Coût estimé en USD pour cette requête. Renseigné une fois l'exécution terminée. Calculé à partir des crédits consommés et du tarif de ton forfait — précis à 99 %, mais credits_consumed est la valeur qui fait foi.

Webhooks Obtenir Scrape

Commun

Scrapes

Lots

Explorations

Cartes

Réponses

Rechercher

Moniteurs

Fichiers

Horaires

Récupérer

Solde et Facturation

Autorisations

Corps

Réponse