Crear Scrape

Iniciar un scrapeo de página web

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

# Endpoint that starts a new scrape.
url = "https://api.olostep.com/v1/scrapes"

# JSON request body; every value below is a placeholder to replace.
payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [
        {
            "type": "wait",
            "milliseconds": 1
        }
    ],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": { "schema": {} },
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"]
    },
    "screen_size": {
        "screen_width": 123,
        "screen_height": 123
    },
    "screenshot": { "full_page": True },
    "metadata": {},
    "max_age": 0
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests waits indefinitely unless a timeout is passed, so a stalled
# connection would hang the script. 30 s is a starting point; raise it for
# slow pages or long action sequences.
TIMEOUT_SECONDS = 30

response = requests.post(url, json=payload, headers=headers, timeout=TIMEOUT_SECONDS)

# Print the raw response body (success and error payloads are both JSON).
print(response.text)

// JSON request body; every value below is a placeholder to replace.
const payload = {
  url_to_scrape: '<string>',
  wait_before_scraping: 123,
  formats: [],
  actions: [{type: 'wait', milliseconds: 1}],
  country: '<string>',
  remove_images: false,
  remove_class_names: ['<string>'],
  llm_extract: {schema: {}},
  links_on_page: {
    query_to_order_links_by: '<string>',
    include_links: ['<string>'],
    exclude_links: ['<string>']
  },
  screen_size: {screen_width: 123, screen_height: 123},
  screenshot: {full_page: true},
  metadata: {},
  max_age: 0
};

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify(payload)
};

// Send the request, log the parsed JSON body, and log any failure
// (network error or unparsable body) to stderr.
async function main() {
  try {
    const res = await fetch('https://api.olostep.com/v1/scrapes', options);
    const parsed = await res.json();
    console.log(parsed);
  } catch (err) {
    console.error(err);
  }
}

main();

<?php

$curl = curl_init();

// JSON request body; every value below is a placeholder to replace.
// json_encode() serializes an empty PHP array as the JSON array [], so
// fields that must be JSON objects ("schema", "metadata") use stdClass
// to be emitted as {}. "formats" is a real list and stays an array.
$payload = json_encode([
  'url_to_scrape' => '<string>',
  'wait_before_scraping' => 123,
  'formats' => [],
  'actions' => [
    [
      'type' => 'wait',
      'milliseconds' => 1
    ]
  ],
  'country' => '<string>',
  'remove_images' => false,
  'remove_class_names' => [
    '<string>'
  ],
  'llm_extract' => [
    'schema' => new stdClass()
  ],
  'links_on_page' => [
    'query_to_order_links_by' => '<string>',
    'include_links' => [
      '<string>'
    ],
    'exclude_links' => [
      '<string>'
    ]
  ],
  'screen_size' => [
    'screen_width' => 123,
    'screen_height' => 123
  ],
  'screenshot' => [
    'full_page' => true
  ],
  'metadata' => new stdClass(),
  'max_age' => 0
]);

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => $payload,
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

// Run the request; capture the transport error (if any) before closing.
$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a scrape request to the Olostep API and prints the raw
// response body. Every step that can fail is checked: on error the
// message is printed and the program returns instead of dereferencing
// a nil response.
func main() {

	url := "https://api.olostep.com/v1/scrapes"

	// JSON request body; every value is a placeholder to replace.
	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do returns an error, so do not touch res.Body.
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

# Endpoint that starts a new scrape.
endpoint = URI("https://api.olostep.com/v1/scrapes")

# JSON request body, sent verbatim; every value is a placeholder to replace.
payload = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

# Build the POST request with its auth and content-type headers.
post = Net::HTTP::Post.new(endpoint)
post["Authorization"] = 'Bearer <token>'
post["Content-Type"] = 'application/json'
post.body = payload

# HTTPS client for the endpoint's host and port.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

# Send the request and print the raw response body.
puts client.request(post).read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

POST

scrapes

Iniciar un scrapeo de página web

curl --request POST \
  --url https://api.olostep.com/v1/scrapes \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url_to_scrape": "<string>",
  "wait_before_scraping": 123,
  "formats": [],
  "actions": [
    {
      "type": "wait",
      "milliseconds": 1
    }
  ],
  "country": "<string>",
  "remove_images": false,
  "remove_class_names": [
    "<string>"
  ],
  "llm_extract": {
    "schema": {}
  },
  "links_on_page": {
    "query_to_order_links_by": "<string>",
    "include_links": [
      "<string>"
    ],
    "exclude_links": [
      "<string>"
    ]
  },
  "screen_size": {
    "screen_width": 123,
    "screen_height": 123
  },
  "screenshot": {
    "full_page": true
  },
  "metadata": {},
  "max_age": 0
}
'

import requests

# Endpoint that starts a new scrape.
url = "https://api.olostep.com/v1/scrapes"

# JSON request body; every value below is a placeholder to replace.
payload = {
    "url_to_scrape": "<string>",
    "wait_before_scraping": 123,
    "formats": [],
    "actions": [
        {
            "type": "wait",
            "milliseconds": 1
        }
    ],
    "country": "<string>",
    "remove_images": False,
    "remove_class_names": ["<string>"],
    "llm_extract": { "schema": {} },
    "links_on_page": {
        "query_to_order_links_by": "<string>",
        "include_links": ["<string>"],
        "exclude_links": ["<string>"]
    },
    "screen_size": {
        "screen_width": 123,
        "screen_height": 123
    },
    "screenshot": { "full_page": True },
    "metadata": {},
    "max_age": 0
}
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests waits indefinitely unless a timeout is passed, so a stalled
# connection would hang the script. 30 s is a starting point; raise it for
# slow pages or long action sequences.
TIMEOUT_SECONDS = 30

response = requests.post(url, json=payload, headers=headers, timeout=TIMEOUT_SECONDS)

# Print the raw response body (success and error payloads are both JSON).
print(response.text)

// JSON request body; every value below is a placeholder to replace.
const payload = {
  url_to_scrape: '<string>',
  wait_before_scraping: 123,
  formats: [],
  actions: [{type: 'wait', milliseconds: 1}],
  country: '<string>',
  remove_images: false,
  remove_class_names: ['<string>'],
  llm_extract: {schema: {}},
  links_on_page: {
    query_to_order_links_by: '<string>',
    include_links: ['<string>'],
    exclude_links: ['<string>']
  },
  screen_size: {screen_width: 123, screen_height: 123},
  screenshot: {full_page: true},
  metadata: {},
  max_age: 0
};

const options = {
  method: 'POST',
  headers: {Authorization: 'Bearer <token>', 'Content-Type': 'application/json'},
  body: JSON.stringify(payload)
};

// Send the request, log the parsed JSON body, and log any failure
// (network error or unparsable body) to stderr.
async function main() {
  try {
    const res = await fetch('https://api.olostep.com/v1/scrapes', options);
    const parsed = await res.json();
    console.log(parsed);
  } catch (err) {
    console.error(err);
  }
}

main();

<?php

$curl = curl_init();

// JSON request body; every value below is a placeholder to replace.
// json_encode() serializes an empty PHP array as the JSON array [], so
// fields that must be JSON objects ("schema", "metadata") use stdClass
// to be emitted as {}. "formats" is a real list and stays an array.
$payload = json_encode([
  'url_to_scrape' => '<string>',
  'wait_before_scraping' => 123,
  'formats' => [],
  'actions' => [
    [
      'type' => 'wait',
      'milliseconds' => 1
    ]
  ],
  'country' => '<string>',
  'remove_images' => false,
  'remove_class_names' => [
    '<string>'
  ],
  'llm_extract' => [
    'schema' => new stdClass()
  ],
  'links_on_page' => [
    'query_to_order_links_by' => '<string>',
    'include_links' => [
      '<string>'
    ],
    'exclude_links' => [
      '<string>'
    ]
  ],
  'screen_size' => [
    'screen_width' => 123,
    'screen_height' => 123
  ],
  'screenshot' => [
    'full_page' => true
  ],
  'metadata' => new stdClass(),
  'max_age' => 0
]);

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.olostep.com/v1/scrapes",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => $payload,
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>",
    "Content-Type: application/json"
  ],
]);

// Run the request; capture the transport error (if any) before closing.
$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a scrape request to the Olostep API and prints the raw
// response body. Every step that can fail is checked: on error the
// message is printed and the program returns instead of dereferencing
// a nil response.
func main() {

	url := "https://api.olostep.com/v1/scrapes"

	// JSON request body; every value is a placeholder to replace.
	payload := strings.NewReader("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")
	req.Header.Add("Content-Type", "application/json")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do returns an error, so do not touch res.Body.
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://api.olostep.com/v1/scrapes")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}")
  .asString();

require 'uri'
require 'net/http'

# Endpoint that starts a new scrape.
endpoint = URI("https://api.olostep.com/v1/scrapes")

# JSON request body, sent verbatim; every value is a placeholder to replace.
payload = "{\n  \"url_to_scrape\": \"<string>\",\n  \"wait_before_scraping\": 123,\n  \"formats\": [],\n  \"actions\": [\n    {\n      \"type\": \"wait\",\n      \"milliseconds\": 1\n    }\n  ],\n  \"country\": \"<string>\",\n  \"remove_images\": false,\n  \"remove_class_names\": [\n    \"<string>\"\n  ],\n  \"llm_extract\": {\n    \"schema\": {}\n  },\n  \"links_on_page\": {\n    \"query_to_order_links_by\": \"<string>\",\n    \"include_links\": [\n      \"<string>\"\n    ],\n    \"exclude_links\": [\n      \"<string>\"\n    ]\n  },\n  \"screen_size\": {\n    \"screen_width\": 123,\n    \"screen_height\": 123\n  },\n  \"screenshot\": {\n    \"full_page\": true\n  },\n  \"metadata\": {},\n  \"max_age\": 0\n}"

# Build the POST request with its auth and content-type headers.
post = Net::HTTP::Post.new(endpoint)
post["Authorization"] = 'Bearer <token>'
post["Content-Type"] = 'application/json'
post.body = payload

# HTTPS client for the endpoint's host and port.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

# Send the request and print the raw response body.
puts client.request(post).read_body

{
  "id": "<string>",
  "object": "<string>",
  "created": 123,
  "metadata": {},
  "url_to_scrape": "<string>",
  "result": {
    "html_content": "<string>",
    "markdown_content": "<string>",
    "text_content": "<string>",
    "json_content": "<string>",
    "screenshot_hosted_url": "<string>",
    "html_hosted_url": "<string>",
    "markdown_hosted_url": "<string>",
    "text_hosted_url": "<string>",
    "links_on_page": [
      "<string>"
    ],
    "page_metadata": {
      "status_code": 123,
      "title": "<string>"
    }
  },
  "credits_consumed": 123,
  "cost_usd": 123
}

{
  "id": "error_x2nmu5bqn6",
  "object": "error",
  "created": 1777923912,
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "dns_resolution_failed",
    "message": "The URL contains a typo, or the domain does not exist."
  }
}

{
  "id": "error_ogeb6rik8c",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "invalid_request_error",
    "code": "tls_error",
    "detail": "err_ssl_tlsv1_alert_internal_error",
    "message": "The website closed or rejected the TLS handshake. The server may be misconfigured or use an unsupported SSL/TLS version."
  }
}

{
  "id": "error_qat3d1amjt",
  "object": "error",
  "created": 1777923969,
  "url": "https://example.com",
  "metadata": {},
  "error": {
    "type": "request_timeout",
    "code": "scrape_poll_timeout",
    "message": "Request timed out while waiting for scrape result. The page may be slow, blocked for our fetchers, or temporarily unavailable."
  }
}

Caché opcional: Pasa max_age (en segundos) para reutilizar un scrape reciente con los mismos parámetros en lugar de volver a obtener la página. Por defecto es 0 (siempre fresco). En el área de pruebas del panel, el valor predeterminado es de 24 horas. Consulta Caché para más detalles.

Autorizaciones

Authorization

string

header

requerido

Encabezado de autenticación Bearer de la forma Bearer <token>, donde <token> es tu token de autenticación.

Cuerpo

application/json

url_to_scrape

string<uri>

requerido

La URL desde la cual comenzar el scraping.

wait_before_scraping

integer

Tiempo de espera en milisegundos antes de comenzar el scrapeo.

formats

enum<string>[]

Formatos en los que quieres el contenido.

Opciones disponibles:

html,

markdown,

text,

json,

raw_pdf,

screenshot

remove_css_selectors

enum<string>

Opción para eliminar ciertos selectores CSS del contenido. Opcionalmente, también puedes pasar un array de selectores específicos que deseas eliminar, serializado como cadena JSON. Los selectores CSS que se eliminan cuando esta opción se establece en default son ['nav','footer','script','style','noscript','svg',[role=alert],[role=banner],[role=dialog],[role=alertdialog],[role=region][aria-label*=skip i],[aria-modal=true]]

Opciones disponibles:

default,

none,

array

actions

(Esperar · object | Hacer clic · object | Rellenar Entrada · object | Desplazar · object)[]

Acciones a realizar en la página antes de obtener el contenido.

Esperar
Hacer clic
Rellenar Entrada
Desplazar

Show child attributes

country

string

País residencial desde el cual cargar la solicitud. Los valores soportados son: US (Estados Unidos), CA (Canadá), IT (Italia), IN (India), GB (Inglaterra), JP (Japón), MX (México), AU (Australia), ID (Indonesia), UA (EAU), RU (Rusia) y RANDOM. Algunas operaciones, como el scrapeo de Google Search y Google News, soportan todos los países.

transformer

enum<string>

Especifica el transformador HTML a usar, si hay alguno. La biblioteca Mercury Parser de Postlight se utiliza para eliminar anuncios y otros contenidos no deseados del contenido extraído.

Opciones disponibles:

postlight,

none

remove_images

boolean

predeterminado:false

Opción para eliminar imágenes del contenido scrapeado. Por defecto es false.

remove_class_names

string[]

Lista de nombres de clase a eliminar del contenido.

parser

object

Al definir json como formato, puedes usar este parámetro para especificar el parser a utilizar. Los parsers son útiles para extraer contenido estructurado de páginas web. Olostep tiene algunos parsers integrados para las páginas web más comunes, y también puedes crear tus propios parsers.

Show child attributes

llm_extract

object

Show child attributes

links_on_page

object

Con esta opción, puedes obtener todos los enlaces presentes en la página que scrapeas. Los enlaces siempre se devuelven como URLs absolutas.

Show child attributes

screen_size

object

Configuración para el tamaño de pantalla. Las dimensiones predefinidas están disponibles a través de screen_type: desktop (1920x1080), mobile (414x896) o default (768x1024).

Show child attributes

screenshot

object

Show child attributes

metadata

object

Metadatos definidos por el usuario. Aún no soportados.

max_age

integer

predeterminado:0

Edad máxima aceptable del contenido en caché, en segundos. Cuando ya existe un scrape coincidente y es más reciente que max_age segundos, Olostep devuelve el resultado almacenado en lugar de iniciar un nuevo scrape de navegador. Por defecto es 0 (siempre hacer un scrape nuevo). En el playground del dashboard, el valor por defecto es 86400 (24 horas). El valor máximo permitido es 604800 (7 días). Consulta la sección de Caching en la documentación de la función Scrapes para más detalles.

Rango requerido: x >= 0

Respuesta

Respuesta exitosa con los detalles de inicio del scrape.

id

string

ID del Scrape

object

string

El tipo de objeto. "scrape" para este endpoint.

created

number

Marca de tiempo Unix (epoch) de creación.

metadata

object

Metadatos definidos por el usuario.

url_to_scrape

string

La URL que fue scrapeada.

result

object

Show child attributes

credits_consumed

integer | null

Número de créditos consumidos por esta solicitud. Se completa después de que la ejecución finaliza. Los créditos son la fuente de verdad para la facturación.

cost_usd

number | null

Costo estimado en USD para esta solicitud. Se completa después de que la ejecución finaliza. Calculado a partir de los créditos consumidos y tu tarifa de plan — 99% preciso, pero credits_consumed es el valor autoritativo.

Webhooks Obtener Scrape

Común

Scrapes

Lotes

Rastreos

Mapas

Respuestas

Buscar

Monitores

Archivos

Horarios

Recuperar

Saldo y Facturación

Autorizaciones

Cuerpo

Respuesta