{
  "openapi": "3.0.1",
  "info": {
    "title": "Contextractor — clean web content extraction for LLMs",
    "description": "Crawl any website and extract clean main-content text as Markdown, plain text, JSON, or HTML — ready for LLMs, RAG pipelines, and vector databases. Built on the rs-trafilatura engine and an adaptive Crawlee + Playwright crawler.",
    "version": "0.4",
    "x-build-id": "z7NiJSAprpo3j0eAy"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/glueo~contextractor/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-glueo-contextractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns the Actor's dataset items in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/glueo~contextractor/runs": {
      "post": {
        "operationId": "runs-sync-glueo-contextractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/glueo~contextractor/run-sync": {
      "post": {
        "operationId": "run-sync-glueo-contextractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT record from the key-value store in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls"
        ],
        "properties": {
          "startUrls": {
            "title": "Start URLs",
            "minItems": 1,
            "type": "array",
            "description": "URLs to extract content from.",
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "crawlerType": {
            "title": "Crawler type",
            "enum": [
              "playwright-adaptive",
              "playwright-firefox",
              "playwright-chromium",
              "cheerio"
            ],
            "type": "string",
            "description": "Browser engine or HTTP client for crawling. playwright-adaptive automatically switches between browser and HTTP client per page. cheerio uses raw HTTP only (fastest, no JS).",
            "default": "playwright-adaptive"
          },
          "renderingTypeDetectionRatio": {
            "title": "Rendering type detection",
            "minimum": 0,
            "maximum": 1,
            "type": "number",
            "description": "(Adaptive only) Ratio (0–1) of pages on which the crawler runs a rendering-type detection probe. Higher values are more accurate but slower.",
            "default": 0.1
          },
          "globs": {
            "title": "Include URLs (globs)",
            "type": "array",
            "description": "Glob patterns matching URLs of pages that will be included in crawling. Setting this option allows you to customize the crawling scope. For example `https://{store,docs}.example.com/**` lets the crawler access all URLs starting with `https://store.example.com/` or `https://docs.example.com/`.",
            "default": [],
            "items": {
              "type": "object",
              "required": [
                "glob"
              ],
              "properties": {
                "glob": {
                  "type": "string",
                  "title": "Glob of a web page"
                }
              }
            }
          },
          "exclude": {
            "title": "Exclude URLs (globs)",
            "type": "array",
            "description": "Glob patterns matching URLs of pages that will be excluded from crawling. Note that this affects only links found on pages, but not Start URLs, which are always crawled.",
            "default": [],
            "items": {
              "type": "object",
              "required": [
                "glob"
              ],
              "properties": {
                "glob": {
                  "type": "string",
                  "title": "Glob of a web page"
                }
              }
            }
          },
          "selector": {
            "title": "Link Selector",
            "type": "string",
            "description": "CSS selector for links to enqueue. Leave empty to disable link enqueueing.",
            "default": ""
          },
          "keepUrlFragment": {
            "title": "Keep URL fragment",
            "type": "boolean",
            "description": "URL fragments (the part of a URL after the #) are not considered when the scraper determines whether a URL has already been visited. Turn this on to treat URLs with different fragments as different pages.",
            "default": false
          },
          "useSitemaps": {
            "title": "Use sitemaps",
            "type": "boolean",
            "description": "If enabled, the crawler looks for sitemap.xml at the root of each start URL domain and enqueues matching URLs from it in addition to link-following.",
            "default": false
          },
          "deduplication": {
            "title": "Deduplication",
            "enum": [
              "minimal",
              "standard",
              "aggressive"
            ],
            "type": "string",
            "description": "Deduplication level applied on top of Crawlee's built-in URL deduplication. standard (default): skip pages whose <link rel=\"canonical\"> was already extracted, across all handler types. aggressive: also skip pages whose extracted text content matches a previously extracted page. minimal: disable additional deduplication — only Crawlee's built-in URL dedup remains active.",
            "default": "standard"
          },
          "respectRobotsTxtFile": {
            "title": "Respect robots.txt",
            "type": "boolean",
            "description": "If enabled, the crawler will consult the robots.txt file for each domain before crawling pages.",
            "default": false
          },
          "initialCookies": {
            "title": "Initial cookies",
            "type": "array",
            "description": "Cookies that will be pre-set on all pages the scraper opens. This is useful for pages that require login. The value is expected to be a JSON array of objects with `name` and `value` properties. For example: \n\n```json\n[\n  {\n    \"name\": \"cookieName\",\n    \"value\": \"cookieValue\",\n    \"path\": \"/\",\n    \"domain\": \".example.com\"\n  }\n]\n```\n\nYou can use the [EditThisCookie](https://docs.apify.com/academy/tools/edit-this-cookie) browser extension to copy browser cookies in this format and paste them here.\n\nNote that the value is secret and encrypted to protect your login cookies."
          },
          "customHttpHeaders": {
            "title": "Custom HTTP headers",
            "type": "object",
            "description": "HTTP headers that will be added to all requests made by the crawler. This is useful for setting custom authentication headers or other headers required by the target website. The value is expected to be a JSON object with header names as keys and header values as values. For example: `{ \"Authorization\": \"Bearer token123\", \"X-Custom-Header\": \"value\" }`."
          },
          "maxRequestsPerCrawl": {
            "title": "Max requests per crawl",
            "minimum": 0,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum number of requests the crawler will handle. Counts handled page outcomes (successes and final failures), including start URLs and pagination pages. The crawler automatically finishes after reaching this number. 0 means unlimited.",
            "default": 0
          },
          "maxResultsPerCrawl": {
            "title": "Max results",
            "minimum": 0,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum number of results that will be saved to dataset. The scraper will terminate after reaching this number. 0 means unlimited.",
            "default": 0
          },
          "maxCrawlDepth": {
            "title": "Max crawling depth",
            "minimum": 0,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum link depth from Start URLs. Pages discovered further from start URLs than this limit will not be crawled. 0 means unlimited.",
            "default": 0
          },
          "initialConcurrency": {
            "title": "Initial concurrency",
            "minimum": 0,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Initial number of browser pages or HTTP clients running in parallel. Crawlee auto-scales up to maxConcurrency. 0 lets Crawlee pick the default.",
            "default": 0
          },
          "maxConcurrency": {
            "title": "Max concurrency",
            "minimum": 1,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum number of browser pages or HTTP clients running in parallel. Kept low by default because the browser crawler cannot abort in-flight pages, so concurrency is the only hard cap on peak memory — large pages can exhaust memory at higher values. Raise it for lightweight pages or the HTTP (cheerio) crawler. This setting also avoids overloading target websites and getting blocked.",
            "default": 3
          },
          "maxRequestRetries": {
            "title": "Max request retries",
            "minimum": 0,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum number of retries for failed requests on network, proxy, or server errors.",
            "default": 3
          },
          "mode": {
            "title": "Extraction mode",
            "enum": [
              "precision",
              "balanced",
              "recall"
            ],
            "type": "string",
            "description": "Extraction mode. precision minimizes noise (may miss some content); recall maximizes content (may include noise); balanced is the default.",
            "default": "balanced"
          },
          "includeComments": {
            "title": "Include comments",
            "type": "boolean",
            "description": "Include HTML comments in the extracted text.",
            "default": true
          },
          "includeTables": {
            "title": "Include tables",
            "type": "boolean",
            "description": "Include table content in the extracted text.",
            "default": true
          },
          "includeImages": {
            "title": "Include images",
            "type": "boolean",
            "description": "Include image alt text and captions in the extracted text.",
            "default": false
          },
          "includeLinks": {
            "title": "Include links",
            "type": "boolean",
            "description": "Include hyperlinks in the extracted text.",
            "default": true
          },
          "languageCode": {
            "title": "Language",
            "type": "string",
            "description": "Filter extracted content by language code (e.g. \"en\"). Leave empty to accept any language.",
            "default": ""
          },
          "save": {
            "title": "Save",
            "minItems": 1,
            "type": "array",
            "description": "What to save and where, as `format-destination` tokens. Format is one of `txt`, `markdown`, `json`, `html`, `original` (raw page HTML before extraction); destination is `dataset` (inline in the dataset record) or `kvs` (a blob in the key-value store). List a format twice to save it to both, e.g. `markdown-dataset markdown-kvs`. Saving `original` or `html` (large content) to the dataset is not recommended — it risks out-of-memory on large pages; prefer `kvs`.",
            "items": {
              "type": "string",
              "enum": [
                "txt-dataset",
                "txt-kvs",
                "markdown-dataset",
                "markdown-kvs",
                "json-dataset",
                "json-kvs",
                "html-dataset",
                "html-kvs",
                "original-dataset",
                "original-kvs"
              ],
              "enumTitles": [
                "Plain text → Dataset",
                "Plain text → Key-value store",
                "Markdown → Dataset",
                "Markdown → Key-value store",
                "JSON → Dataset",
                "JSON → Key-value store",
                "HTML → Dataset (large; OOM risk)",
                "HTML → Key-value store",
                "Original HTML → Dataset (large; OOM risk)",
                "Original HTML → Key-value store"
              ]
            },
            "default": [
              "markdown-kvs"
            ]
          },
          "datasetName": {
            "title": "Dataset name",
            "type": "string",
            "description": "Name or ID of the dataset for storing results. Leave empty to use the default run dataset."
          },
          "keyValueStoreName": {
            "title": "Key-value store name",
            "type": "string",
            "description": "Name or ID of the key-value store for content files. Leave empty to use the default store."
          },
          "requestQueueName": {
            "title": "Request queue name",
            "type": "string",
            "description": "Name of the request queue for pending URLs. Leave empty to use the default queue."
          },
          "storeSkippedUrls": {
            "title": "Store skipped URLs",
            "type": "boolean",
            "description": "If enabled, pushes a dataset record for each URL skipped during crawling (excluded by globs, robots.txt, depth limit, or concurrency cap). Can produce high record volume — enable for auditing only.",
            "default": false
          },
          "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Enables loading websites from IP addresses in specific geographies and helps circumvent blocking."
          },
          "proxyRotation": {
            "title": "Proxy rotation",
            "enum": [
              "recommended",
              "per-request",
              "until-failure"
            ],
            "type": "string",
            "description": "Proxy rotation strategy. recommended automatically picks the best proxies. per-request uses a new proxy for each request. until-failure uses one proxy until it fails.",
            "default": "recommended"
          },
          "sessionPoolName": {
            "title": "Session pool name",
            "pattern": "^[0-9A-Za-z_-]+$",
            "minLength": 3,
            "maxLength": 200,
            "type": "string",
            "description": "Name for a persistent, shared session pool. Sessions (IP + cookies) are saved under this key and reused across Actor runs. Useful when proxies are frequently blocked — previously working sessions are preferred over random ones."
          },
          "maxSessionRotations": {
            "title": "Max session rotations",
            "minimum": 0,
            "maximum": 20,
            "type": "integer",
            "description": "Maximum number of session (IP + browser fingerprint) rotations per request on block detection. Independent of maxRequestRetries. Set to 0 to disable session rotation.",
            "default": 10
          },
          "navigationTimeoutSecs": {
            "title": "Navigation timeout",
            "minimum": 1,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum time to wait for page navigation, in seconds.",
            "default": 60
          },
          "blockMedia": {
            "title": "Block media",
            "type": "boolean",
            "description": "Block loading of images, stylesheets, fonts (.woff), PDFs, and ZIPs. On by default: it cuts browser memory and bandwidth substantially, which helps avoid out-of-memory errors on large pages. Disable it (set to false) if a page needs media to render its content (e.g. image- or CSS-driven lazy loading). Works with Chromium only: it has no effect with the raw HTTP crawler type or with non-Chromium browsers.",
            "default": true
          },
          "waitForSelector": {
            "title": "Wait for selector",
            "type": "string",
            "description": "Wait for this CSS selector to appear before extracting content. The request fails and is retried if the selector does not appear within the timeout. Leave empty to disable.",
            "default": ""
          },
          "softWaitForSelector": {
            "title": "Soft wait for selector",
            "type": "string",
            "description": "Wait for this CSS selector to appear before extracting content. Unlike waitForSelector, the request continues even if the selector does not appear within the timeout. Leave empty to disable.",
            "default": ""
          },
          "waitForDynamicContentSecs": {
            "title": "Wait for dynamic content",
            "minimum": 0,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum seconds to wait for dynamic page content to load after navigation. The crawler continues when the network goes idle or this timeout elapses, whichever comes first. 0 disables this wait. Also used as the timeout for waitForSelector and softWaitForSelector.",
            "default": 0
          },
          "waitUntil": {
            "title": "Navigation wait until",
            "enum": [
              "load",
              "domcontentloaded",
              "networkidle",
              "commit"
            ],
            "type": "string",
            "description": "When to consider navigation finished. networkidle waits for 500 ms of network silence (best for JS-heavy SPAs, slower); load waits for the load event (default, good for most articles); domcontentloaded is fastest but may fire before client-side rendering completes; commit fires when the network response is received and the document has started loading.",
            "default": "load"
          },
          "headless": {
            "title": "Headless mode",
            "type": "boolean",
            "description": "Run the browser in headless mode.",
            "default": true
          },
          "ignoreCorsAndCsp": {
            "title": "Ignore CORS and CSP",
            "type": "boolean",
            "description": "Ignore Content Security Policy and Cross-Origin Resource Sharing restrictions. Enables free XHR/Fetch requests from pages.",
            "default": false
          },
          "closeCookieModals": {
            "title": "Close cookie modals",
            "type": "boolean",
            "description": "Automatically handle cookie consent: Ghostery-based ad/tracker blocking, accepting consent walls that replace the page (e.g. consent-or-pay) via the site’s own consent manager and re-fetching the article, and removing residual consent/CMP containers before extraction.",
            "default": true
          },
          "maxScrollHeight": {
            "title": "Max scroll height",
            "minimum": 0,
            "maximum": 9007199254740991,
            "type": "integer",
            "description": "Maximum pixels (px) to scroll down the page until all content is loaded. Setting to 0 disables scrolling.",
            "default": 5000
          },
          "userAgent": {
            "title": "User-Agent",
            "type": "string",
            "description": "Custom User-Agent string for the browser. Leave empty to use the default browser User-Agent.",
            "default": ""
          },
          "ignoreHttpsErrors": {
            "title": "Ignore HTTPS errors",
            "type": "boolean",
            "description": "Ignore HTTPS certificate errors. Use at your own risk.",
            "default": false
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}