{
  "openapi": "3.0.1",
  "info": {
    "title": "AI-Ready Web Content Crawler (LLM/RAG Optimized)",
    "description": "Deep-crawl websites and extract LLM-ready Markdown with OG tags, JSON-LD, author, dates, token estimates, native RAG chunking, language filtering, content-hash dedup, and per-page error reporting. Enforced timeouts. Zero silent failures.",
    "version": "1.0",
    "x-build-id": "Z27Y7rakQsHoV7goa"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/brilliant_gum~web-content-crawler/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-brilliant_gum-web-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/brilliant_gum~web-content-crawler/runs": {
      "post": {
        "operationId": "runs-sync-brilliant_gum-web-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/brilliant_gum~web-content-crawler/run-sync": {
      "post": {
        "operationId": "run-sync-brilliant_gum-web-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls"
        ],
        "properties": {
          "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "One or more seed URLs. The crawler stays within each URL's domain by default. Supports plain strings or {url, userData} objects.",
            "items": {
              "type": "object"
            }
          },
          "crawlerType": {
            "title": "Crawler Type",
            "enum": [
              "playwright:chrome",
              "playwright:firefox",
              "playwright:adaptive",
              "cheerio",
              "jsdom"
            ],
            "type": "string",
            "description": "Rendering engine. playwright:chrome (default) handles JS and bot protection. cheerio is 3-5x faster and cheaper for static HTML sites.",
            "default": "playwright:chrome"
          },
          "ignoreSslErrors": {
            "title": "Ignore SSL Errors",
            "type": "boolean",
            "description": "Ignore SSL/TLS certificate errors (e.g. self-signed certificates).",
            "default": false
          },
          "maxCrawlDepth": {
            "title": "Max Crawl Depth",
            "minimum": 0,
            "maximum": 100,
            "type": "integer",
            "description": "Maximum link depth from the seed URL. Depth 0 = only the seed URL. Depth 1 = seed + directly linked pages.",
            "default": 5
          },
          "maxCrawlPages": {
            "title": "Max Saved Pages",
            "minimum": 1,
            "maximum": 100000,
            "type": "integer",
            "description": "Maximum number of pages to save to the dataset. Filtered pages (language, content length, duplicates) do NOT count.",
            "default": 100
          },
          "maxConcurrency": {
            "title": "Max Concurrency",
            "minimum": 1,
            "maximum": 20,
            "type": "integer",
            "description": "Maximum parallel browser/HTTP requests. Lower = less chance of rate-limiting.",
            "default": 3
          },
          "requestTimeoutSecs": {
            "title": "Request Timeout (seconds)",
            "minimum": 5,
            "maximum": 300,
            "type": "integer",
            "description": "Hard timeout per page. The crawl is aborted if a page takes longer. This is strictly enforced (unlike some other crawlers).",
            "default": 60
          },
          "globs": {
            "title": "Include URL Patterns (globs)",
            "type": "array",
            "description": "Only crawl URLs matching these glob patterns (e.g. 'https://example.com/blog/**'). Leave empty to crawl all pages within the seed domain.",
            "items": {
              "type": "string"
            }
          },
          "excludeGlobs": {
            "title": "Exclude URL Patterns (globs)",
            "type": "array",
            "description": "Skip URLs matching these glob patterns (e.g. '**/*.pdf', '**/tag/**').",
            "items": {
              "type": "string"
            }
          },
          "useSitemaps": {
            "title": "Discover via sitemap.xml",
            "type": "boolean",
            "description": "Auto-fetch {domain}/sitemap.xml and enqueue all discovered URLs. Faster for large sites.",
            "default": false
          },
          "htmlTransformer": {
            "title": "Content Extraction Method",
            "enum": [
              "readability",
              "raw"
            ],
            "type": "string",
            "description": "How to extract page content. 'readability' uses Mozilla Readability (Firefox Reader View) for clean article extraction. 'raw' uses the full body HTML.",
            "default": "readability"
          },
          "removeElementsCssSelector": {
            "title": "Remove Elements (CSS Selector)",
            "type": "string",
            "description": "Remove matching elements before extraction. Default: nav, footer, header, aside, cookie banners, ads. Override with your own selectors."
          },
          "keepElementsCssSelector": {
            "title": "Keep Only Elements (CSS Selector)",
            "type": "string",
            "description": "When set, extract ONLY content inside matching elements (removes everything else first)."
          },
          "includeImages": {
            "title": "Include Images in Markdown",
            "type": "boolean",
            "description": "Keep image Markdown (![alt](url)) in output. Disabled by default to reduce noise for LLM use cases.",
            "default": false
          },
          "aggressivePrune": {
            "title": "Aggressive Pruning",
            "type": "boolean",
            "description": "Remove additional noisy elements: sidebars, social buttons, share widgets, comment sections, related-article widgets.",
            "default": false
          },
          "waitForSelector": {
            "title": "Wait for CSS Selector (Playwright only)",
            "type": "string",
            "description": "Wait until this element appears before extracting content. Useful for heavily dynamic pages."
          },
          "dismissCookieBanners": {
            "title": "Auto-dismiss Cookie Banners (Playwright only)",
            "type": "boolean",
            "description": "Automatically click Accept on common cookie consent dialogs before extracting content.",
            "default": true
          },
          "expandClickableElements": {
            "title": "Expand Clickable Elements (Playwright only)",
            "type": "boolean",
            "description": "Auto-click 'Read more', 'Show more' buttons and expand collapsed sections before extracting content.",
            "default": false
          },
          "languageFilter": {
            "title": "Language Filter",
            "type": "array",
            "description": "Only save pages in these languages (ISO 639-1 codes, e.g. ['en', 'de']). Pages in other languages are skipped but do NOT count against maxCrawlPages. Leave empty for all languages.",
            "items": {
              "type": "string"
            }
          },
          "contentMinLength": {
            "title": "Minimum Content Length (chars)",
            "minimum": 0,
            "type": "integer",
            "description": "Skip pages with fewer than this many characters of extracted text. Prevents saving empty pages, 403 error pages, redirects, etc.",
            "default": 100
          },
          "deduplicateByContent": {
            "title": "Deduplicate by Content Hash",
            "type": "boolean",
            "description": "Skip pages with identical content (MD5 of text). Goes beyond URL/canonical deduplication — catches pages with different URLs but the same body.",
            "default": false
          },
          "saveMarkdown": {
            "title": "Save Markdown",
            "type": "boolean",
            "description": "Include the 'markdown' field in each output record.",
            "default": true
          },
          "saveText": {
            "title": "Save Plain Text",
            "type": "boolean",
            "description": "Include the 'text' (plain text) field in each output record.",
            "default": true
          },
          "saveHtml": {
            "title": "Save Cleaned HTML",
            "type": "boolean",
            "description": "Save the cleaned page HTML to the key-value store and include the URL in 'htmlUrl'.",
            "default": false
          },
          "saveScreenshots": {
            "title": "Save Screenshots (Playwright only)",
            "type": "boolean",
            "description": "Capture a screenshot of each page and save to the key-value store. Adds 'screenshotUrl' to each record.",
            "default": false
          },
          "extractMetadata": {
            "title": "Extract Rich Metadata",
            "type": "boolean",
            "description": "Extract OG tags, JSON-LD structured data, author, publish date, modified date, language, hreflang, Twitter Card, word count, reading time, token estimate, and content type classification.",
            "default": true
          },
          "extractLinks": {
            "title": "Extract Links",
            "type": "boolean",
            "description": "Include arrays of internal links, external links, and PDF links found on each page.",
            "default": false
          },
          "chunkContent": {
            "title": "Chunk Content for LLM/RAG",
            "type": "boolean",
            "description": "Split Markdown into semantic chunks and add a 'chunks' array to each record. Each chunk includes text, position, and token estimate. Eliminates need for LangChain/LlamaIndex post-processing.",
            "default": false
          },
          "chunkSize": {
            "title": "Chunk Size (characters)",
            "minimum": 100,
            "maximum": 50000,
            "type": "integer",
            "description": "Target maximum characters per chunk. Chunks respect paragraph boundaries.",
            "default": 2000
          },
          "chunkOverlap": {
            "title": "Chunk Overlap (characters)",
            "minimum": 0,
            "maximum": 2000,
            "type": "integer",
            "description": "Characters of overlap between consecutive chunks for context continuity.",
            "default": 200
          },
          "initialCookies": {
            "title": "Initial Cookies",
            "type": "array",
            "description": "Cookies to inject before crawling (e.g. for authenticated sessions). Each entry: {name, value, domain, path}.",
            "items": {
              "type": "object"
            }
          },
          "customHeaders": {
            "title": "Custom HTTP Headers",
            "type": "object",
            "description": "Additional headers to send with every request (e.g. {\"Authorization\": \"Bearer token\"})."
          },
          "userAgent": {
            "title": "Custom User-Agent",
            "type": "string",
            "description": "Override the browser's User-Agent string."
          },
          "requestsPerMinute": {
            "title": "Requests per Minute (rate limit)",
            "minimum": 0,
            "type": "integer",
            "description": "Maximum requests per minute to be polite to the target server. 0 = no limit.",
            "default": 0
          },
          "proxyConfiguration": {
            "title": "Proxy Configuration",
            "type": "object",
            "description": "Proxy settings. Built-in residential proxy is used by default. Override here only if you need a specific proxy group (e.g. RESIDENTIAL for heavily protected sites)."
          },
          "debugMode": {
            "title": "Debug Mode",
            "type": "boolean",
            "description": "Add 'extractedBy' field to each record showing whether Readability or raw extraction was used.",
            "default": false
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}