{
  "openapi": "3.0.1",
  "info": {
    "title": "Docs-to-RAG AI Crawler",
    "description": "Stop wasting space on website headers, footers, cookie banners, and navigation menus.\n\nExtract clean body text, chunk it for RAG, and detect page changes across runs crawling public docs, blogs, and knowledge bases,",
    "version": "0.0",
    "x-build-id": "z0fvOAl49PZ0wGc17"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/charitable_jeopardy~WebScraperAp/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-charitable_jeopardy-WebScraperAp",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/charitable_jeopardy~WebScraperAp/runs": {
      "post": {
        "operationId": "runs-sync-charitable_jeopardy-WebScraperAp",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/charitable_jeopardy~WebScraperAp/run-sync": {
      "post": {
        "operationId": "run-sync-charitable_jeopardy-WebScraperAp",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "properties": {
          "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "Public seed URLs to fetch and crawl from.",
            "default": [
              {
                "url": "https://example.com/"
              }
            ],
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "sitemapUrls": {
            "title": "Sitemap URLs",
            "type": "array",
            "description": "Sitemap XML, sitemap index, or plain text URL list sources.",
            "default": [],
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "maxPagesPerSite": {
            "title": "Max pages per site",
            "minimum": 1,
            "maximum": 10000,
            "type": "integer",
            "description": "The maximum number of successfully crawled pages per hostname/site.",
            "default": 1
          },
          "includePatterns": {
            "title": "Include URL patterns",
            "type": "array",
            "description": "Glob patterns. Empty allows all in-scope URLs unless excluded.",
            "default": [],
            "items": {
              "type": "string"
            }
          },
          "excludePatterns": {
            "title": "Exclude URL patterns",
            "type": "array",
            "description": "Glob patterns. Exclusions take precedence.",
            "default": [],
            "items": {
              "type": "string"
            }
          },
          "crawlDepth": {
            "title": "Crawl depth",
            "minimum": 0,
            "maximum": 10,
            "type": "integer",
            "description": "The maximum depth of links to traverse from seed URLs (sitemaps are depth 0).",
            "default": 0
          },
          "maxCrawlRetries": {
            "title": "Max crawl retries",
            "minimum": 0,
            "maximum": 5,
            "type": "integer",
            "description": "The maximum number of retry attempts for failed requests.",
            "default": 1
          },
          "useBrowserRendering": {
            "title": "Use browser rendering",
            "type": "boolean",
            "description": "Enable to render pages using a headless browser (Playwright/Chrome) for JS-heavy sites.",
            "default": false
          },
          "languageDetection": {
            "title": "Language detection",
            "type": "boolean",
            "description": "Enable to detect primary language of clean text.",
            "default": true
          },
          "chunkText": {
            "title": "Chunk text",
            "type": "boolean",
            "description": "Enable to split extracted text into smaller chunk records for RAG.",
            "default": false
          },
          "chunkSize": {
            "title": "Chunk size",
            "minimum": 200,
            "maximum": 5000,
            "type": "integer",
            "description": "Target character length of each text chunk.",
            "default": 1000
          },
          "chunkOverlap": {
            "title": "Chunk overlap",
            "minimum": 0,
            "maximum": 1000,
            "type": "integer",
            "description": "The number of overlapping characters between consecutive chunks.",
            "default": 150
          },
          "outputFormat": {
            "title": "Output format",
            "enum": [
              "pages",
              "chunks",
              "pagesAndChunks"
            ],
            "type": "string",
            "description": "Determines records written to the default Dataset.",
            "default": "pages"
          },
          "detectChanges": {
            "title": "Detect changes",
            "type": "boolean",
            "description": "Enable to compare content hashes against prior runs using a persistent store.",
            "default": false
          },
          "storeRawHtml": {
            "title": "Store raw HTML",
            "type": "boolean",
            "description": "Enable to store raw fetched/rendered HTML in the default Key-Value Store.",
            "default": false
          },
          "storeCleanText": {
            "title": "Store clean text",
            "type": "boolean",
            "description": "Include cleanText in page records (always used internally for chunking/hashing).",
            "default": true
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}