{
  "openapi": "3.0.1",
  "info": {
    "title": "Website Content Extractor for RAG: Markdown, HTML, Text",
    "description": "Turn docs sites, help centers, blogs, and websites into clean markdown, text, or HTML for RAG, AI knowledge bases, and internal search. Crawl from start URLs or sitemaps and keep the crawl in scope.",
    "version": "0.1",
    "x-build-id": "D3m79KEDVJDrqhjFs"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/nezha~website-content-crawler/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-nezha-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns the Actor's dataset items in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/nezha~website-content-crawler/runs": {
      "post": {
        "operationId": "runs-sync-nezha-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/nezha~website-content-crawler/run-sync": {
      "post": {
        "operationId": "run-sync-nezha-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns the OUTPUT record from the key-value store in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls"
        ],
        "properties": {
          "startUrls": {
            "title": "Website or Docs URLs",
            "type": "array",
            "description": "One or more docs, help center, blog, or website URLs to crawl.",
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "crawlMode": {
            "title": "How To Discover Pages",
            "enum": [
              "website",
              "sitemap"
            ],
            "type": "string",
            "description": "Choose whether to follow links from website pages or load URLs from sitemaps. Sitemap mode is the fastest and most stable choice for a first run on docs sites.",
            "default": "sitemap"
          },
          "sitemapUrls": {
            "title": "Sitemap URLs",
            "type": "array",
            "description": "Optional sitemap URLs, used in sitemap mode. If empty, the Actor tries /sitemap.xml at the origin of each start URL.",
            "items": {
              "type": "string"
            }
          },
          "maxPages": {
            "title": "Max Pages",
            "minimum": 1,
            "type": "integer",
            "description": "Maximum number of pages to extract in one run. Keep the default of 3 for a fast preview, then increase it after validating the output.",
            "default": 3
          },
          "maxDepth": {
            "title": "Max Depth",
            "minimum": 0,
            "type": "integer",
            "description": "How deep the crawler can follow links from the start URLs.",
            "default": 2
          },
          "outputFormat": {
            "title": "Main Content Format",
            "enum": [
              "markdown",
              "text",
              "html"
            ],
            "type": "string",
            "description": "Choose what the main content field should store in the dataset.",
            "default": "markdown"
          },
          "sameDomainOnly": {
            "title": "Stay In Target Site Scope",
            "type": "boolean",
            "description": "When enabled, only crawl links within the same origin and path scope as the start URLs.",
            "default": true
          },
          "contentSelector": {
            "title": "Content Selector",
            "type": "string",
            "description": "Optional CSS selector for the content root. Falls back to main/article/body."
          },
          "removeSelectors": {
            "title": "Remove Selectors",
            "type": "array",
            "description": "Optional CSS selectors to remove before extracting content.",
            "items": {
              "type": "string"
            }
          },
          "includeUrlGlobs": {
            "title": "Include URL Globs",
            "type": "array",
            "description": "Optional glob patterns for links you want to keep.",
            "items": {
              "type": "string"
            }
          },
          "excludeUrlGlobs": {
            "title": "Exclude URL Globs",
            "type": "array",
            "description": "Optional glob patterns for links you want to skip.",
            "items": {
              "type": "string"
            }
          },
          "excludeFileExtensions": {
            "title": "Exclude File Extensions",
            "type": "array",
            "description": "File extensions to skip, for example pdf, jpg, png, or zip.",
            "items": {
              "type": "string"
            }
          },
          "minTextLength": {
            "title": "Min Text Length",
            "minimum": 0,
            "type": "integer",
            "description": "Skip thin pages when extracted text is shorter than this number of characters.",
            "default": 0
          },
          "waitForSelector": {
            "title": "Wait For Selector",
            "type": "string",
            "description": "Optional CSS selector to wait for before extracting content."
          },
          "navigationTimeoutSecs": {
            "title": "Navigation Timeout (secs)",
            "minimum": 15,
            "type": "integer",
            "description": "Timeout, in seconds, for page navigation and for waiting on the optional selector.",
            "default": 25
          },
          "saveCleanHtml": {
            "title": "Store Clean HTML Separately",
            "type": "boolean",
            "description": "Store cleaned HTML separately in key-value store records and index them in CLEAN_HTML_INDEX for downstream chunking or parsing. Disable for the fastest preview run.",
            "default": false
          },
          "proxyConfiguration": {
            "title": "Proxy Configuration",
            "type": "object",
            "description": "Optional Apify proxy configuration."
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}