{
  "openapi": "3.0.1",
  "info": {
    "title": "Website Content Extractor for RAG: Markdown, HTML, Text",
    "description": "Turn docs sites, help centers, blogs, and websites into clean markdown, text, or HTML for RAG, AI knowledge bases, and internal search. Crawl from start URLs or sitemaps and keep the crawl in scope.",
    "version": "0.1",
    "x-build-id": "53A2pGaHqToLuzBCc"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/nezha~website-content-crawler/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-nezha-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns the Actor's dataset items in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/nezha~website-content-crawler/runs": {
      "post": {
        "operationId": "runs-sync-nezha-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/nezha~website-content-crawler/run-sync": {
      "post": {
        "operationId": "run-sync-nezha-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns the OUTPUT record from the key-value store in the response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls"
        ],
        "properties": {
          "startUrls": {
            "title": "Website, Docs, or Help Center URLs",
            "type": "array",
            "description": "Paste the page or section you want to turn into clean content. Use a docs homepage, help center category, blog index, or a specific website section. The URL path matters when Target Scope Only (sameDomainOnly) is enabled, because the crawl then stays within that domain and path.",
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "maxPages": {
            "title": "Max Pages to Extract",
            "minimum": 1,
            "type": "integer",
            "description": "Maximum number of pages saved to the dataset. Keep the default of 3 for a quick preview, then raise it for a full crawl. This is the main control for runtime and cost.",
            "default": 3
          },
          "crawlMode": {
            "title": "Page Discovery Method",
            "enum": [
              "auto",
              "website",
              "sitemap"
            ],
            "type": "string",
            "description": "Auto (auto) first tries sitemap URLs for fast coverage. If no crawlable sitemap pages are found, it falls back to following links from your start URLs. Choose Website links (website) for section-by-section crawling, or choose Sitemap only (sitemap) when you want strict sitemap discovery.",
            "default": "auto"
          },
          "sitemapUrls": {
            "title": "Sitemap URLs",
            "type": "array",
            "description": "Optional sitemap.xml URLs. Leave empty to try /sitemap.xml on each start URL's domain. Useful for docs sites, help centers, and blogs with reliable sitemaps.",
            "items": {
              "type": "string"
            }
          },
          "maxDepth": {
            "title": "Link Depth",
            "minimum": 0,
            "type": "integer",
            "description": "How many link levels to follow in Website links mode (crawlMode website). Use 0 to extract only the pasted URLs, 1 to include pages linked from them, and 2+ for broader section crawls. Ignored in Sitemap only mode (crawlMode sitemap).",
            "default": 1
          },
          "sameDomainOnly": {
            "title": "Target Scope Only",
            "type": "boolean",
            "description": "Keep crawled pages inside the same domain and path as the start URLs. Example: starting from /docs only keeps /docs pages. Turn off only when you intentionally want cross-domain or wider-site links.",
            "default": true
          },
          "outputFormat": {
            "title": "Main Content Format",
            "enum": [
              "markdown",
              "text",
              "html"
            ],
            "type": "string",
            "description": "Controls the dataset content field. Markdown is best for most RAG and vector database pipelines, plain text is best for lightweight search, and HTML preserves more structure.",
            "default": "markdown"
          },
          "saveCleanHtml": {
            "title": "Store Clean HTML Records",
            "type": "boolean",
            "description": "Also store each page's cleaned HTML in key-value store records and list them in CLEAN_HTML_INDEX. Leave off for quick previews; enable when your downstream pipeline needs separate HTML files.",
            "default": false
          },
          "contentSelector": {
            "title": "Main Content CSS Selector",
            "type": "string",
            "description": "Optional CSS selector for the content area, such as main, article, .docs-content, or #content. Leave empty to auto-detect main, article, [role=main], then body."
          },
          "removeSelectors": {
            "title": "Remove CSS Selectors",
            "type": "array",
            "description": "Optional selectors to remove before extraction, such as .sidebar, .cookie-banner, .newsletter, .toc, or .ads. Use this when output includes navigation or repeated page chrome.",
            "items": {
              "type": "string"
            }
          },
          "minTextLength": {
            "title": "Minimum Text Length",
            "minimum": 0,
            "type": "integer",
            "description": "Skip pages with less extracted text than this number of characters. Keep 0 for the first run. Increase it later to remove empty, redirect, or index pages.",
            "default": 0
          },
          "includeUrlGlobs": {
            "title": "Include URL Patterns",
            "type": "array",
            "description": "Optional Crawlee glob patterns for URLs to keep, for example **/docs/** or **/help/**. Leave empty unless you need a narrower crawl.",
            "items": {
              "type": "string"
            }
          },
          "excludeUrlGlobs": {
            "title": "Exclude URL Patterns",
            "type": "array",
            "description": "Optional Crawlee glob patterns for URLs to skip, for example **/search/**, **/login**, **?*utm_*, or **/*.pdf.",
            "items": {
              "type": "string"
            }
          },
          "excludeFileExtensions": {
            "title": "Extra File Extensions to Skip",
            "type": "array",
            "description": "The Actor already skips common non-HTML files such as pdf, images, videos, Office files, and archives. Add extra extensions here only when your target site uses custom downloadable file types.",
            "items": {
              "type": "string"
            }
          },
          "waitForSelector": {
            "title": "Wait for CSS Selector",
            "type": "string",
            "description": "Optional selector to wait for before extraction, such as main or .article-body. Use this for JavaScript-rendered sites where content appears after initial page load."
          },
          "navigationTimeoutSecs": {
            "title": "Page Load Timeout",
            "minimum": 15,
            "type": "integer",
            "description": "Seconds to wait for page navigation and, when Wait for CSS Selector (waitForSelector) is set, for that selector to appear. Increase for slow JavaScript-heavy sites.",
            "default": 25
          },
          "proxyConfiguration": {
            "title": "Proxy Configuration",
            "type": "object",
            "description": "Optional Apify proxy configuration. The default direct connection is usually enough for public docs, help centers, and blogs."
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}