{
  "openapi": "3.0.1",
  "info": {
    "title": "Website Content Crawler",
    "description": "Crawl any website and ship clean Markdown, plain text, and HTML for AI, LLM, and RAG pipelines. Each row carries token estimates, JSON-LD metadata, a link graph, and optional automatic chunk splitting for vector databases. Pay per page.",
    "version": "0.1",
    "x-build-id": "g4fnZZKLCNWKg5PeM"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/scrapemint~website-content-crawler/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-scrapemint-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/scrapemint~website-content-crawler/runs": {
      "post": {
        "operationId": "runs-sync-scrapemint-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/scrapemint~website-content-crawler/run-sync": {
      "post": {
        "operationId": "run-sync-scrapemint-website-content-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls"
        ],
        "properties": {
          "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start the crawl from. Each URL is treated as the entry point for that website.",
            "default": [],
            "items": {
              "type": "string"
            }
          },
          "crawlerType": {
            "title": "Crawler type",
            "enum": [
              "adaptive",
              "playwright",
              "cheerio"
            ],
            "type": "string",
            "description": "Adaptive switches between a real browser and raw HTTP based on whether the page renders content via JavaScript. Use Playwright for heavy JS sites, Cheerio for static pages.",
            "default": "adaptive"
          },
          "maxPages": {
            "title": "Max pages",
            "minimum": 0,
            "maximum": 100000,
            "type": "integer",
            "description": "Hard cap on pages crawled per run across all start URLs. Set to 0 for unlimited.",
            "default": 25
          },
          "maxDepth": {
            "title": "Max link depth",
            "minimum": 0,
            "maximum": 20,
            "type": "integer",
            "description": "How many link hops away from the start URL the crawler is allowed to follow. 0 means only the start URLs.",
            "default": 3
          },
          "useSitemap": {
            "title": "Use sitemap",
            "type": "boolean",
            "description": "Auto-discover sitemap.xml, sitemap_index.xml, and robots.txt sitemap entries to seed the crawl. Recommended for large sites.",
            "default": true
          },
          "respectRobotsTxt": {
            "title": "Respect robots.txt",
            "type": "boolean",
            "description": "Skip URLs disallowed by the site's robots.txt. Turn off for internal scraping work where you have a contract.",
            "default": true
          },
          "includeUrlPatterns": {
            "title": "Include URL patterns",
            "type": "array",
            "description": "Glob patterns. Pages must match at least one of these to be crawled. Empty means everything on the same domain. Examples: '**/docs/**', '**/blog/*'.",
            "default": [],
            "items": {
              "type": "string"
            }
          },
          "excludeUrlPatterns": {
            "title": "Exclude URL patterns",
            "type": "array",
            "description": "Glob patterns. Pages matching any of these are skipped. Examples: '**/login/**', '**/api/**', '**/*.zip'.",
            "default": [
              "**/login/**",
              "**/signup/**",
              "**/cart/**"
            ],
            "items": {
              "type": "string"
            }
          },
          "stayOnDomain": {
            "title": "Stay on the same domain",
            "type": "boolean",
            "description": "Only follow links on the same registrable domain as the start URL. When enabled, subdomains count as the same domain (e.g. apify.com and docs.apify.com).",
            "default": true
          },
          "stayOnSubdomain": {
            "title": "Stay on the same subdomain",
            "type": "boolean",
            "description": "Only follow links on the exact same hostname as the start URL. Stricter than stayOnDomain.",
            "default": false
          },
          "removeFluff": {
            "title": "Remove navigation, footers, ads, and modals",
            "type": "boolean",
            "description": "Strip nav, footer, header, aside, ads, cookie banners, and modals before extracting content. Recommended for AI pipelines.",
            "default": true
          },
          "extractor": {
            "title": "Main content extractor",
            "enum": [
              "auto",
              "readability",
              "main",
              "body"
            ],
            "type": "string",
            "description": "Auto picks the best result from Readability, a custom main detector, and a body fallback. Force one if your pipeline needs consistency.",
            "default": "auto"
          },
          "outputFormats": {
            "title": "Output formats per page",
            "type": "array",
            "description": "Each row carries the formats you select. Markdown is the default for AI pipelines. Plain text is best for token-tight LLM contexts. HTML is the cleaned, post-extraction HTML.",
            "items": {
              "type": "string",
              "enum": [
                "markdown",
                "text",
                "html"
              ],
              "enumTitles": [
                "Markdown",
                "Plain text",
                "Cleaned HTML"
              ]
            },
            "default": [
              "markdown",
              "text"
            ]
          },
          "minContentLength": {
            "title": "Minimum content length",
            "minimum": 0,
            "maximum": 100000,
            "type": "integer",
            "description": "Drop pages whose extracted content is shorter than this many characters. Useful for filtering out empty templates and 404 pages.",
            "default": 100
          },
          "chunkOutput": {
            "title": "Auto split into RAG chunks",
            "type": "boolean",
            "description": "Push one row per chunk instead of one row per page. Each chunk row carries url, chunkIndex, totalChunks, markdown, tokens, and the page metadata. Built for vector database ingestion.",
            "default": false
          },
          "chunkSize": {
            "title": "Chunk size in tokens",
            "minimum": 100,
            "maximum": 8192,
            "type": "integer",
            "description": "Target token count per chunk when chunkOutput is on. 800 to 1000 works well for most embedding models.",
            "default": 1000
          },
          "chunkOverlap": {
            "title": "Chunk overlap in tokens",
            "minimum": 0,
            "maximum": 2000,
            "type": "integer",
            "description": "Tokens of overlap between consecutive chunks. Helps preserve context at chunk boundaries during retrieval.",
            "default": 100
          },
          "redactPII": {
            "title": "Redact PII",
            "type": "boolean",
            "description": "Replace emails, phone numbers, IBANs, and US Social Security numbers with [REDACTED] tokens before output. Useful for GDPR-safe RAG indexing.",
            "default": false
          },
          "extractMetadata": {
            "title": "Extract metadata",
            "type": "boolean",
            "description": "Pull JSON-LD article and product schemas, OpenGraph tags, author, publish date, and modified date for every page.",
            "default": true
          },
          "extractLinks": {
            "title": "Extract link graph",
            "type": "boolean",
            "description": "Each row carries outbound link count split by internal vs external, plus a sample of up to 25 link URLs.",
            "default": true
          },
          "infiniteScroll": {
            "title": "Trigger infinite scroll",
            "type": "boolean",
            "description": "Scroll the page in stages so lazy-loaded content renders before extraction. Playwright crawler only.",
            "default": false
          },
          "waitForSelector": {
            "title": "Wait for selector",
            "type": "string",
            "description": "Optional CSS selector. The crawler waits for this element before extracting content. Playwright crawler only.",
            "default": ""
          },
          "cookies": {
            "title": "Cookies",
            "type": "array",
            "description": "Cookies to set before crawling. Use this for pages behind a login. Format: an array of objects, each with name, value, and domain fields.",
            "default": []
          },
          "downloadFiles": {
            "title": "Download linked files",
            "type": "boolean",
            "description": "Save linked files such as PDF, DOC, DOCX, XLS, XLSX, and CSV to the key-value store. The extensions to download are selected with downloadFileTypes. Useful for indexing knowledge bases and research libraries.",
            "default": false
          },
          "downloadFileTypes": {
            "title": "File extensions to download",
            "type": "array",
            "description": "File extensions the crawler should download when downloadFiles is on.",
            "items": {
              "type": "string",
              "enum": [
                "pdf",
                "doc",
                "docx",
                "xls",
                "xlsx",
                "csv",
                "txt",
                "json",
                "xml"
              ]
            },
            "default": [
              "pdf",
              "doc",
              "docx"
            ]
          },
          "concurrency": {
            "title": "Concurrency",
            "minimum": 1,
            "maximum": 64,
            "type": "integer",
            "description": "Pages processed in parallel. Eight is a safe default. Drop to two or three for sites with strict rate limits.",
            "default": 8
          },
          "requestTimeoutSecs": {
            "title": "Request timeout in seconds",
            "minimum": 5,
            "maximum": 600,
            "type": "integer",
            "description": "Per page timeout. Long pages or slow sites may need 60 to 90 seconds.",
            "default": 45
          },
          "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Apify proxy. Datacenter is fine for most documentation sites. Use residential for sites with anti-scraping protection.",
            "default": {
              "useApifyProxy": true
            }
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}