{
  "openapi": "3.0.1",
  "info": {
    "title": "RAG Web Extractor — Chunked Content for AI Pipelines",
    "description": "Extract clean markdown from websites for RAG pipelines. Strip nav, ads, boilerplate. Preserve headings, links, images. Recursive crawling with depth control. Chunked output for embedding pipelines. Build AI knowledge bases.",
    "version": "1.0",
    "x-build-id": "Xxw5OJrxVbZA1jbP3"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/junipr~rag-web-extractor/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-junipr-rag-web-extractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/junipr~rag-web-extractor/runs": {
      "post": {
        "operationId": "runs-sync-junipr-rag-web-extractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/junipr~rag-web-extractor/run-sync": {
      "post": {
        "operationId": "run-sync-junipr-rag-web-extractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "properties": {
          "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "List of URLs to scrape. Only the URL field is required per entry.",
            "default": [
              {
                "url": "https://crawlee.dev/docs/introduction"
              }
            ],
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "maxPages": {
            "title": "Max Pages",
            "minimum": 1,
            "maximum": 100000,
            "type": "integer",
            "description": "Maximum number of pages to scrape per run.",
            "default": 100
          },
          "maxDepth": {
            "title": "Max Crawl Depth",
            "minimum": 0,
            "maximum": 10,
            "type": "integer",
            "description": "Link-following depth from start URLs. 0 = only start URLs.",
            "default": 0
          },
          "outputFormats": {
            "title": "Output Formats",
            "type": "array",
            "description": "Output format(s) to generate. Multiple allowed. Valid values: markdown, plainText, structuredJson, html",
            "items": {
              "type": "string",
              "enum": [
                "markdown",
                "plainText",
                "structuredJson",
                "html"
              ]
            },
            "default": [
              "markdown"
            ]
          },
          "includeRawHtml": {
            "title": "Include Raw HTML",
            "type": "boolean",
            "description": "Include cleaned raw HTML in output alongside selected formats.",
            "default": false
          },
          "enableChunking": {
            "title": "Enable Content Chunking",
            "type": "boolean",
            "description": "Split content into chunks for RAG pipelines.",
            "default": false
          },
          "chunkSize": {
            "title": "Chunk Size (characters)",
            "minimum": 100,
            "maximum": 10000,
            "type": "integer",
            "description": "Target chunk size in characters.",
            "default": 1000
          },
          "chunkOverlap": {
            "title": "Chunk Overlap (characters)",
            "minimum": 0,
            "maximum": 5000,
            "type": "integer",
            "description": "Overlap between consecutive chunks in characters. Max is half of chunk size.",
            "default": 200
          },
          "chunkStrategy": {
            "title": "Chunking Strategy",
            "enum": [
              "semantic",
              "fixed",
              "sentence"
            ],
            "type": "string",
            "description": "How to split content into chunks. Semantic splits on heading/paragraph boundaries. Fixed splits by exact character count. Sentence splits on sentence boundaries.",
            "default": "semantic"
          },
          "renderJs": {
            "title": "Render JavaScript",
            "type": "boolean",
            "description": "Use headless browser for JavaScript rendering. Disable for static sites to save compute.",
            "default": true
          },
          "waitForSelector": {
            "title": "Wait for Selector",
            "type": "string",
            "description": "CSS selector to wait for before extraction. Useful for SPAs where content loads dynamically."
          },
          "waitForTimeout": {
            "title": "Wait Timeout (ms)",
            "minimum": 1000,
            "maximum": 60000,
            "type": "integer",
            "description": "Maximum milliseconds to wait for page load.",
            "default": 5000
          },
          "handleInfiniteScroll": {
            "title": "Handle Infinite Scroll",
            "type": "boolean",
            "description": "Scroll to bottom to load lazy content.",
            "default": false
          },
          "maxScrolls": {
            "title": "Max Scroll Iterations",
            "minimum": 1,
            "maximum": 100,
            "type": "integer",
            "description": "Maximum scroll iterations for infinite scroll pages.",
            "default": 20
          },
          "handlePagination": {
            "title": "Handle Pagination",
            "type": "boolean",
            "description": "Follow pagination links automatically.",
            "default": false
          },
          "paginationSelector": {
            "title": "Pagination Selector",
            "type": "string",
            "description": "CSS selector for 'next page' button/link. Auto-detected if not set."
          },
          "paginationMaxPages": {
            "title": "Max Pagination Pages",
            "minimum": 1,
            "maximum": 100,
            "type": "integer",
            "description": "Maximum pagination pages to follow per start URL.",
            "default": 10
          },
          "removeSelectors": {
            "title": "Remove Selectors",
            "type": "array",
            "description": "CSS selectors for elements to remove before extraction (e.g., nav, footer, ads).",
            "items": {
              "type": "string"
            },
            "default": []
          },
          "includeSelectors": {
            "title": "Include Selectors",
            "type": "array",
            "description": "If set, ONLY extract content from these selectors. Overrides removeSelectors.",
            "items": {
              "type": "string"
            },
            "default": []
          },
          "removeNavigation": {
            "title": "Remove Navigation",
            "type": "boolean",
            "description": "Automatically remove <nav>, <header>, <footer> elements.",
            "default": true
          },
          "removeAds": {
            "title": "Remove Ads",
            "type": "boolean",
            "description": "Automatically remove common ad elements and iframes.",
            "default": true
          },
          "removeCookieBanners": {
            "title": "Remove Cookie Banners",
            "type": "boolean",
            "description": "Automatically remove cookie consent banners.",
            "default": true
          },
          "minContentLength": {
            "title": "Min Content Length",
            "minimum": 0,
            "maximum": 10000,
            "type": "integer",
            "description": "Skip pages with main content shorter than this many characters.",
            "default": 50
          },
          "language": {
            "title": "Language Filter",
            "type": "string",
            "description": "Filter pages by detected language (ISO 639-1 code, e.g., 'en', 'de', 'ja'). Leave empty to accept all languages."
          },
          "extractMetadata": {
            "title": "Extract Metadata",
            "type": "boolean",
            "description": "Extract page metadata (title, description, OG tags, etc.).",
            "default": true
          },
          "extractSchemaOrg": {
            "title": "Extract Schema.org",
            "type": "boolean",
            "description": "Extract schema.org / JSON-LD structured data.",
            "default": true
          },
          "extractLinks": {
            "title": "Extract Links",
            "type": "boolean",
            "description": "Extract and categorize all links on the page.",
            "default": true
          },
          "extractImages": {
            "title": "Extract Images",
            "type": "boolean",
            "description": "Extract image URLs, alt text, and dimensions.",
            "default": true
          },
          "extractTables": {
            "title": "Extract Tables",
            "type": "boolean",
            "description": "Extract HTML tables as structured JSON arrays.",
            "default": false
          },
          "proxyConfiguration": {
            "title": "Proxy Configuration",
            "type": "object",
            "description": "Proxy settings. Defaults to Apify datacenter proxies.",
            "default": {
              "useApifyProxy": true
            }
          },
          "httpHeaders": {
            "title": "HTTP Headers",
            "type": "object",
            "description": "Custom HTTP headers for all requests.",
            "default": {}
          },
          "cookies": {
            "title": "Cookies",
            "type": "array",
            "description": "Cookies to set for all requests. Each object needs name, value, and domain.",
            "items": {
              "type": "object",
              "properties": {
                "name": {
                  "type": "string",
                  "title": "Cookie Name",
                  "description": "Name of the cookie"
                },
                "value": {
                  "type": "string",
                  "title": "Cookie Value",
                  "description": "Value of the cookie"
                },
                "domain": {
                  "type": "string",
                  "title": "Cookie Domain",
                  "description": "Domain the cookie applies to"
                }
              },
              "required": [
                "name",
                "value",
                "domain"
              ]
            }
          },
          "deduplicateContent": {
            "title": "Deduplicate Content",
            "type": "boolean",
            "description": "Skip pages with content >90% similar to already-scraped pages.",
            "default": true
          },
          "respectRobotsTxt": {
            "title": "Respect robots.txt",
            "type": "boolean",
            "description": "Honor robots.txt directives.",
            "default": true
          },
          "maxRetries": {
            "title": "Max Retries",
            "minimum": 0,
            "maximum": 10,
            "type": "integer",
            "description": "Retry failed pages up to this many times.",
            "default": 3
          },
          "requestTimeout": {
            "title": "Request Timeout (ms)",
            "minimum": 5000,
            "maximum": 120000,
            "type": "integer",
            "description": "Timeout per individual page request in milliseconds.",
            "default": 30000
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}