{
  "openapi": "3.0.1",
  "info": {
    "title": "News Source Crawler",
    "description": "Given a news website URL, discover and extract articles with full metadata with title, authors, publish date, body text, top image, keywords, and summary. Works with any news site via sitemap or HTML discovery.",
    "version": "0.1",
    "x-build-id": "hu96H1gun4janft5C"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/crawlerbros~news-source-crawler/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-crawlerbros-news-source-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/crawlerbros~news-source-crawler/runs": {
      "post": {
        "operationId": "runs-sync-crawlerbros-news-source-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/crawlerbros~news-source-crawler/run-sync": {
      "post": {
        "operationId": "run-sync-crawlerbros-news-source-crawler",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "websiteUrl"
        ],
        "properties": {
          "websiteUrl": {
            "title": "News website URL",
            "pattern": "^https?://.+",
            "type": "string",
            "description": "Root URL of the news site to crawl. The crawler will discover articles via sitemap first, then fall back to HTML link extraction from the homepage."
          },
          "maxArticles": {
            "title": "Maximum articles",
            "minimum": 1,
            "maximum": 500,
            "type": "integer",
            "description": "Hard cap on the number of articles to extract.",
            "default": 20
          },
          "keywordFilter": {
            "title": "Keyword filter",
            "type": "string",
            "description": "Boolean expression applied to article title and body. Supports AND, OR, NOT and parentheses, case-insensitive. Examples: `AI AND NOT crypto`, `(startup OR funding) AND SaaS`. Leave blank to keep every article."
          },
          "minWordCount": {
            "title": "Minimum article word count",
            "minimum": 0,
            "maximum": 10000,
            "type": "integer",
            "description": "Drop articles whose extracted body text is shorter than this.",
            "default": 100
          },
          "concurrency": {
            "title": "Concurrency",
            "minimum": 1,
            "maximum": 20,
            "type": "integer",
            "description": "Number of articles fetched in parallel. Higher is faster but more likely to trigger rate limits on small sites.",
            "default": 5
          },
          "extractKeywords": {
            "title": "Extract keywords",
            "type": "boolean",
            "description": "Compute the top 10 most frequent content words per article (stopwords removed).",
            "default": true
          },
          "extractSummary": {
            "title": "Extract summary",
            "type": "boolean",
            "description": "Emit the first three sentences of each article as a summary, with a meta-description fallback.",
            "default": true
          },
          "language": {
            "title": "Article language",
            "enum": [
              "auto",
              "en",
              "de",
              "fr",
              "es",
              "it",
              "pt",
              "nl",
              "sv",
              "da",
              "no",
              "fi",
              "pl",
              "ru",
              "uk",
              "cs",
              "sk",
              "hu",
              "ro",
              "bg",
              "hr",
              "sr",
              "sl",
              "et",
              "lt",
              "lv",
              "el",
              "tr",
              "he",
              "ar",
              "fa",
              "hi",
              "bn",
              "th",
              "vi",
              "id",
              "ms",
              "ja",
              "ko",
              "zh"
            ],
            "type": "string",
            "description": "Language hint for the articles. Use `auto` to detect from site metadata.",
            "default": "auto"
          },
          "autoProxyFallback": {
            "title": "Auto-retry via proxy when direct fetch finds zero candidates",
            "type": "boolean",
            "description": "If the direct datacenter-IP fetch of the sitemap/homepage returns 0 article candidates (site likely blocks scrapers), automatically retry discovery + article fetching through Apify residential proxy. Only blocked sites burn proxy credits. Turn off to guarantee zero proxy spend.",
            "default": true
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}