{
  "openapi": "3.0.1",
  "info": {
    "title": "🧠 Smart Article Extractor",
    "description": null,
    "version": "0.3",
    "x-build-id": "TvaCmQgjsQalPmyJ2"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/scrapier~smart-article-extractor/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-scrapier-smart-article-extractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/scrapier~smart-article-extractor/runs": {
      "post": {
        "operationId": "runs-sync-scrapier-smart-article-extractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/scrapier~smart-article-extractor/run-sync": {
      "post": {
        "operationId": "run-sync-scrapier-smart-article-extractor",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls"
        ],
        "properties": {
          "startUrls": {
            "title": "🌐 Website / Category URLs",
            "type": "array",
            "description": "Top-level pages the crawler should start from — homepages, sections, topic pages. Each one is treated as a category page and articles are discovered from it.",
            "items": {
              "type": "string"
            }
          },
          "articleUrls": {
            "title": "📰 Article URLs",
            "type": "array",
            "description": "Already-known article URLs to extract directly (no discovery needed). Mix with Website URLs for hybrid runs.",
            "items": {
              "type": "string"
            }
          },
          "onlyNewArticles": {
            "title": "🆕 Only new articles (only for small runs)",
            "type": "boolean",
            "description": "Skip articles that were extracted in any previous run (deduplicated globally via the key-value store). Best for low-volume runs.",
            "default": false
          },
          "onlyNewArticlesPerDomain": {
            "title": "🌍 Only new articles (saved per domain, preferable)",
            "type": "boolean",
            "description": "Same as above, but the deduplication memory is kept separately per domain — preferable for multi-domain runs.",
            "default": false
          },
          "onlyInsideArticles": {
            "title": "🔗 Only inside domain articles",
            "type": "boolean",
            "description": "When enqueueing from an article, accept only links that point back to the same registrable domain.",
            "default": true
          },
          "enqueueFromArticles": {
            "title": "🧭 Enqueue articles from articles",
            "type": "boolean",
            "description": "Discover further article links inside extracted articles and add them to the crawl queue.",
            "default": false
          },
          "crawlWholeSubdomain": {
            "title": "🕸️ Crawl whole subdomain (same base as Start URL)",
            "type": "boolean",
            "description": "Treat every same-subdomain link as a potential category page (depth-limited).",
            "default": false
          },
          "onlySubdomainArticles": {
            "title": "🏷️ Limit articles to only from subdomain",
            "type": "boolean",
            "description": "Restrict articles to URLs starting with the same path prefix as the Start URL (e.g. example.com/news/*).",
            "default": false
          },
          "scanSitemaps": {
            "title": "🗺️ Find articles in sitemaps (caution)",
            "type": "boolean",
            "description": "Discover article URLs from robots.txt → Sitemap entries and the usual /sitemap.xml candidates. Disable if it produces too many noisy candidates.",
            "default": false
          },
          "sitemapUrls": {
            "title": "🗺️ Sitemap URLs (safer)",
            "type": "array",
            "description": "Explicit sitemap URLs — skips auto-discovery and only uses these. Safer than full robots.txt scanning.",
            "default": [],
            "items": {
              "type": "string"
            }
          },
          "saveHtml": {
            "title": "💾 Save full HTML",
            "type": "boolean",
            "description": "Include the full page HTML in the dataset record (produces large records).",
            "default": false
          },
          "saveHtmlAsLink": {
            "title": "🔗 Save full HTML (only as link to it)",
            "type": "boolean",
            "description": "Save HTML to the run's key-value store and put the link in the record (smaller dataset).",
            "default": false
          },
          "saveSnapshots": {
            "title": "📸 Save screenshots of article pages (browser only)",
            "type": "boolean",
            "description": "Take a PNG screenshot of every article. Only effective when the headless browser is enabled.",
            "default": false
          },
          "useGoogleBotHeaders": {
            "title": "🤖 Use Googlebot headers",
            "type": "boolean",
            "description": "Send the Googlebot User-Agent + headers. Many publishers allow Googlebot through paywalls / soft-blocks.",
            "default": false
          },
          "minWords": {
            "title": "📏 Minimum words",
            "minimum": 0,
            "maximum": 100000,
            "type": "integer",
            "description": "Reject articles whose extracted text has fewer than this many words.",
            "default": 150
          },
          "dateFrom": {
            "title": "📆 Extract articles from [date]",
            "type": "string",
            "description": "ISO date (YYYY-MM-DD). Only keep articles published on or after this date."
          },
          "onlyArticlesForLastDays": {
            "title": "🕒 Only articles for last X days",
            "minimum": 0,
            "maximum": 3650,
            "type": "integer",
            "description": "Drop anything older than X days. Combined with dateFrom, the stricter of the two wins."
          },
          "mustHaveDate": {
            "title": "📅 Must have date",
            "type": "boolean",
            "description": "Drop articles where no publication-date metadata can be detected.",
            "default": true
          },
          "isUrlArticleDefinition": {
            "title": "🧪 Is the URL an article?",
            "type": "object",
            "description": "Heuristics for classifying a URL as an article. minDashes = minimum dashes in the path, hasDate = path contains a /YYYY/MM/DD/ pattern, linkIncludes = substrings that mark a URL as an article.",
            "default": {
              "minDashes": 4,
              "hasDate": true,
              "linkIncludes": [
                "article",
                "storyid",
                "?p=",
                "id=",
                "/fpss/track",
                ".html",
                "/content/"
              ]
            }
          },
          "pseudoUrls": {
            "title": "🧩 Pseudo URLs",
            "type": "array",
            "description": "Additional URL patterns ([.*], [\\d+]) that mark a page as a crawlable category. If you want to enqueue direct article URLs this way, you have to add { \"label\": \"article\" } to the userData.",
            "default": [],
            "items": {
              "type": "object",
              "required": [
                "purl"
              ],
              "properties": {
                "purl": {
                  "type": "string",
                  "title": "Pseudo-URL of a web page"
                }
              }
            }
          },
          "linkSelector": {
            "title": "🎯 Link selector",
            "type": "string",
            "description": "Optional CSS selector restricting which parts of a category page contribute links (e.g. main a, .article-list a)."
          },
          "maxDepth": {
            "title": "🪜 Max depth",
            "minimum": 0,
            "maximum": 20,
            "type": "integer",
            "description": "Maximum BFS depth from the Start URL (Start URL = 0). Empty = no extra cap.",
            "default": 2
          },
          "maxPagesPerCrawl": {
            "title": "📃 Max pages per crawl",
            "minimum": 1,
            "maximum": 100000,
            "type": "integer",
            "description": "Hard cap on pages fetched in one run (articles + category pages combined).",
            "default": 50
          },
          "maxArticlesPerCrawl": {
            "title": "✨ Max articles per crawl",
            "minimum": 1,
            "maximum": 100000,
            "type": "integer",
            "description": "Hard cap on extracted articles per run.",
            "default": 25
          },
          "maxArticlesPerStartUrl": {
            "title": "🎯 Max articles per start URL",
            "minimum": 1,
            "maximum": 100000,
            "type": "integer",
            "description": "Cap how many articles can be attributed to a single Start URL.",
            "default": 25
          },
          "maxConcurrency": {
            "title": "⚡ Max concurrency",
            "minimum": 1,
            "maximum": 100,
            "type": "integer",
            "description": "How many fetches the crawler may run in parallel. Higher = faster, but more pressure on the target site and proxy quota. Leave empty for safe sequential mode.",
            "default": 1
          },
          "proxyConfiguration": {
            "title": "🛡️ Proxy configuration",
            "type": "object",
            "description": "Proxy settings. Default = NO PROXY (direct). If the target blocks the request, the actor automatically falls back to DATACENTER, then RESIDENTIAL (with up to 3 retries on residential). Once a fallback occurs, it sticks."
          },
          "useBrowser": {
            "title": "🎭 Use browser (Playwright)",
            "type": "boolean",
            "description": "Render with Chromium when raw HTTP fails or the page is JS-heavy. Slower but bypasses many anti-bot walls.",
            "default": false
          },
          "pageWaitMs": {
            "title": "⏱️ Wait on each page (ms)",
            "minimum": 0,
            "maximum": 60000,
            "type": "integer",
            "description": "Extra time to wait after navigation finishes (milliseconds). Useful for lazily-loaded scripts.",
            "default": 0
          },
          "waitUntil": {
            "title": "🚦 Wait until navigation event is finished",
            "enum": [
              "load",
              "domcontentloaded",
              "networkidle",
              "commit"
            ],
            "type": "string",
            "description": "Which navigation event Playwright waits for before considering the page ready.",
            "default": "load"
          },
          "categoryWaitForSelector": {
            "title": "🗂️ Wait for selector on each category page",
            "type": "string",
            "description": "Optional CSS selector. The browser will wait for this element to appear before extracting links from category pages."
          },
          "articleWaitForSelector": {
            "title": "📰 Wait for selector on each article page",
            "type": "string",
            "description": "Optional CSS selector. The browser will wait for this element to appear before extracting article content."
          },
          "scrollToBottom": {
            "title": "🖱️ Scroll to bottom of the page (infinite scroll)",
            "type": "boolean",
            "description": "Auto-scroll to the bottom of category/article pages so lazy-loaded content is rendered.",
            "default": false
          },
          "scrollToBottomButtonSelector": {
            "title": "🔘 Scroll to bottom button selector",
            "type": "string",
            "description": "Optional CSS selector for a 'Load more' button. The crawler will click it repeatedly while scrolling."
          },
          "scrollToBottomMaxSeconds": {
            "title": "⏲️ Scroll to bottom max seconds",
            "minimum": 1,
            "maximum": 600,
            "type": "integer",
            "description": "Maximum time spent scrolling per page (safety cap).",
            "default": 60
          },
          "extendOutputFunction": {
            "title": "🛠️ Extend output function",
            "type": "string",
            "description": "Only needed if you want more data than is included in the default output. Keep in mind that you should provide a valid Python function: def extend(soup, article, html): return {...}. The returned dict is merged into each article record."
          },
          "maxCUs": {
            "title": "🧮 Limit CU consumption",
            "minimum": 0,
            "maximum": 100000,
            "type": "integer",
            "description": "Soft cap on Apify Compute Units this run may consume. The actor checks usage between requests and exits gracefully when the cap is hit. Leave empty for no cap."
          },
          "notificationEmails": {
            "title": "📧 Emails address for notifications",
            "type": "array",
            "description": "Email addresses to notify when the CU thresholds below are crossed.",
            "default": [],
            "items": {
              "type": "string"
            }
          },
          "notifyAfterCUs": {
            "title": "🔔 Notify after [number] CUs",
            "minimum": 0,
            "maximum": 100000,
            "type": "integer",
            "description": "Send a one-time notification once this many CUs have been consumed."
          },
          "notifyAfterCUsEvery": {
            "title": "🔁 Notify every [number] CUs",
            "minimum": 0,
            "maximum": 100000,
            "type": "integer",
            "description": "Send a notification every N CUs after the initial notification threshold."
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}