{
  "openapi": "3.0.1",
  "info": {
    "title": "Website Contact & Social Discovery Crawler",
    "description": "High-throughput crawler that extracts emails, phone numbers, and social media profiles from websites using HTTP-first Crawlee crawling with Selectolax parsing and Playwright SPA fallback.",
    "version": "0.1",
    "x-build-id": "w0xaQa7eFE9euekhi"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/competent_clarinet~website-contacts-scraper/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-competent_clarinet-website-contacts-scraper",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/competent_clarinet~website-contacts-scraper/runs": {
      "post": {
        "operationId": "runs-sync-competent_clarinet-website-contacts-scraper",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/competent_clarinet~website-contacts-scraper/run-sync": {
      "post": {
        "operationId": "run-sync-competent_clarinet-website-contacts-scraper",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "websites"
        ],
        "properties": {
          "websites": {
            "title": "Websites",
            "type": "array",
            "description": "Seed URLs to crawl. Each entry may be a URL string or an object with url and optional countryCode (ISO 3166-1 alpha-2, e.g. US, IN, GB). When countryCode is omitted on an entry, defaultCountryCode is used."
          },
          "defaultCountryCode": {
            "title": "Default country code",
            "type": "string",
            "description": "Default ISO 3166-1 alpha-2 country code for parsing phone numbers without an explicit country prefix. Used for website entries that do not specify countryCode.",
            "default": "US"
          },
          "maxPagesPerSite": {
            "title": "Max pages per site",
            "minimum": 1,
            "maximum": 500,
            "type": "integer",
            "description": "Maximum number of pages to crawl per website before stopping.",
            "default": 25
          },
          "maxDepthPerSite": {
            "title": "Max crawl depth per site",
            "minimum": 0,
            "maximum": 30,
            "type": "integer",
            "description": "Maximum link hops from the seed URL (0 = seed pages only). Applies to both termination strategies.",
            "default": 3
          },
          "terminationStrategy": {
            "title": "Termination strategy",
            "enum": [
              "early",
              "lazy"
            ],
            "type": "string",
            "description": "Early stops each site once an email, phone, and social profile are found. Lazy crawls until max pages and max depth limits are reached to collect as many contacts as possible.",
            "default": "early"
          },
          "maxConcurrency": {
            "title": "Max concurrency",
            "minimum": 1,
            "maximum": 100,
            "type": "integer",
            "description": "Maximum number of concurrent httpx workers across all websites.",
            "default": 10
          },
          "maxConcurrencyPerDomain": {
            "title": "Max concurrency per domain",
            "minimum": 1,
            "maximum": 20,
            "type": "integer",
            "description": "Maximum in-flight HTTP requests per website host. Combined with round-robin scheduling to avoid hammering one domain.",
            "default": 2
          },
          "maxRequestsPerDomainPerSecond": {
            "title": "Max requests per domain per second",
            "minimum": 0.1,
            "maximum": 50,
            "type": "number",
            "description": "Per-domain request rate limit enforced by httpx-limiter.",
            "default": 2
          },
          "requestTimeoutSecs": {
            "title": "Request timeout (seconds)",
            "minimum": 5,
            "maximum": 300,
            "type": "integer",
            "description": "HTTP request timeout.",
            "default": 30
          },
          "maxRequestRetries": {
            "title": "Max request retries",
            "minimum": 0,
            "maximum": 10,
            "type": "integer",
            "description": "Retries for failed page requests.",
            "default": 3
          },
          "bufferFlushSize": {
            "title": "Buffer flush size",
            "minimum": 1,
            "maximum": 1000,
            "type": "integer",
            "description": "Number of events to buffer before pushing to the dataset.",
            "default": 200
          },
          "useSitemapDiscovery": {
            "title": "Use sitemap discovery",
            "type": "boolean",
            "description": "Resolve redirects, read robots.txt, and seed the crawl queue from sitemap.xml URLs before link discovery.",
            "default": true
          },
          "maxSitemapUrls": {
            "title": "Max sitemap URLs per site",
            "minimum": 1,
            "maximum": 500,
            "type": "integer",
            "description": "Maximum number of same-site page URLs to seed from sitemaps after score-based ranking. Capped by maxPagesPerSite minus one (canonical URL slot).",
            "default": 50
          },
          "minEnqueueScore": {
            "title": "Min enqueue score",
            "minimum": 0,
            "maximum": 1,
            "type": "number",
            "description": "Only queue URLs whose normalized contact-path score (0.0–1.0) meets this threshold. Homepage is always queued. Lower values discover more pages; higher values focus on contact/about/support paths. Legacy integer inputs from 0–100 are accepted and converted automatically.",
            "default": 0.333
          },
          "useSemanticScoring": {
            "title": "Use semantic link scoring (spaCy)",
            "type": "boolean",
            "description": "When enabled, score each discovered link by semantic similarity between contact-intent phrases and a combined profile of the raw URL (path, query params) plus anchor and surrounding text.",
            "default": true
          },
          "treatSubdomainsAsSameSite": {
            "title": "Treat subdomains as same site",
            "type": "boolean",
            "description": "When enabled, follow links on subdomains of the same registrable domain (e.g. stores.example.com while crawling www.example.com). Disabled by default; unrelated domains and other tenants on shared platforms (e.g. different *.myshopify.com shops) are never combined.",
            "default": false
          },
          "additionalPaths": {
            "title": "Additional seed paths",
            "type": "array",
            "description": "Path suffixes queued as seeds for every site (after the canonical landing URL). Useful for Shopify contact/policy pages that may not appear in navigation or sitemaps. Resolved against each site's post-redirect origin; duplicates are removed per domain.",
            "items": {
              "type": "string"
            },
            "default": [
              "/pages/contact",
              "/policies/contact-information",
              "/pages/privacy-policy",
              "/privacy-policy",
              "/pages/terms-and-conditions",
              "/terms-and-conditions"
            ]
          },
          "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Optional proxies. Requests go direct first; after HTTP 403/429 a site uses proxy when configured, otherwise it is skipped. Proxy sessions rotate on 403/429."
          },
          "maxProxySessions": {
            "title": "Max proxy sessions",
            "minimum": 1,
            "maximum": 20,
            "type": "integer",
            "description": "Maximum number of active proxy sessions kept at once. Domains are assigned to a session and stay on it until rotation. If multiple domains share a session and one rotates, all domains on that session move together.",
            "default": 10
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}