{
  "openapi": "3.0.1",
  "info": {
    "title": "Deduplicate, Merge & Transform Datasets",
    "description": "Merge multiple datasets, deduplicate items by a combination of fields, and apply custom transforms — powered by Polars.",
    "version": "0.0",
    "x-build-id": "DdaZ99xyXwHhJRSbp"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/datacach~deduplicate-datasets/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-datacach-deduplicate-datasets",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/datacach~deduplicate-datasets/runs": {
      "post": {
        "operationId": "runs-sync-datacach-deduplicate-datasets",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/datacach~deduplicate-datasets/run-sync": {
      "post": {
        "operationId": "run-sync-datacach-deduplicate-datasets",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "properties": {
          "datasetIds": {
            "title": "Dataset IDs",
            "type": "array",
            "description": "One or more Apify dataset IDs (e.g. <code>dHZ7Xy9aBc...</code>) or dataset names to merge and deduplicate. Items are concatenated in the order listed, which decides which duplicate is kept — the first occurrence wins. Leave this empty (and <b>Actor or Task ID</b> empty too) to run on built-in sample data for a quick first test.",
            "items": {
              "type": "string"
            }
          },
          "fields": {
            "title": "Deduplication fields",
            "type": "array",
            "description": "The field(s) whose combined value makes an item unique — e.g. <code>[\"url\"]</code> or <code>[\"name\", \"id\"]</code>. Each value is JSON-stringified and concatenated into a single key, and the first item seen for each key is kept. Leave empty to merge datasets without removing any duplicates.",
            "items": {
              "type": "string"
            }
          },
          "output": {
            "title": "Items to output",
            "enum": [
              "unique-items",
              "duplicate-items",
              "nothing"
            ],
            "type": "string",
            "description": "Choose what the run returns: the <b>unique</b> items, only the <b>duplicate</b> items that were removed, or <b>nothing</b> (just the unique/duplicate counts — handy for a quick duplicate audit).",
            "default": "unique-items"
          },
          "mode": {
            "title": "Deduplication mode",
            "enum": [
              "dedup-after-load",
              "dedup-as-loading"
            ],
            "type": "string",
            "description": "<b>Dedup after load</b> loads everything into memory, then deduplicates — fastest for typical datasets. <b>Dedup as loading</b> deduplicates batch-by-batch and streams the result to the output dataset as it goes, keeping memory near-constant for very large (10M+) datasets. Streaming applies when the output destination is a dataset; the post-dedup transform is then applied per pushed batch.",
            "default": "dedup-after-load"
          },
          "nullAsUnique": {
            "title": "Treat null fields as unique",
            "type": "boolean",
            "description": "When on, items whose deduplication field(s) are null or missing are always kept as unique and never removed. When off, a null/missing value is treated like any other value when matching duplicates.",
            "default": false
          },
          "actorOrTaskId": {
            "title": "Actor or Task ID",
            "type": "string",
            "description": "Optionally pull source datasets from every run of an Actor or Task. Enter an ID or full name (e.g. <code>apify/web-scraper</code>); the default dataset of each successful run is merged together with anything in <b>Dataset IDs</b>."
          },
          "onlyRunsNewerThan": {
            "title": "Only runs newer than",
            "pattern": "^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12]\\d|3[01])(T[0-2]\\d:[0-5]\\d(:[0-5]\\d)?(\\.\\d+)?(Z|[+-]\\d{2}:\\d{2})?)?$",
            "type": "string",
            "description": "When using <b>Actor or Task ID</b>, only include runs that finished on or after this date. Accepts a date (<code>2024-01-31</code>) or a full ISO datetime."
          },
          "onlyRunsOlderThan": {
            "title": "Only runs older than",
            "pattern": "^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12]\\d|3[01])(T[0-2]\\d:[0-5]\\d(:[0-5]\\d)?(\\.\\d+)?(Z|[+-]\\d{2}:\\d{2})?)?$",
            "type": "string",
            "description": "When using <b>Actor or Task ID</b>, only include runs that finished on or before this date. Accepts a date (<code>2024-01-31</code>) or a full ISO datetime."
          },
          "datasetIdsOfFilterItems": {
            "title": "Dataset IDs of filter items",
            "type": "array",
            "description": "Datasets used only to pre-seed the 'already seen' keys. Items in <b>Dataset IDs</b> that match these keys are dropped as duplicates, but the filter items themselves are never output — perfect for outputting only records you haven't seen before.",
            "items": {
              "type": "string"
            }
          },
          "preDedupTransformFunction": {
            "title": "Pre-deduplication transform function",
            "type": "string",
            "description": "Optional Python that receives <code>items</code> (a list of dicts) plus <code>input_data</code>/<code>customInputData</code> and returns a new list of dicts. Runs on each loaded batch <b>before</b> deduplication, so you can filter or add items. Example: <code>[i for i in items if i.get('price')]</code>. You can also provide a full function, e.g. <code>def transform(items, input_data): ...</code>. JavaScript functions are not supported."
          },
          "postDedupTransformFunction": {
            "title": "Post-deduplication transform function",
            "type": "string",
            "description": "Optional Python that receives <code>items</code> (a list of dicts) and returns a new list of dicts. Runs <b>after</b> deduplication, just before the result is written — ideal for trimming or renaming fields."
          },
          "customInputData": {
            "title": "Custom input data",
            "type": "object",
            "description": "Arbitrary JSON passed straight to your transform functions as <code>input_data</code> / <code>customInputData</code>. Use it to parameterize the transforms without editing their code."
          },
          "outputTo": {
            "title": "Output destination",
            "enum": [
              "dataset",
              "key-value-store"
            ],
            "type": "string",
            "description": "Where to store the result: a <b>dataset</b> (downloadable as JSON, CSV, Excel, or HTML) or the run's <b>key-value store</b> under the <code>OUTPUT</code> key (results over the 9 MB record limit are split into <code>OUTPUT</code>, <code>OUTPUT-2</code>, ...).",
            "default": "dataset"
          },
          "outputDatasetId": {
            "title": "Output dataset ID",
            "type": "string",
            "description": "Dataset ID or name to push the output to. If a name doesn't exist yet, a named dataset is created. Leave empty to use this run's default dataset."
          },
          "appendDatasetIds": {
            "title": "Append dataset IDs",
            "type": "boolean",
            "description": "When on, every output item gets a <code>__datasetId__</code> field recording which source dataset it came from. When off, items are passed through unchanged.",
            "default": false
          },
          "fieldsToLoad": {
            "title": "Fields to load",
            "type": "array",
            "description": "Load only these fields from the source datasets to save memory and speed up loading — for example, just your deduplication fields when you only need counts. Your deduplication fields are always loaded in addition to this list, so dedup keys can be computed. Leave empty to load every field.",
            "items": {
              "type": "string"
            }
          },
          "offset": {
            "title": "Offset",
            "minimum": 0,
            "type": "integer",
            "description": "Skip this many items from the start of the merged input before processing. Leave empty to start from the first item."
          },
          "limit": {
            "title": "Limit",
            "minimum": 1,
            "type": "integer",
            "description": "Process at most this many items from the merged input. Leave empty to process all of them."
          },
          "batchSizeLoad": {
            "title": "Load batch size",
            "minimum": 1000,
            "type": "integer",
            "description": "How many items to fetch per load request. Larger batches load faster but use more memory.",
            "default": 50000
          },
          "uploadBatchSize": {
            "title": "Upload batch size",
            "minimum": 10,
            "maximum": 1000,
            "type": "integer",
            "description": "How many items to send per push request when writing the output.",
            "default": 500
          },
          "parallelLoads": {
            "title": "Parallel loads",
            "minimum": 1,
            "maximum": 100,
            "type": "integer",
            "description": "How many load batches to fetch at the same time. Higher values speed up large runs but use more memory and resources.",
            "default": 10
          },
          "parallelPushes": {
            "title": "Parallel pushes",
            "minimum": 1,
            "maximum": 50,
            "type": "integer",
            "description": "How many output batches to upload at the same time.",
            "default": 5
          },
          "verboseLog": {
            "title": "Verbose logging",
            "type": "boolean",
            "description": "Turn on detailed debug logging to trace exactly what the run is doing. Leave off for normal, quieter logs.",
            "default": false
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}