nf-core · nictru · Jun 7, 2026 · Jun 16, 2026 · Jun 17, 2026 · Jun 18, 2026
@@ -318,11 +318,18 @@ This can be useful if you have assigned cell type annotations to the integrated
 
 If you want to run tasks after the integration step without performing integration, you can provide a previous result of the pipeline as [`base_adata`](https://nf-co.re/scdownstream/parameters#base_adata).
 You do not need to provide a samplesheet via the [`input`](https://nf-co.re/scdownstream/parameters#input) parameter in this case.
-You also need [`base_embeddings`](https://nf-co.re/scdownstream/parameters#base_embeddings), and optionally [`base_label_col`](https://nf-co.re/scdownstream/parameters#base_label_col) and [`base_condition_col`](https://nf-co.re/scdownstream/parameters#base_condition_col) if your label or condition columns are not named `label` and `condition`.
+You also need either [`base_embeddings`](https://nf-co.re/scdownstream/parameters#base_embeddings) to reuse existing embeddings, or [`integrate_per_label`](https://nf-co.re/scdownstream/parameters#integrate_per_label) to compute new integrations independently for each group in [`base_label_col`](https://nf-co.re/scdownstream/parameters#base_label_col).
+Set [`base_condition_col`](https://nf-co.re/scdownstream/parameters#base_condition_col) if your condition column is not named `condition`.
 
 When reusing existing embeddings via `base_embeddings`, the pipeline re-executes the tasks after the integration step without performing integration again.
 Most interestingly, the pipeline will generate cell-type-specific UMAPs, clusterings, and PAGA graphs if [`cluster_per_label`](https://nf-co.re/scdownstream/parameters#cluster_per_label) is set to `true`.
 
+If [`integrate_per_label`](https://nf-co.re/scdownstream/parameters#integrate_per_label) is enabled, [`base_label_col`](https://nf-co.re/scdownstream/parameters#base_label_col) is the split/grouping column, not necessarily the supervised cell-type label used by integration methods.
+Use [`base_batch_col`](https://nf-co.re/scdownstream/parameters#base_batch_col) to select the batch column for batch-aware integration methods.
+When using scANVI, use [`base_scanvi_label_col`](https://nf-co.re/scdownstream/parameters#base_scanvi_label_col) and [`base_scanvi_unknown_label`](https://nf-co.re/scdownstream/parameters#base_scanvi_unknown_label) to select the supervised labels and unlabeled category.
+Subset names in `analysis_plan` match the filesystem-safe keys produced by splitting the AnnData object; spaces in group values are replaced with underscores.
+Per-label integrations are treated as already split for clustering, so the pipeline creates subset-specific embedding keys such as `X_pca-SRR28679756_pca` and UMAP keys such as `X_pca-SRR28679756_umap` in the finalized base AnnData.
+
 ### GPU acceleration
 
 :::warning{title="Experimental feature"}

@@ -70,6 +70,10 @@ workflow NFCORE_SCDOWNSTREAM {
     base_embeddings               //   value: string
     base_label_col                //   value: string
     base_condition_col            //   value: string
+    base_batch_col                //   value: string
+    base_scanvi_label_col         //   value: string
+    base_scanvi_unknown_label     //   value: string
+    integrate_per_label           //   value: boolean
     cluster_per_label             //   value: boolean
     cluster_global                //   value: boolean
     clustering_resolutions        //   value: string
@@ -128,6 +132,10 @@ workflow NFCORE_SCDOWNSTREAM {
         base_embeddings,
         base_label_col,
         base_condition_col,
+        base_batch_col,
+        base_scanvi_label_col,
+        base_scanvi_unknown_label,
+        integrate_per_label,
         cluster_per_label,
         cluster_global,
         clustering_resolutions,
@@ -228,6 +236,10 @@ workflow {
         params.base_embeddings,
         params.base_label_col,
         params.base_condition_col,
+        params.base_batch_col,
+        params.base_scanvi_label_col,
+        params.base_scanvi_unknown_label,
+        params.integrate_per_label,
         params.cluster_per_label,
         params.cluster_global,
         params.clustering_resolutions,

@@ -25,7 +25,7 @@
 
 adata = ad.read_h5ad("${h5ad}")
 reference_model_path = "reference_model"
-reference_model_type = "${meta2.id ?: ''}"
+reference_model_type = "${meta2.integration ?: meta2.id ?: ''}"
 
 plan_kwargs = {}
 

@@ -51,6 +51,10 @@ params {
     base_embeddings               = null
     base_label_col                = 'label'
     base_condition_col            = 'condition'
+    base_batch_col                = 'batch'
+    base_scanvi_label_col         = 'label'
+    base_scanvi_unknown_label     = 'Unknown'
+    integrate_per_label           = false
 
     // Clustering options
     clustering_resolutions        = '0.5,1.0'

@@ -225,17 +225,37 @@
                 "base_label_col": {
                     "type": "string",
                     "default": "label",
-                    "description": "The column in the base AnnData object that contains the label (e.g. cell type) information."
+                    "description": "The column in the base AnnData object used to group cells for downstream per-label analysis and, when integrate_per_label is true, to split the object before integration."
                 },
                 "base_condition_col": {
                     "type": "string",
                     "default": "condition",
                     "description": "The column in the base AnnData object that contains the condition (e.g. disease state, treatment) information."
                 },
+                "base_batch_col": {
+                    "type": "string",
+                    "default": "batch",
+                    "description": "The column in the base AnnData object that contains batch information for integration methods used with integrate_per_label."
+                },
+                "base_scanvi_label_col": {
+                    "type": "string",
+                    "default": "label",
+                    "description": "The column in the base AnnData object that contains cell labels for scANVI when integrate_per_label is true and scanvi is selected."
+                },
+                "base_scanvi_unknown_label": {
+                    "type": "string",
+                    "default": "Unknown",
+                    "description": "The category in base_scanvi_label_col that should be treated as unlabeled by scANVI when integrate_per_label is true."
+                },
+                "integrate_per_label": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "In base_adata-only runs, split the base AnnData by base_label_col and run the selected integration_methods independently for each group before clustering."
+                },
                 "base_embeddings": {
                     "type": "string",
-                    "description": "The keys in the obsm of the base AnnData object that contain the embeddings (without leading `X_`). Required if `input` is not provided - otherwise it is ignored.",
-                    "help_text": "If the `input` parameter is not provided (no new data to add), integration will not be performed. In order to be able to utilize existing integration results, you need to provide the keys in the obsm of the base AnnData object that contain the embeddings (without leading `X_`).",
+                    "description": "The keys in the obsm of the base AnnData object that contain the embeddings (without leading `X_`). Required if `input` is not provided and `integrate_per_label` is false; otherwise it is ignored.",
+                    "help_text": "If the `input` parameter is not provided (no new data to add), integration is skipped unless `integrate_per_label` is true. To reuse existing integration results, provide the keys in the obsm of the base AnnData object that contain the embeddings (without leading `X_`).",
                     "pattern": "^((scvi|scanvi|symphony|bbknn|combat|seurat)(,(scvi|scanvi|symphony|bbknn|combat|seurat))*)?$"
                 }
             }

@@ -20,15 +20,25 @@ workflow CLUSTER {
     ch_multiqc_files = channel.empty()
     ch_h5ad = channel.empty()
 
+    ch_input_by_subset = ch_input.branch { meta, _h5ad ->
+        already_split: meta.subset != null
+        needs_split: true
+    }
+
+    ch_h5ad = ch_h5ad.mix(
+        ch_input_by_subset.already_split
+            .map { meta, h5ad -> [meta + [already_split: true], h5ad] }
+    )
+
     if (global) {
         ch_h5ad = ch_h5ad
-            .mix(ch_input
+            .mix(ch_input_by_subset.needs_split
                 .map { meta, h5ad -> [meta + [subset: "global"], h5ad] })
     }
 
     if (per_label) {
         SPLITCOL (
-            ch_input,
+            ch_input_by_subset.needs_split,
             split_col
         )
 
@@ -42,7 +52,10 @@ workflow CLUSTER {
     ch_h5ad = ch_h5ad
         .map {
             meta, h5ad ->
-            [meta + [id: meta.integration + "-" + meta.subset], h5ad]
+            def cluster_id = meta.subset != null
+                ? meta.integration + "-" + meta.subset
+                : meta.integration
+            [meta + [id: cluster_id], h5ad]
         }
 
     ch_h5ad = ch_h5ad.branch { meta, _h5ad ->
@@ -57,14 +70,26 @@ workflow CLUSTER {
     ch_h5ad = NEIGHBORS.out.h5ad.mix(ch_h5ad.has_neighbors)
     ch_h5ad_neighbours = NEIGHBORS.out.h5ad
 
+    ch_h5ad_for_umap = ch_h5ad
+        .map { meta, h5ad ->
+            meta.already_split
+                ? [meta + [id: meta.id + "-umap", cluster_id: meta.id], h5ad]
+                : [meta, h5ad]
+        }
+
     UMAP (
-        ch_h5ad
+        ch_h5ad_for_umap
     )
     ch_obsm = ch_obsm.mix(UMAP.out.obsm)
 
     ch_resolutions = channel.fromList(default_resolutions)
 
     ch_h5ad_for_leiden = UMAP.out.h5ad
+        .map { meta, h5ad ->
+            meta.cluster_id
+                ? [meta.findAll { key, _value -> key != 'cluster_id' } + [id: meta.cluster_id], h5ad]
+                : [meta, h5ad]
+        }
         .combine(ch_resolutions)
         .filter { meta, _h5ad, resolution ->
             analysis_plan_rows.any { row ->
@@ -77,7 +102,7 @@ workflow CLUSTER {
             [
                 meta + [
                     resolution: resolution,
-                    id: meta.integration + "-" + meta.subset + "-" + resolution,
+                    id: meta.id + "-" + resolution,
                 ],
                 h5ad,
             ]

@@ -20,6 +20,9 @@ workflow COMBINE {
     symphony_reference           //   value: string
     expimap_gmt                 //   value: string
     condition_col               //   value: string
+    batch_col                   //   value: string
+    scanvi_label_col            //   value: string
+    scanvi_unlabeled_category   //   value: string
     scib                        //   value: boolean
 
     main:
@@ -55,7 +58,10 @@ workflow COMBINE {
         scimilarity_model,
         symphony_reference,
         expimap_gmt,
-        condition_col
+        condition_col,
+        batch_col,
+        scanvi_label_col,
+        scanvi_unlabeled_category
     )
     ch_var           = ch_var.mix(INTEGRATE.out.var)
 

@@ -34,7 +34,10 @@ nextflow_workflow {
                 input[11] = null
                 input[12] = null
                 input[13] = 'condition'
-                input[14] = false
+                input[14] = 'batch'
+                input[15] = 'label'
+                input[16] = 'Unknown'
+                input[17] = false
                 """
             }
         }
@@ -71,7 +74,10 @@ nextflow_workflow {
                 input[11] = null
                 input[12] = null
                 input[13] = 'condition'
-                input[14] = false
+                input[14] = 'batch'
+                input[15] = 'label'
+                input[16] = 'Unknown'
+                input[17] = false
                 """
             }
         }