diff --git a/README.md b/README.md index 0964e70c..ddffa516 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ It also performs basic QC and coverage analysis. The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. +The pipeline also supports Unique Molecular Identifier (UMI) data. If your samplesheet includes a `umi_type` column (`seq` or `readname`), UMI-aware preprocessing is enabled automatically. Rows with no `umi_type` specified will be processed as non-UMI sequencing data. + Steps inlcude: 1. Demultiplexing using [`BCLconvert`](https://emea.support.illumina.com/sequencing/sequencing_software/bcl-convert.html) @@ -24,7 +26,13 @@ Steps inlcude: 6. Alignment QC using [`samtools flagstat`](http://www.htslib.org/doc/samtools-flagstat.html), [`samtools stats`](http://www.htslib.org/doc/samtools-stats.html), [`samtools idxstats`](http://www.htslib.org/doc/samtools-idxstats.html) and [`picard CollecHsMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectHsMetrics), [`picard CollectWgsMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectWgsMetrics), [`picard CollectMultipleMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectMultipleMetrics) 7. 
QC aggregation using [`multiqc`](https://multiqc.info/) -![metro map](docs/images/metro_map.png) +![metro map](docs/images/metro_mapumi.png) + +UMI processing (only for rows with `umi_type`): +- Extract UMI from read sequence (`seq`) or read name (`readname`) +- Group reads by UMI (fgbio GroupReadsByUmi) +- Call molecular consensus (fgbio CallMolecularConsensusReads) and filter (fgbio FilterConsensusReads) +- Re-align filtered consensus reads with BWA-MEM (`-Y`), then sort/index ## Usage @@ -41,6 +49,12 @@ First, prepare a samplesheet with your input data that looks as follows: id,samplename,organism,library,fastq_1,fastq_2 sample1,sample1,Homo sapiens,Library_Name,reads1.fq.gz,reads2.fq.gz ``` +`samplesheet.csv` for fastq inputs with UMI: + +```csv +id,samplename,organism,library,umi_type,fastq_1,fastq_2 +umi_sample1,umi_sample1,Homo sapiens,Library_Name,seq,reads1.fq.gz,reads2.fq.gz +``` `samplesheet.csv` for flowcell inputs: diff --git a/assets/schema_input.json b/assets/schema_input.json index beaa2eb6..73e9e721 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -72,6 +72,13 @@ "pattern": "^[a-zA-Z0-9_]+.bed$", "default": null }, + "umi_type": { + "meta": ["umi_type"], + "type": "string", + "description": "Where the UMI is stored for this sample: 'seq' (read sequence) or 'readname' (read name). Leave empty or use 'none' for non-UMI samples", + "enum": ["readname", "seq", "none"], + "default": "none" + }, "lane": { "type": "integer", "meta": ["lane"], diff --git a/docs/images/metro_mapumi.png 
b/docs/images/metro_mapumi.png new file mode 100644 index 00000000..56b43b75 Binary files /dev/null and b/docs/images/metro_mapumi.png differ diff --git a/docs/images/metro_mapumi.svg b/docs/images/metro_mapumi.svg new file mode 100644 index 00000000..73f54ac2 --- /dev/null +++ b/docs/images/metro_mapumi.svg @@ -0,0 +1,1351 @@ + + + +nf-cmgg/preprocessingfastqflowcellfastqORfastqfastqbclbclbclbclbclFastP trimming,QC andadapterremovalAlign +bowtie2, bwamem, +bwamem2, snap +dragmap or starUMI preprocessing +copy UMI + group +Consensus post-align +zipperbams + sort +Consensus reads +call + filter +cramMultiQCreportindexfastaORMandatory process(es)UMI flowSupported genomes flowReport filesLegendsOptional process(es)Additional outputs/inputsUnsupported genomes flowCreate indexwhen index ismissing Sort & mark duplicatessamtools sormadup or bamsormadupsamtools convertbam -> crambedmosdepthsamtools import +fastq -> ucramPicard metricspicard CollectMultipleMetrics,picard CollectWgsMetrics andpicard CollectHsMetrics--disable_picard_metrics falsesamtools metricssamtools stats,samtools flagstat andsamtools idxstatsucramreportsAll generatedreportssamtools cat +merge ucrams +from same samplebcl-convertbcl -> fastqinteropdetermine coveragein genelist panelssamtoolscoveragebed diff --git a/modules.json b/modules.json index d860bfc6..adcf495d 100644 --- a/modules.json +++ b/modules.json @@ -51,6 +51,55 @@ "installed_by": ["modules"], "patch": "modules/nf-core/fastp/fastp.diff" }, + "fgbio/callduplexconsensusreads": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "fgbio/callmolecularconsensusreads": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "fgbio/collectduplexseqmetrics": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "fgbio/copyumifromreadname": { + "branch": 
"master", + "git_sha": "47dbfc0fbcd8e4e3b73d843f4659069441ca8692", + "installed_by": ["modules"], + "patch": "modules/nf-core/fgbio/copyumifromreadname/fgbio-copyumifromreadname.diff" + }, + "fgbio/fastqtobam": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"], + "patch": "modules/nf-core/fgbio/fastqtobam/fgbio-fastqtobam.diff" + }, + "fgbio/filterconsensusreads": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"], + "patch": "modules/nf-core/fgbio/filterconsensusreads/fgbio-filterconsensusreads.diff" + }, + "fgbio/groupreadsbyumi": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "fgbio/sortbam": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "fgbio/zipperbams": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"], + "patch": "modules/nf-core/fgbio/zipperbams/fgbio-zipperbams.diff" + }, "md5sum": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", @@ -103,6 +152,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/samtools/coverage/samtools-coverage.diff" }, + "samtools/fastq": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "samtools/flagstat": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", @@ -118,6 +172,11 @@ "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", "installed_by": ["modules"] }, + "samtools/index": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "samtools/sormadup": { "branch": "master", "git_sha": "38f3b0200093498b70ac2d63a83eac5642e3c873", diff --git a/modules/nf-core/fgbio/callduplexconsensusreads/environment.yml 
b/modules/nf-core/fgbio/callduplexconsensusreads/environment.yml
new file mode 100644
index 00000000..4dbb6856
--- /dev/null
+++ b/modules/nf-core/fgbio/callduplexconsensusreads/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::fgbio=2.5.21
diff --git a/modules/nf-core/fgbio/callduplexconsensusreads/main.nf b/modules/nf-core/fgbio/callduplexconsensusreads/main.nf
new file mode 100644
index 00000000..be6fc97a
--- /dev/null
+++ b/modules/nf-core/fgbio/callduplexconsensusreads/main.nf
@@ -0,0 +1,75 @@
+// Calls duplex consensus reads from a grouped BAM with fgbio CallDuplexConsensusReads.
+// The result is written to "${prefix}.bam"; prefix defaults to "<meta.id>_consensus_unmapped".
+process FGBIO_CALLDUPLEXCONSENSUSREADS {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' :
+        'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }"
+
+    input:
+    tuple val(meta), path(grouped_bam)
+    val min_reads
+    val min_baseq
+
+    output:
+    tuple val(meta), path("${prefix}.bam"), emit: bam
+    path "versions.yml"                   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    // `prefix` is assigned without `def`; the output declaration above references it.
+    prefix = task.ext.prefix ?: "${meta.id}_consensus_unmapped"
+
+    // JVM heap: 8 GB unless the task has less, in which case use the task
+    // memory minus 1 GB (never below 1 GB).
+    def mem_gb = 8
+    if (task.memory) {
+        def task_gb = task.memory.giga
+        if (task_gb < mem_gb) {
+            mem_gb = task_gb < 2 ? 1 : task_gb - 1
+        }
+    } else {
+        log.info '[fgbio CallDuplexConsensusReads] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.'
+    }
+    if ("$grouped_bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+
+    """
+    fgbio \\
+        -Xmx${mem_gb}g \\
+        --tmp-dir=. \\
+        --async-io=true \\
+        --compression=1 \\
+        CallDuplexConsensusReads \\
+        --input $grouped_bam \\
+        --output ${prefix}.bam \\
+        --min-reads ${min_reads} \\
+        --min-input-base-quality ${min_baseq} \\
+        --threads ${task.cpus} \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    // Stub run: only creates an empty placeholder BAM plus versions.yml.
+    prefix = task.ext.prefix ?: "${meta.id}_consensus_unmapped"
+    if ("$grouped_bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    touch ${prefix}.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//')
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/nf-core/fgbio/callduplexconsensusreads/meta.yml b/modules/nf-core/fgbio/callduplexconsensusreads/meta.yml
new file mode 100644
index 00000000..3b615eda
--- /dev/null
+++ b/modules/nf-core/fgbio/callduplexconsensusreads/meta.yml
@@ -0,0 +1,57 @@
+name: "fgbio_callduplexconsensusreads"
+description: Uses FGBIO CallDuplexConsensusReads to call duplex consensus sequences
+  from reads generated from the same double-stranded source molecule.
+keywords:
+  - umi
+  - duplex
+  - fgbio
+tools:
+  - "fgbio":
+      description: "A set of tools for working with genomic and high throughput sequencing
+        data, including UMIs"
+      homepage: http://fulcrumgenomics.github.io/fgbio/
+      documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/CallDuplexConsensusReads.html
+      tool_dev_url: https://github.com/fulcrumgenomics/fgbio
+      licence: ["MIT"]
+      identifier: biotools:fgbio
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g.
[ id:'test', single_end:false ] + - grouped_bam: + type: file + description: Grouped BAM file + pattern: "*.bam" + ontologies: [] + - min_reads: + type: string + description: Minimum number of raw/original reads to build each consensus read. Can + be a space delimited list of 1-3 values. See fgbio documentation for more details. + - min_baseq: + type: integer + description: Ignore bases in raw reads that have Q below this value +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.bam: + type: file + description: consensus BAM file + pattern: "*.bam" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@lescai" +maintainers: + - "@lescai" diff --git a/modules/nf-core/fgbio/callduplexconsensusreads/tests/main.nf.test b/modules/nf-core/fgbio/callduplexconsensusreads/tests/main.nf.test new file mode 100644 index 00000000..0144e0ea --- /dev/null +++ b/modules/nf-core/fgbio/callduplexconsensusreads/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process FGBIO_CALLDUPLEXCONSENSUSREADS" + script "../main.nf" + process "FGBIO_CALLDUPLEXCONSENSUSREADS" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/callduplexconsensusreads" + + test("homo_sapiens - bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true) + ] + input[1] = 3 + input[2] = 20 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("homo_sapiens - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false 
], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true) + ] + input[1] = 3 + input[2] = 20 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgbio/callduplexconsensusreads/tests/main.nf.test.snap b/modules/nf-core/fgbio/callduplexconsensusreads/tests/main.nf.test.snap new file mode 100644 index 00000000..dcc7f9c8 --- /dev/null +++ b/modules/nf-core/fgbio/callduplexconsensusreads/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "homo_sapiens - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,337eebefcdf12475174a668e31bb4245" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,337eebefcdf12475174a668e31bb4245" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:32:47.930923" + }, + "homo_sapiens - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_unmapped.bam:md5,4f0e87feb7601d06617c9f29d7aec352" + ] + ], + "1": [ + "versions.yml:md5,337eebefcdf12475174a668e31bb4245" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_unmapped.bam:md5,4f0e87feb7601d06617c9f29d7aec352" + ] + ], + "versions": [ + "versions.yml:md5,337eebefcdf12475174a668e31bb4245" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-08T13:04:50.447095" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/environment.yml b/modules/nf-core/fgbio/callmolecularconsensusreads/environment.yml new file 
mode 100644
index 00000000..4dbb6856
--- /dev/null
+++ b/modules/nf-core/fgbio/callmolecularconsensusreads/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::fgbio=2.5.21
diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/main.nf b/modules/nf-core/fgbio/callmolecularconsensusreads/main.nf
new file mode 100644
index 00000000..e1d869b3
--- /dev/null
+++ b/modules/nf-core/fgbio/callmolecularconsensusreads/main.nf
@@ -0,0 +1,71 @@
+// Calls molecular consensus reads from a grouped BAM with fgbio CallMolecularConsensusReads.
+process FGBIO_CALLMOLECULARCONSENSUSREADS {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' :
+        'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }"
+
+    input:
+    tuple val(meta), path(grouped_bam)
+    val min_reads
+    val min_baseq
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}_consensus_unmapped"
+    // JVM heap: leave 1 GB of the task allocation outside the heap and never go
+    // below 1 GB. task.memory.giga is a whole number of GB, so an allocation under
+    // 1 GB would otherwise request a zero-size heap (-Xmx0g).
+    def mem_gb = 8
+    if (!task.memory) {
+        log.info '[fgbio CallMolecularConsensusReads] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.'
+    } else if (task.memory.giga < 2) {
+        mem_gb = 1
+    } else {
+        mem_gb = task.memory.giga - 1
+    }
+    if ("$grouped_bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    fgbio \\
+        -Xmx${mem_gb}g \\
+        --tmp-dir=. \\
+        --async-io=true \\
+        --compression=1 \\
+        CallMolecularConsensusReads \\
+        --input $grouped_bam \\
+        --output ${prefix}.bam \\
+        --min-reads ${min_reads} \\
+        --min-input-base-quality ${min_baseq} \\
+        --threads ${task.cpus} \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    // Stub run: only creates an empty placeholder BAM plus versions.yml.
+    def prefix = task.ext.prefix ?: "${meta.id}_consensus_unmapped"
+    if ("$grouped_bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    touch ${prefix}.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//')
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/meta.yml b/modules/nf-core/fgbio/callmolecularconsensusreads/meta.yml
new file mode 100644
index 00000000..c7b75eb7
--- /dev/null
+++ b/modules/nf-core/fgbio/callmolecularconsensusreads/meta.yml
@@ -0,0 +1,55 @@
+name: fgbio_callmolecularconsensusreads
+description: Calls consensus sequences from reads with the same unique molecular tag.
+keywords:
+  - UMIs
+  - consensus sequence
+  - bam
+tools:
+  - fgbio:
+      description: Tools for working with genomic and high throughput sequencing data.
+      homepage: https://github.com/fulcrumgenomics/fgbio
+      documentation: http://fulcrumgenomics.github.io/fgbio/
+      licence: ["MIT"]
+      identifier: biotools:fgbio
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false, collapse:false ]
+    - grouped_bam:
+        type: file
+        description: |
+          The input SAM or BAM file, grouped by UMIs
+        pattern: "*.{bam,sam}"
+        ontologies: []
+  - min_reads:
+      type: integer
+      description: Minimum number of original reads to build each consensus read.
+ - min_baseq: + type: integer + description: Ignore bases in raw reads that have Q below this value. +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: | + Output SAM or BAM file to write consensus reads. + pattern: "*.{bam,sam}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@sruthipsuresh" +maintainers: + - "@sruthipsuresh" diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/tests/main.nf.test b/modules/nf-core/fgbio/callmolecularconsensusreads/tests/main.nf.test new file mode 100644 index 00000000..8a906340 --- /dev/null +++ b/modules/nf-core/fgbio/callmolecularconsensusreads/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process FGBIO_CALLMOLECULARCONSENSUSREADS" + script "../main.nf" + process "FGBIO_CALLMOLECULARCONSENSUSREADS" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/callmolecularconsensusreads" + tag "fgbio/sortbam" + + setup { + + run("FGBIO_SORTBAM") { + script "../../sortbam/main.nf" + config "./sort.config" + process { + """ + input[0] = [[ id:'homo_sapiens_genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ] + """ + } + } + } + + test("homo_sapiens - bam") { + + when { + process { + """ + input[0] = FGBIO_SORTBAM.out.bam + input[1] = 1 + input[2] = 20 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("homo_sapiens - stub") { + + options "-stub" + + when { + process { + """ + input[0] = FGBIO_SORTBAM.out.bam + input[1] = 1 + input[2] = 20 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/tests/main.nf.test.snap b/modules/nf-core/fgbio/callmolecularconsensusreads/tests/main.nf.test.snap new file mode 100644 index 00000000..f37f1bd7 --- /dev/null +++ b/modules/nf-core/fgbio/callmolecularconsensusreads/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "homo_sapiens - stub": { + "content": [ + { + "0": [ + [ + { + "id": "homo_sapiens_genome" + }, + "homo_sapiens_genome_consensus_unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,8dbdae0c815fd6be2c3090ca83f6bbc6" + ], + "bam": [ + [ + { + "id": "homo_sapiens_genome" + }, + "homo_sapiens_genome_consensus_unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,8dbdae0c815fd6be2c3090ca83f6bbc6" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:33:48.560245" + }, + "homo_sapiens - bam": { + "content": [ + { + "0": [ + [ + { + "id": "homo_sapiens_genome" + }, + "homo_sapiens_genome_consensus_unmapped.bam:md5,f56c861f1f604ecc9894dc9182b170f8" + ] + ], + "1": [ + "versions.yml:md5,8dbdae0c815fd6be2c3090ca83f6bbc6" + ], + "bam": [ + [ + { + "id": "homo_sapiens_genome" + }, + "homo_sapiens_genome_consensus_unmapped.bam:md5,f56c861f1f604ecc9894dc9182b170f8" + ] + ], + "versions": [ + "versions.yml:md5,8dbdae0c815fd6be2c3090ca83f6bbc6" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-08T13:05:45.874565" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/tests/sort.config b/modules/nf-core/fgbio/callmolecularconsensusreads/tests/sort.config new file mode 100644 index 00000000..b205c8f2 --- /dev/null +++ b/modules/nf-core/fgbio/callmolecularconsensusreads/tests/sort.config @@ -0,0 +1,6 @@ +process { + withName: FGBIO_SORTBAM { + ext.args = '-s TemplateCoordinate' + ext.prefix = { 
"${meta.id}_out" } + } +} diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/environment.yml b/modules/nf-core/fgbio/collectduplexseqmetrics/environment.yml new file mode 100644 index 00000000..f83d1262 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fgbio=2.5.21 + - conda-forge::r-ggplot2=3.5.2 diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/main.nf b/modules/nf-core/fgbio/collectduplexseqmetrics/main.nf new file mode 100644 index 00000000..9edf0ee8 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/main.nf @@ -0,0 +1,80 @@ +process FGBIO_COLLECTDUPLEXSEQMETRICS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' : + 'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: + tuple val(meta), path(grouped_bam) + path interval_list + + output: + tuple val(meta), path("**.family_sizes.txt") , emit: family_sizes + tuple val(meta), path("**.duplex_family_sizes.txt") , emit: duplex_family_sizes + tuple val(meta), path("**.duplex_yield_metrics.txt"), emit: duplex_yield_metrics + tuple val(meta), path("**.umi_counts.txt") , emit: umi_counts + tuple val(meta), path("**.duplex_qc.pdf") , emit: duplex_qc + tuple val(meta), path("**.duplex_umi_counts.txt") , emit: duplex_umi_counts, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def intervals = interval_list ? "--intervals ${interval_list}" : "" + def mem_gb = 8 + + if (!task.memory) { + log.info '[fgbio CollectDuplexSeqMetrics] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' + } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + + """ + fgbio \\ + -Xmx${mem_gb}g \\ + --tmp-dir=. \\ + --async-io=true \\ + --compression=1 \\ + CollectDuplexSeqMetrics \\ + --input $grouped_bam \\ + --output ${prefix} \\ + $intervals \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + ggplot2: \$(Rscript -e "library(ggplot2); cat(as.character(packageVersion('ggplot2')))") + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def touch_duplex_umi = args.contains("--duplex-umi-counts") || args.contains("-u") ? 
"touch ${prefix}.duplex_umi_counts.txt" : "" + + """ + touch ${prefix}.family_sizes.txt + touch ${prefix}.duplex_family_sizes.txt + touch ${prefix}.duplex_yield_metrics.txt + touch ${prefix}.umi_counts.txt + touch ${prefix}.duplex_qc.pdf + $touch_duplex_umi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + ggplot2: \$(Rscript -e "library(ggplot2); cat(as.character(packageVersion('ggplot2')))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/meta.yml b/modules/nf-core/fgbio/collectduplexseqmetrics/meta.yml new file mode 100644 index 00000000..947540b3 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/meta.yml @@ -0,0 +1,130 @@ +name: "fgbio_collectduplexseqmetrics" +description: Collects a suite of metrics to QC duplex sequencing data. +keywords: + - UMIs + - QC + - bam + - duplex +tools: + - "fgbio": + description: "A set of tools for working with genomic and high throughput sequencing + data, including UMIs" + homepage: "http://fulcrumgenomics.github.io/fgbio/" + documentation: "http://fulcrumgenomics.github.io/fgbio/" + tool_dev_url: "https://github.com/fulcrumgenomics/fgbio" + licence: ["MIT"] + identifier: biotools:fgbio + - "r-ggplot2": + description: "ggplot2 is a system for declaratively creating graphics, based on + The Grammar of Graphics. " + homepage: "https://ggplot2.tidyverse.org/" + documentation: "https://ggplot2.tidyverse.org/" + tool_dev_url: "https://github.com/tidyverse/ggplot2" + licence: ["MIT"] + identifier: biotools:fgbio + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - grouped_bam: + type: file + description: It has to be either 1) the exact BAM output by the GroupReadsByUmi + tool (in the sort-order it was produced in) or 2) a BAM file that has MI tags + present on all reads (usually set by GroupReadsByUmi) and has been sorted with + SortBam into TemplateCoordinate order. + pattern: "*.bam" + ontologies: [] + - interval_list: + type: file + description: Calculation of metrics may be restricted to a set of regions using + the --intervals parameter. The file format is described here + https://samtools.github.io/htsjdk/javadoc/htsjdk/index.html?htsjdk/samtools/util/Interval.html + pattern: "*.{tsv,txt,interval_list}" + ontologies: [] +output: + family_sizes: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "**.family_sizes.txt": + type: file + description: Metrics on the frequency of different types of families of different + sizes + pattern: "*.txt" + ontologies: [] + duplex_family_sizes: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "**.duplex_family_sizes.txt": + type: file + description: Metrics on the frequency of duplex tag families by the number + of observations from each strand + pattern: "*.txt" + ontologies: [] + duplex_yield_metrics: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "**.duplex_yield_metrics.txt": + type: file + description: Summary QC metrics produced using 5%, 10%, 15%...100% of the + data + pattern: "*.txt" + ontologies: [] + umi_counts: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "**.umi_counts.txt": + type: file + description: Metrics on the frequency of observations of UMIs within reads + and tag families + pattern: "*.txt" + ontologies: [] + duplex_qc: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "**.duplex_qc.pdf": + type: file + description: A series of plots generated from the preceding metrics files + for visualization + pattern: "*.pdf" + ontologies: [] + duplex_umi_counts: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "**.duplex_umi_counts.txt": + type: file + description: Metrics on the frequency of observations of duplex UMIs within + reads and tag families. + pattern: "*.txt" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@georgiakes" +maintainers: + - "@georgiakes" diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test new file mode 100644 index 00000000..0021229b --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test @@ -0,0 +1,79 @@ +nextflow_process { + + name "Test Process FGBIO_COLLECTDUPLEXSEQMETRICS" + script "../main.nf" + process "FGBIO_COLLECTDUPLEXSEQMETRICS" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/collectduplexseqmetrics" + + + test("homo_sapiens - bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true) + ] + input[1]=[] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.family_sizes, + 
process.out.duplex_family_sizes, + process.out.duplex_yield_metrics, + process.out.umi_counts, + process.out.duplex_umi_counts, + process.out.versions, + file(process.out.duplex_qc[0][1]).name) + .match() } + + ) + } + + } + + test("homo_sapiens - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true) + ] + input[1] = [] + + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.family_sizes, + process.out.duplex_family_sizes, + process.out.duplex_yield_metrics, + process.out.umi_counts, + process.out.duplex_umi_counts, + process.out.versions, + file(process.out.duplex_qc[0][1]).name) + .match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test.snap b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test.snap new file mode 100644 index 00000000..f7b9547f --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test.snap @@ -0,0 +1,106 @@ +{ + "homo_sapiens - stub": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.family_sizes.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_family_sizes.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_yield_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.umi_counts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + + ], + [ + "versions.yml:md5,d8d6be2d6162514abe0b38fa29f963c4" + ], + "test.duplex_qc.pdf" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-08T13:07:01.106818" + }, + 
"homo_sapiens - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.family_sizes.txt:md5,a49de49bd587440c316fec830f502620" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_family_sizes.txt:md5,129e41170b9f5f2f8edce62a686c8548" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_yield_metrics.txt:md5,237e4e4ee713fdf672b0ee796827fb9d" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.umi_counts.txt:md5,9fe38b2a49ca80492b3a1c6a55679155" + ] + ], + [ + + ], + [ + "versions.yml:md5,d8d6be2d6162514abe0b38fa29f963c4" + ], + "test.duplex_qc.pdf" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-08T13:06:43.025228" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/copyumifromreadname/environment.yml b/modules/nf-core/fgbio/copyumifromreadname/environment.yml new file mode 100644 index 00000000..4ebc0924 --- /dev/null +++ b/modules/nf-core/fgbio/copyumifromreadname/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fgbio=2.4.0 diff --git a/modules/nf-core/fgbio/copyumifromreadname/main.nf b/modules/nf-core/fgbio/copyumifromreadname/main.nf new file mode 100644 index 00000000..b15c970a --- /dev/null +++ b/modules/nf-core/fgbio/copyumifromreadname/main.nf @@ -0,0 +1,64 @@ +process FGBIO_COPYUMIFROMREADNAME { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/87/87626ef674e2f19366ae6214575a114fe80ce598e796894820550731706a84be/data' : + 'community.wave.seqera.io/library/fgbio:2.4.0--913bad9d47ff8ddc' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.bai"), emit: bai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_umi_extracted" + def mem_gb = 8 + if (!task.memory) { + log.info '[fgbio CopyUmiFromReadName] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' + } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + """ + fgbio \\ + -Xmx${mem_gb}g \\ + --tmp-dir=. \\ + --async-io=true \\ + CopyUmiFromReadName \\ + ${args} \\ + --input ${bam} \\ + --output ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ + + + stub: + def prefix = task.ext.prefix ?: "${meta.id}_umi_extracted" + """ + + touch ${prefix}.bam + touch ${prefix}.bai + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$(fgbio --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/copyumifromreadname/meta.yml b/modules/nf-core/fgbio/copyumifromreadname/meta.yml new file mode 100644 index 00000000..7cf4c994 --- /dev/null +++ b/modules/nf-core/fgbio/copyumifromreadname/meta.yml @@ -0,0 +1,80 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "fgbio_copyumifromreadname" +description: Copies the UMI at the end of a bam files read name to the RX tag. 
+keywords: + - fgbio + - copy + - umi + - readname +tools: + - "fgbio": + description: "A set of tools for working with genomic and high throughput sequencing + data, including UMIs" + homepage: http://fulcrumgenomics.github.io/fgbio/ + documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/CallDuplexConsensusReads.html + tool_dev_url: https://github.com/fulcrumgenomics/fgbio + licence: ["MIT"] + identifier: biotools:fgbio + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + + - bai: + type: file + description: Index for bam file + pattern: "*.{bai}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.bam": + type: file + description: Sorted BAM file + pattern: "*.{bam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + bai: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "*.bai": + type: file + description: Index for bam file + pattern: "*.{bai}" + ontologies: + - edam: "http://edamontology.org/format_3327" # BAI + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@sppearce" +maintainers: + - "@sppearce" diff --git a/modules/nf-core/fgbio/copyumifromreadname/tests/main.nf.test b/modules/nf-core/fgbio/copyumifromreadname/tests/main.nf.test new file mode 100644 index 00000000..83d67a42 --- /dev/null +++ b/modules/nf-core/fgbio/copyumifromreadname/tests/main.nf.test @@ -0,0 +1,75 @@ +nextflow_process { + + name "Test Process FGBIO_COPYUMIFROMREADNAME" + script "../main.nf" + process "FGBIO_COPYUMIFROMREADNAME" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/copyumifromreadname" + config "./nextflow.config" + + test("sarscov2 - bam") { + + when { + params { + module_args = '--field-delimiter "_" ' + } + process { + """ + input[0] = [ + [ id:'test'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam.bai', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + params { + module_args = '' + } + process { + """ + input[0] = [ + [ id:'test'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.umi.sorted.bam.bai', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( 
+ { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + + } + +} diff --git a/modules/nf-core/fgbio/copyumifromreadname/tests/main.nf.test.snap b/modules/nf-core/fgbio/copyumifromreadname/tests/main.nf.test.snap new file mode 100644 index 00000000..d65ff345 --- /dev/null +++ b/modules/nf-core/fgbio/copyumifromreadname/tests/main.nf.test.snap @@ -0,0 +1,110 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,1440a1d99b4c503c037f5325445eb7e6" + ], + "bai": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,1440a1d99b4c503c037f5325445eb7e6" + ] + }, + { + "FGBIO_COPYUMIFROMREADNAME": { + "fgbio": null + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-04-21T10:27:36.454432228" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bam:md5,245b5f4c2002dc6560353c5183247df3" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bai:md5,d99827a46b6de71e2338f59eb69b13fc" + ] + ], + "2": [ + "versions.yml:md5,047dd6edb85ae3f51a255523a2bfcfc6" + ], + "bai": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bai:md5,d99827a46b6de71e2338f59eb69b13fc" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test_umi_extracted.bam:md5,245b5f4c2002dc6560353c5183247df3" + ] + ], + "versions": [ + "versions.yml:md5,047dd6edb85ae3f51a255523a2bfcfc6" + ] + }, + { + "FGBIO_COPYUMIFROMREADNAME": { + "fgbio": "2.4.0" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + 
"timestamp": "2025-04-21T10:27:16.185419539" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/copyumifromreadname/tests/nextflow.config b/modules/nf-core/fgbio/copyumifromreadname/tests/nextflow.config new file mode 100644 index 00000000..d6d31951 --- /dev/null +++ b/modules/nf-core/fgbio/copyumifromreadname/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: "FGBIO_COPYUMIFROMREADNAME" { + ext.args = params.module_args + } +} diff --git a/modules/nf-core/fgbio/fastqtobam/environment.yml b/modules/nf-core/fgbio/fastqtobam/environment.yml new file mode 100644 index 00000000..4dbb6856 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fgbio=2.5.21 diff --git a/modules/nf-core/fgbio/fastqtobam/main.nf b/modules/nf-core/fgbio/fastqtobam/main.nf new file mode 100644 index 00000000..6ee64bb3 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/main.nf @@ -0,0 +1,70 @@ +process FGBIO_FASTQTOBAM { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' : + 'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.bam") , emit: bam , optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: "bam" + def sample_name = args.contains("--sample") ? "" : "--sample ${prefix}" + def library_name = args.contains("--library") ? "" : "--library ${prefix}" + + def mem_gb = 8 + if (!task.memory) { + log.info '[fgbio FastqToBam] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' + } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + + """ + fgbio \\ + -Xmx${mem_gb}g \\ + --tmp-dir=. 
\\ + --async-io=true \\ + FastqToBam \\ + ${args} \\ + --input ${reads} \\ + --output ${prefix}.${suffix} \\ + ${sample_name} \\ + ${library_name} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: "bam" + + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/fastqtobam/meta.yml b/modules/nf-core/fgbio/fastqtobam/meta.yml new file mode 100644 index 00000000..d92f0a60 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/meta.yml @@ -0,0 +1,65 @@ +name: fgbio_fastqtobam +description: | + Using the fgbio tools, converts FASTQ files sequenced into unaligned BAM or CRAM files possibly moving the UMI barcode into the RX field of the reads +keywords: + - unaligned + - bam + - cram +tools: + - fgbio: + description: A set of tools for working with genomic and high throughput sequencing + data, including UMIs + homepage: http://fulcrumgenomics.github.io/fgbio/ + documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/ + tool_dev_url: https://github.com/fulcrumgenomics/fgbio + licence: ["MIT"] + identifier: biotools:fgbio +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: pair of reads to be converted into BAM file + pattern: "*.{fastq.gz}" + ontologies: [] +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.bam": + type: file + description: Unaligned, unsorted BAM file + pattern: "*.{bam}" + ontologies: [] + cram: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.cram": + type: file + description: Unaligned, unsorted CRAM file + pattern: "*.{cram}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@lescai" + - "@matthdsm" + - "@nvnieuwk" +maintainers: + - "@lescai" + - "@matthdsm" + - "@nvnieuwk" diff --git a/modules/nf-core/fgbio/fastqtobam/tests/bam.config b/modules/nf-core/fgbio/fastqtobam/tests/bam.config new file mode 100644 index 00000000..014ba920 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/tests/bam.config @@ -0,0 +1,3 @@ +process { + ext.suffix = "bam" +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/fastqtobam/tests/cram.config b/modules/nf-core/fgbio/fastqtobam/tests/cram.config new file mode 100644 index 00000000..2406cb99 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/tests/cram.config @@ -0,0 +1,3 @@ +process { + ext.suffix = "cram" +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/fastqtobam/tests/custom_sample.config b/modules/nf-core/fgbio/fastqtobam/tests/custom_sample.config new file mode 100644 index 00000000..2ed567b4 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/tests/custom_sample.config @@ -0,0 +1,3 @@ +process { + ext.args = "--sample CustomSample --library CustomLibrary" +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/fastqtobam/tests/main.nf.test b/modules/nf-core/fgbio/fastqtobam/tests/main.nf.test new file mode 100644 index 00000000..d10a0052 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/tests/main.nf.test @@ -0,0 +1,218 @@ +nextflow_process { + + name "Test Process 
FGBIO_FASTQTOBAM" + script "../main.nf" + process "FGBIO_FASTQTOBAM" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/fastqtobam" + + test("homo_sapiens - [fastq1, fastq2] - default") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [fastq1, fastq2] - cram") { + + config "./cram.config" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.bam, + file(process.out.cram[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [fastq1, fastq2] - bam") { + + config "./bam.config" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + 
+ } + + test("homo_sapiens - fastq1") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [fastq1, fastq2] - umi") { + + config "./umi.config" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [fastq1, fastq2] - custom sample") { + + config "./custom_sample.config" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [fastq1, fastq2] - stub") { + + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', 
checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + + } +} diff --git a/modules/nf-core/fgbio/fastqtobam/tests/main.nf.test.snap b/modules/nf-core/fgbio/fastqtobam/tests/main.nf.test.snap new file mode 100644 index 00000000..cc01344d --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/tests/main.nf.test.snap @@ -0,0 +1,114 @@ +{ + "homo_sapiens - [fastq1, fastq2] - cram": { + "content": [ + [ + + ], + "test.cram", + [ + "versions.yml:md5,468bbf74a89c7db86a209ad9bbfa7736" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-05T16:33:29.120923" + }, + "homo_sapiens - fastq1": { + "content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,468bbf74a89c7db86a209ad9bbfa7736" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-05T16:34:06.610383" + }, + "homo_sapiens - [fastq1, fastq2] - default": { + "content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,468bbf74a89c7db86a209ad9bbfa7736" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-05T16:33:02.837327" + }, + "homo_sapiens - [fastq1, fastq2] - umi": { + "content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,468bbf74a89c7db86a209ad9bbfa7736" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-05T16:34:25.224411" + }, + "homo_sapiens - [fastq1, fastq2] - bam": { + "content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,468bbf74a89c7db86a209ad9bbfa7736" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-05T16:33:47.975145" + }, + "homo_sapiens - [fastq1, fastq2] - custom sample": { + 
"content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,468bbf74a89c7db86a209ad9bbfa7736" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-05T16:34:43.606837" + }, + "homo_sapiens - [fastq1, fastq2] - stub": { + "content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,468bbf74a89c7db86a209ad9bbfa7736" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-05T16:34:57.596241" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/fastqtobam/tests/umi.config b/modules/nf-core/fgbio/fastqtobam/tests/umi.config new file mode 100644 index 00000000..7b668aa9 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/tests/umi.config @@ -0,0 +1,3 @@ +process { + ext.args = "--read-structures +T 12M11S+T" +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/filterconsensusreads/environment.yml b/modules/nf-core/fgbio/filterconsensusreads/environment.yml new file mode 100644 index 00000000..4dbb6856 --- /dev/null +++ b/modules/nf-core/fgbio/filterconsensusreads/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fgbio=2.5.21 diff --git a/modules/nf-core/fgbio/filterconsensusreads/fgbio-filterconsensusreads.diff b/modules/nf-core/fgbio/filterconsensusreads/fgbio-filterconsensusreads.diff new file mode 100644 index 00000000..9d55d91e --- /dev/null +++ b/modules/nf-core/fgbio/filterconsensusreads/fgbio-filterconsensusreads.diff @@ -0,0 +1,20 @@ +Changes in component 'nf-core/fgbio/filterconsensusreads' +'modules/nf-core/fgbio/filterconsensusreads/environment.yml' is unchanged +Changes in 'fgbio/filterconsensusreads/main.nf': +--- modules/nf-core/fgbio/filterconsensusreads/main.nf ++++ modules/nf-core/fgbio/filterconsensusreads/main.nf +@@ -8,8 +8,7 @@ + 
'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: +- tuple val(meta), path(bam) +- tuple val(meta2), path(fasta) ++ tuple val(meta), path(bam), path(fasta) + val(min_reads) + val(min_baseq) + val(max_base_error_rate) + +'modules/nf-core/fgbio/filterconsensusreads/meta.yml' is unchanged +'modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test' is unchanged +'modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test.snap' is unchanged +************************************************************ diff --git a/modules/nf-core/fgbio/filterconsensusreads/main.nf b/modules/nf-core/fgbio/filterconsensusreads/main.nf new file mode 100644 index 00000000..007887e2 --- /dev/null +++ b/modules/nf-core/fgbio/filterconsensusreads/main.nf @@ -0,0 +1,70 @@ +process FGBIO_FILTERCONSENSUSREADS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' : + 'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: + tuple val(meta), path(bam), path(fasta) + val(min_reads) + val(min_baseq) + val(max_base_error_rate) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}_consensus_filtered" + + def mem_gb = 8 + if (!task.memory) { + log.info '[fgbio FilterConsensusReads] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' 
+ } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + """ + fgbio \\ + -Xmx${mem_gb}g \\ + --tmp-dir=. \\ + --compression=0 \\ + FilterConsensusReads \\ + --input $bam \\ + --output ${prefix}.bam \\ + --ref ${fasta} \\ + --min-reads ${min_reads} \\ + --min-base-quality ${min_baseq} \\ + --max-base-error-rate ${max_base_error_rate} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}_consensus_filtered" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/filterconsensusreads/meta.yml b/modules/nf-core/fgbio/filterconsensusreads/meta.yml new file mode 100644 index 00000000..17c4f8a1 --- /dev/null +++ b/modules/nf-core/fgbio/filterconsensusreads/meta.yml @@ -0,0 +1,74 @@ +name: "fgbio_filterconsensusreads" +description: Uses FGBIO FilterConsensusReads to filter consensus reads generated by + CallMolecularConsensusReads or CallDuplexConsensusReads. 
+keywords: + - fgbio + - filter + - consensus + - umi + - duplexumi +tools: + - "fgbio": + description: "A set of tools for working with genomic and high throughput sequencing + data, including UMIs" + homepage: http://fulcrumgenomics.github.io/fgbio/ + documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/FilterConsensusReads.html + tool_dev_url: https://github.com/fulcrumgenomics/fgbio + licence: ["MIT"] + identifier: biotools:fgbio +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.bam" + ontologies: [] + - - meta2: + type: map + description: | + Groovy Map containing genome information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file containing genomic sequence information + pattern: "*.bam" + ontologies: [] + - min_reads: + type: integer + description: Minimum number of reads required to keep a consensus read + - min_baseq: + type: file + description: Minimum base quality to consider + ontologies: [] + - max_base_error_rate: + type: file + description: Maximum base error rate for a position before it is replaced with + an N. + ontologies: [] +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - ${prefix}.bam: + type: file + description: Filtered consensus BAM file + pattern: "*.bam" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@lescai" +maintainers: + - "@lescai" diff --git a/modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test b/modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test new file mode 100644 index 00000000..e4f3511f --- /dev/null +++ b/modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test @@ -0,0 +1,69 @@ +nextflow_process { + + name "Test Process FGBIO_FILTERCONSENSUSREADS" + script "../main.nf" + + process "FGBIO_FILTERCONSENSUSREADS" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/filterconsensusreads" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_duplex_consensus.bam', checkIfExists: true) + ] + input[1] = [[ id:'homo_sapiens'], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ] + input[2] = 3 + input[3] = 45 + input[4] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_duplex_consensus.bam', checkIfExists: true) + ] + input[1] = [[ id:'homo_sapiens'], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ] + input[2] = 3 + input[3] = 45 + input[4] = 0.2 + """ + } + } + 
+ then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test.snap b/modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test.snap new file mode 100644 index 00000000..4dff39ad --- /dev/null +++ b/modules/nf-core/fgbio/filterconsensusreads/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_filtered.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,be19391d55fe52c0fd32a844b1aceeb1" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_filtered.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,be19391d55fe52c0fd32a844b1aceeb1" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:37:18.521589" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_filtered.bam:md5,3d3c128a00a1e3c466275516f345daac" + ] + ], + "1": [ + "versions.yml:md5,be19391d55fe52c0fd32a844b1aceeb1" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_consensus_filtered.bam:md5,3d3c128a00a1e3c466275516f345daac" + ] + ], + "versions": [ + "versions.yml:md5,be19391d55fe52c0fd32a844b1aceeb1" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:37:04.297362" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/groupreadsbyumi/environment.yml b/modules/nf-core/fgbio/groupreadsbyumi/environment.yml new file mode 100644 index 00000000..4dbb6856 --- /dev/null +++ b/modules/nf-core/fgbio/groupreadsbyumi/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fgbio=2.5.21 diff --git a/modules/nf-core/fgbio/groupreadsbyumi/main.nf b/modules/nf-core/fgbio/groupreadsbyumi/main.nf new file mode 100644 index 00000000..f7725219 --- /dev/null +++ b/modules/nf-core/fgbio/groupreadsbyumi/main.nf @@ -0,0 +1,70 @@ +process FGBIO_GROUPREADSBYUMI { /* Runs `fgbio GroupReadsByUmi` on one BAM using the grouping strategy given in `strategy` (passed as -s). Emits the grouped BAM (prefix.bam), the family-size histogram (-f, prefix_histogram.txt), the grouping metrics (--grouping-metrics, prefix_read-metrics.txt) and versions.yml. The prefix defaults to meta.id + "_umi-grouped" and can be overridden with task.ext.prefix; it must differ from the input BAM name or the task aborts. */ + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' : + 'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: + tuple val(meta), path(bam) + val(strategy) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*histogram.txt") , emit: histogram + tuple val(meta), path("*read-metrics.txt"), emit: read_metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_umi-grouped" + def mem_gb = 8 /* JVM heap for -Xmx in GB: stays 8 when task.memory is unset or at least 8 GB, otherwise task.memory minus 1 GB with a floor of 1 GB */ + if (!task.memory) { + log.info '[fgbio GroupReadsByUmi] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' + } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + """ + fgbio \\ + -Xmx${mem_gb}g \\ + --tmp-dir=. 
\\ + GroupReadsByUmi \\ + -s $strategy \\ + $args \\ + -i $bam \\ + -o ${prefix}.bam \\ + -f ${prefix}_histogram.txt \\ + --grouping-metrics ${prefix}_read-metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}_umi-grouped" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.bam + touch ${prefix}_histogram.txt + touch ${prefix}_read-metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/groupreadsbyumi/meta.yml b/modules/nf-core/fgbio/groupreadsbyumi/meta.yml new file mode 100644 index 00000000..eb22ec10 --- /dev/null +++ b/modules/nf-core/fgbio/groupreadsbyumi/meta.yml @@ -0,0 +1,84 @@ +name: fgbio_groupreadsbyumi +description: | + Groups reads together that appear to have come from the same original molecule. + Reads are grouped by template, and then templates are sorted by the 5’ mapping positions + of the reads from the template, used from earliest mapping position to latest. + Reads that have the same end positions are then sub-grouped by UMI sequence. + (!) Note: the MQ tag is required on reads with mapped mates (!) + This can be added using samblaster with the optional argument --addMateTags. 
+keywords: + - UMI + - groupreads + - fgbio +tools: + - fgbio: + description: A set of tools for working with genomic and high throughput sequencing + data, including UMIs + homepage: http://fulcrumgenomics.github.io/fgbio/ + documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/ + tool_dev_url: https://github.com/fulcrumgenomics/fgbio + licence: ["MIT"] + identifier: biotools:fgbio +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + BAM file. Note: the MQ tag is required on reads with mapped mates (!) + pattern: "*.bam" + ontologies: [] + - strategy: + type: string + enum: ["Identity", "Edit", "Adjacency", "Paired"] + description: | + Required argument: defines the UMI assignment strategy. + Must be chosen among: Identity, Edit, Adjacency, Paired. +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: UMI-grouped BAM + pattern: "*.bam" + ontologies: [] + histogram: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*histogram.txt": + type: file + description: A text file containing the tag family size counts + pattern: "*.txt" + ontologies: [] + read_metrics: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*read-metrics.txt": + type: file + description: A text file containing the read count metrics from grouping + pattern: "*.txt" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@lescai" +maintainers: + - "@lescai" diff --git a/modules/nf-core/fgbio/groupreadsbyumi/tests/main.nf.test b/modules/nf-core/fgbio/groupreadsbyumi/tests/main.nf.test new file mode 100644 index 00000000..a9e8bd25 --- /dev/null +++ b/modules/nf-core/fgbio/groupreadsbyumi/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_process { + + name "Test Process FGBIO_GROUPREADSBYUMI" + script "../main.nf" + process "FGBIO_GROUPREADSBYUMI" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/groupreadsbyumi" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam', checkIfExists: true) + ] + input[1] = "Adjacency" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam', checkIfExists: true) + ] + input[1] = "Adjacency" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgbio/groupreadsbyumi/tests/main.nf.test.snap b/modules/nf-core/fgbio/groupreadsbyumi/tests/main.nf.test.snap new file mode 100644 index 00000000..00de4ac0 --- /dev/null +++ 
b/modules/nf-core/fgbio/groupreadsbyumi/tests/main.nf.test.snap @@ -0,0 +1,144 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped_read-metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,3e8002a4c4eef8dc0a715dd9585eeb5b" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "histogram": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "read_metrics": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped_read-metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,3e8002a4c4eef8dc0a715dd9585eeb5b" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:37:53.48947" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped.bam:md5,35bfc992c30d8e3e50816159fa58cb11" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped_histogram.txt:md5,9a0c622b65209afbce0840e2affff983" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped_read-metrics.txt:md5,a5f75e3e390e30791a636fed355e0afd" + ] + ], + "3": [ + "versions.yml:md5,3e8002a4c4eef8dc0a715dd9585eeb5b" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped.bam:md5,35bfc992c30d8e3e50816159fa58cb11" + ] + ], + "histogram": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test_umi-grouped_histogram.txt:md5,9a0c622b65209afbce0840e2affff983" + ] + ], + "read_metrics": [ + [ + { + "id": "test", + "single_end": false + }, + "test_umi-grouped_read-metrics.txt:md5,a5f75e3e390e30791a636fed355e0afd" + ] + ], + "versions": [ + "versions.yml:md5,3e8002a4c4eef8dc0a715dd9585eeb5b" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-10T08:59:32.932448" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/sortbam/environment.yml b/modules/nf-core/fgbio/sortbam/environment.yml new file mode 100644 index 00000000..9645b667 --- /dev/null +++ b/modules/nf-core/fgbio/sortbam/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + # renovate: datasource=conda depName=bioconda/fgbio + - bioconda::fgbio=2.5.21 diff --git a/modules/nf-core/fgbio/sortbam/main.nf b/modules/nf-core/fgbio/sortbam/main.nf new file mode 100644 index 00000000..3b3e6521 --- /dev/null +++ b/modules/nf-core/fgbio/sortbam/main.nf @@ -0,0 +1,61 @@ +process FGBIO_SORTBAM { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' : + 'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_sorted" + def mem_gb = 8 + if (!task.memory) { + log.info '[fgbio SortBam] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' + } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + """ + fgbio -Xmx${mem_gb}g \\ + --async-io=true \\ + --tmp-dir=. \\ + SortBam \\ + -i $bam \\ + $args \\ + -o ${prefix}.bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}_sorted" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/sortbam/meta.yml b/modules/nf-core/fgbio/sortbam/meta.yml new file mode 100644 index 00000000..81c295d6 --- /dev/null +++ b/modules/nf-core/fgbio/sortbam/meta.yml @@ -0,0 +1,50 @@ +name: fgbio_sortbam +description: Sorts a SAM or BAM file. 
Several sort orders are available, including + coordinate, queryname, random, and randomquery. +keywords: + - sort + - bam + - sam +tools: + - fgbio: + description: Tools for working with genomic and high throughput sequencing data. + homepage: https://github.com/fulcrumgenomics/fgbio + documentation: http://fulcrumgenomics.github.io/fgbio/ + licence: ["MIT"] + identifier: biotools:fgbio +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, collapse:false ] + - bam: + type: file + description: | + The input SAM or BAM file to be sorted. + pattern: "*.{bam,sam}" + ontologies: [] +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: | + Output SAM or BAM file. + pattern: "*.{bam,sam}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@sruthipsuresh" +maintainers: + - "@sruthipsuresh" diff --git a/modules/nf-core/fgbio/sortbam/tests/main.nf.test b/modules/nf-core/fgbio/sortbam/tests/main.nf.test new file mode 100644 index 00000000..2e9b2459 --- /dev/null +++ b/modules/nf-core/fgbio/sortbam/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process FGBIO_SORTBAM" + script "../main.nf" + process "FGBIO_SORTBAM" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/sortbam" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options 
"-stub" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgbio/sortbam/tests/main.nf.test.snap b/modules/nf-core/fgbio/sortbam/tests/main.nf.test.snap new file mode 100644 index 00000000..cb8d6768 --- /dev/null +++ b/modules/nf-core/fgbio/sortbam/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,c6608b61c38dcf9142a28a0d665eb96d" + ], + "bam": [ + [ + { + "id": "test" + }, + "test_sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c6608b61c38dcf9142a28a0d665eb96d" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:38:27.474292" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_sorted.bam:md5,1d7a558a72b7aecc80946cb9cadf8f60" + ] + ], + "1": [ + "versions.yml:md5,c6608b61c38dcf9142a28a0d665eb96d" + ], + "bam": [ + [ + { + "id": "test" + }, + "test_sorted.bam:md5,1d7a558a72b7aecc80946cb9cadf8f60" + ] + ], + "versions": [ + "versions.yml:md5,c6608b61c38dcf9142a28a0d665eb96d" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:38:12.994113" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/zipperbams/environment.yml b/modules/nf-core/fgbio/zipperbams/environment.yml new file mode 100644 index 00000000..4dbb6856 --- /dev/null +++ b/modules/nf-core/fgbio/zipperbams/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fgbio=2.5.21 diff --git a/modules/nf-core/fgbio/zipperbams/fgbio-zipperbams.diff b/modules/nf-core/fgbio/zipperbams/fgbio-zipperbams.diff new file mode 100644 index 00000000..48614230 --- /dev/null +++ b/modules/nf-core/fgbio/zipperbams/fgbio-zipperbams.diff @@ -0,0 +1,40 @@ +Changes in component 'nf-core/fgbio/zipperbams' +'modules/nf-core/fgbio/zipperbams/environment.yml' is unchanged +Changes in 'fgbio/zipperbams/main.nf': +--- modules/nf-core/fgbio/zipperbams/main.nf ++++ modules/nf-core/fgbio/zipperbams/main.nf +@@ -8,10 +8,8 @@ + 'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: +- tuple val(meta), path(unmapped_bam) +- tuple val(meta2), path(mapped_bam) +- tuple val(meta3), path(fasta) +- tuple val(meta4), path(dict) ++ ++ tuple val(meta), path(unmapped_bam), path(mapped_bam), path(fasta), path(dict) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam +@@ -22,7 +20,6 @@ + + script: + def args = task.ext.args ?: '' +- def args2 = task.ext.args2 ?: '' + def compression = task.ext.compression ?: '0' + prefix = task.ext.prefix ?: "${meta.id}_zipped" + def mem_gb = 8 +@@ -50,7 +47,6 @@ + ${args} \\ + --output ${prefix}.bam + +- + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + +'modules/nf-core/fgbio/zipperbams/meta.yml' is unchanged +'modules/nf-core/fgbio/zipperbams/tests/main.nf.test' is unchanged +'modules/nf-core/fgbio/zipperbams/tests/main.nf.test.snap' is unchanged +'modules/nf-core/fgbio/zipperbams/tests/nextflow.config' is unchanged +************************************************************ diff --git a/modules/nf-core/fgbio/zipperbams/main.nf b/modules/nf-core/fgbio/zipperbams/main.nf new file mode 100644 index 00000000..cb723439 --- 
/dev/null +++ b/modules/nf-core/fgbio/zipperbams/main.nf @@ -0,0 +1,69 @@ +process FGBIO_ZIPPERBAMS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4047e3e517b57fae311eab139a12f0887d898b7da5fceeb2a1029c73b9e3904/data' : + 'community.wave.seqera.io/library/fgbio:2.5.21--368dab1b4f308243' }" + + input: + + tuple val(meta), path(unmapped_bam), path(mapped_bam), path(fasta), path(dict) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def compression = task.ext.compression ?: '0' + prefix = task.ext.prefix ?: "${meta.id}_zipped" + def mem_gb = 8 + if (!task.memory) { + log.info '[fgbio ZipperBams] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' + } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + + if ("${unmapped_bam}" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + if ("${mapped_bam}" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ + """ + fgbio -Xmx${mem_gb}g \\ + --compression ${compression} \\ + --async-io=true \\ + ZipperBams \\ + --unmapped ${unmapped_bam} \\ + --input ${mapped_bam} \\ + --ref ${fasta} \\ + ${args} \\ + --output ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}_zipped" + if ("${unmapped_bam}" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + if ("${mapped_bam}" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/zipperbams/meta.yml b/modules/nf-core/fgbio/zipperbams/meta.yml new file mode 100644 index 00000000..052b50e1 --- /dev/null +++ b/modules/nf-core/fgbio/zipperbams/meta.yml @@ -0,0 +1,82 @@ +name: "fgbio_zipperbams" +description: FGBIO tool to zip together an unmapped and mapped BAM to transfer metadata + into the output BAM +keywords: + - fgbio + - umi + - unmapped + - ubam + - zipperbams +tools: + - fgbio: + description: A set of tools for working with genomic and high throughput sequencing + data, including UMIs + homepage: http://fulcrumgenomics.github.io/fgbio/ + documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/ + tool_dev_url: https://github.com/fulcrumgenomics/fgbio + licence: ["MIT"] + identifier: biotools:fgbio +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - unmapped_bam: + type: file + description: unmapped BAM file + pattern: "*.bam" + ontologies: [] + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - mapped_bam: + type: file + description: mapped BAM/SAM file + pattern: "*.{bam,sam}" + ontologies: [] + - - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'GRCh38' ] + - fasta: + type: file + description: fasta file containing genomic sequence information + pattern: "*.{fasta,fa}" + ontologies: [] + - - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'GRCh38' ] + - dict: + type: file + description: dict file containing a sequence dictionary for the fasta file + pattern: "*.{dict}" + ontologies: [] +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.bam: + type: file + description: Zipped BAM file + pattern: "*.bam" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@lescai" +maintainers: + - "@lescai" diff --git a/modules/nf-core/fgbio/zipperbams/tests/main.nf.test b/modules/nf-core/fgbio/zipperbams/tests/main.nf.test new file mode 100644 index 00000000..89f7ce5c --- /dev/null +++ b/modules/nf-core/fgbio/zipperbams/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_process { + + name "Test Process FGBIO_ZIPPERBAMS" + script "../main.nf" + process "FGBIO_ZIPPERBAMS" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/zipperbams" + + test("sarscov2 - bam") { + config "./nextflow.config" + + when { + process { + """ + input[0] = [ /* one tuple (meta, unmapped BAM, mapped BAM, fasta, dict): the patched process declares a single tuple input, so all files travel in input[0] */ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_unmapped.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_mapped.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.dict', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ /* same single five-element tuple as the non-stub test */ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_unmapped.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_mapped.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.dict', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgbio/zipperbams/tests/main.nf.test.snap b/modules/nf-core/fgbio/zipperbams/tests/main.nf.test.snap new file mode 100644 index 00000000..9ceb5b24 --- /dev/null +++ b/modules/nf-core/fgbio/zipperbams/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test_zipped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,786ad0edcd8c1ead6fd6d8f8a751f971" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_zipped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,786ad0edcd8c1ead6fd6d8f8a751f971" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:39:17.538398" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_zipped.bam:md5,1980b44177f4720f1005c9be62b09f79" + ] + ], + "1": [ + "versions.yml:md5,786ad0edcd8c1ead6fd6d8f8a751f971" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test_zipped.bam:md5,1980b44177f4720f1005c9be62b09f79" + ] + ], + "versions": [ + "versions.yml:md5,786ad0edcd8c1ead6fd6d8f8a751f971" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-06T16:38:57.558961" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/zipperbams/tests/nextflow.config b/modules/nf-core/fgbio/zipperbams/tests/nextflow.config new file mode 100644 index 00000000..4c4c4ddc --- /dev/null +++ b/modules/nf-core/fgbio/zipperbams/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: "FGBIO_ZIPPERBAMS" { + ext.args = "--tags-to-reverse Consensus --tags-to-revcomp Consensus" + } +} diff --git a/modules/nf-core/samtools/fastq/environment.yml b/modules/nf-core/samtools/fastq/environment.yml new file mode 100644 index 00000000..62054fc9 --- /dev/null +++ b/modules/nf-core/samtools/fastq/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::htslib=1.21 + - bioconda::samtools=1.21 diff --git a/modules/nf-core/samtools/fastq/main.nf b/modules/nf-core/samtools/fastq/main.nf new 
file mode 100644 index 00000000..bcc5d604 --- /dev/null +++ b/modules/nf-core/samtools/fastq/main.nf @@ -0,0 +1,60 @@ +process SAMTOOLS_FASTQ { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.21--h50ea8bc_0' : + 'biocontainers/samtools:1.21--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fastq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fastq") , optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fastq.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fastq.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fastq" : + meta.single_end ? "-1 ${prefix}_1.fastq.gz -s ${prefix}_singleton.fastq.gz" : + "-1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz -s ${prefix}_singleton.fastq.gz" + """ + # Note: --threads value represents *additional* CPUs to allocate (total CPUs = 1 + --threads). + samtools \\ + fastq \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fastq.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "touch ${prefix}_interleaved.fastq" : + meta.single_end ? 
"echo | gzip > ${prefix}_1.fastq.gz && echo | gzip > ${prefix}_singleton.fastq.gz" : + "echo | gzip > ${prefix}_1.fastq.gz && echo | gzip > ${prefix}_2.fastq.gz && echo | gzip > ${prefix}_singleton.fastq.gz" + """ + ${output} + echo | gzip > ${prefix}_other.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fastq/meta.yml b/modules/nf-core/samtools/fastq/meta.yml new file mode 100644 index 00000000..9a5bd42f --- /dev/null +++ b/modules/nf-core/samtools/fastq/meta.yml @@ -0,0 +1,96 @@ +name: samtools_fastq +description: Converts a SAM/BAM/CRAM file to FASTQ +keywords: + - bam + - sam + - cram + - fastq +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + identifier: biotools:samtools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: [] + - interleave: + type: boolean + description: Set true for interleaved fastq file +output: + fastq: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*_{1,2}.fastq.gz": + type: file + description: Compressed FASTQ file(s) with reads with either the READ1 or + READ2 flag set in separate files. + pattern: "*_{1,2}.fastq.gz" + ontologies: [] + interleaved: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*_interleaved.fastq": + type: file + description: Uncompressed FASTQ file with reads with either the READ1 or READ2 + flag set in a combined file. Needs collated input file. + pattern: "*_interleaved.fastq" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ + singleton: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*_singleton.fastq.gz": + type: file + description: Compressed FASTQ file with singleton reads + pattern: "*_singleton.fastq.gz" + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + other: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*_other.fastq.gz": + type: file + description: Compressed FASTQ file with reads with either both READ1 and READ2 + flags set or unset + pattern: "*_other.fastq.gz" + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@priyanka-surana" + - "@suzannejin" +maintainers: + - "@priyanka-surana" + - "@suzannejin" diff --git a/modules/nf-core/samtools/fastq/tests/main.nf.test b/modules/nf-core/samtools/fastq/tests/main.nf.test new file mode 100644 index 00000000..971ea1d4 --- /dev/null +++ b/modules/nf-core/samtools/fastq/tests/main.nf.test @@ -0,0 +1,119 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FASTQ" + script "../main.nf" + process "SAMTOOLS_FASTQ" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/fastq" + + test("bam") { + + when { + process { + """ + interleave = false + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = interleave + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.fastq[0][1].collect { path(it).linesGzip[0..6] }).match("bam_fastq") }, + { assert snapshot(process.out.interleaved).match("bam_interleaved") }, + { assert snapshot(file(process.out.singleton[0][1]).name).match("bam_singleton") }, + { assert snapshot(file(process.out.other[0][1]).name).match("bam_other") }, + { assert snapshot(process.out.versions).match("bam_versions") } + ) + } + } + + test("bam_interleave") { + + when { + process { + """ + interleave = true + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = interleave + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.fastq).match("bam_interleave_fastq") }, + { assert snapshot(path(process.out.interleaved[0][1]).readLines()[0..6]).match("bam_interlinterleave_eaved") }, + { assert snapshot(process.out.singleton).match("bam_singinterleave_leton") }, + { assert snapshot(file(process.out.other[0][1]).name).match("bam_interleave_other") }, + { assert snapshot(process.out.versions).match("bam_verinterleave_sions") } + ) + } + } + + test("bam - stub") { + + options "-stub" + + when { + process { + """ + interleave = false + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = interleave + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("bam_interleave - stub") { + + options "-stub" + + when { + process { + """ + interleave = true + + input[0] = Channel.of([ + [ id:'test', 
single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = interleave + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/samtools/fastq/tests/main.nf.test.snap b/modules/nf-core/samtools/fastq/tests/main.nf.test.snap new file mode 100644 index 00000000..ff63f9ae --- /dev/null +++ b/modules/nf-core/samtools/fastq/tests/main.nf.test.snap @@ -0,0 +1,287 @@ +{ + "bam_interlinterleave_eaved": { + "content": [ + [ + "@ERR5069949.2151832/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "+", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def extension = file(input).getExtension() == 'cram' ? + "crai" : args.contains("-c") ? "csi" : "bai" + """ + touch ${input}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 00000000..1bed6bca --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,77 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. 
+ These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + identifier: biotools:samtools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: input file + ontologies: [] +output: + bai: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bai": + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + ontologies: [] + csi: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.csi": + type: file + description: CSI index file + pattern: "*.{csi}" + ontologies: [] + crai: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.crai": + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config new file mode 100644 index 00000000..0ed260ef --- /dev/null +++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_INDEX { + ext.args = '-c' + } + +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test new file mode 100644 index 00000000..ca34fb5c --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test @@ -0,0 +1,140 @@ +nextflow_process { + + name "Test Process SAMTOOLS_INDEX" + script "../main.nf" + process "SAMTOOLS_INDEX" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/index" + + test("bai") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("crai") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("csi") 
{ + config "./csi.nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.csi[0][1]).name, + process.out.versions + ).match() } + ) + } + } + + test("bai - stub") { + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("crai - stub") { + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("csi - stub") { + options "-stub" + config "./csi.nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap new file mode 100644 index 00000000..72d65e81 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap @@ -0,0 +1,250 @@ +{ + "csi - stub": { + "content": [ + { + 
"0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ], + "bai": [ + + ], + "crai": [ + + ], + "csi": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T08:21:25.261127166" + }, + "crai - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ], + "bai": [ + + ], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T08:21:12.653194876" + }, + "bai - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "crai": [ + + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T08:21:01.854932651" + }, + "csi": { + "content": [ + 
"test.paired_end.sorted.bam.csi", + [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T08:20:51.485364222" + }, + "crai": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ], + "3": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ], + "bai": [ + + ], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T08:20:40.518873972" + }, + "bai": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ], + "crai": [ + + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,5e09a6fdf76de396728f877193d72315" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T08:20:21.184050361" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index cb90bae7..2108dad4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,19 @@ params { roi = null genelists = null + // UMI options + umi_group_strategy = 'adjacency' + callmolecularconsensusreads_min_reads = 3 + callmolecularconsensusreads_min_baseq = 20 + filterconsensusreads_min_reads = 3 + filterconsensusreads_min_baseq = 45 + 
filterconsensusreads_min_base_error_rate = 0.2 + + // UMI options according to KAPA HyperPlex UMI kit (potentially used in the future) + // callmolecularconsensusreads_max_reads = 50 + // callmolecularconsensusreads_output_per_base_tags = false + // callmolecularconsensusreads_read_name_prefix = 'consensus' + // References genomes = [:] @@ -284,3 +297,15 @@ validation { // Load modules.config for DSL2 module specific options includeConfig 'conf/modules.config' + + +// CONSENSUS SUBWORKFLOW - MODULE EXT SETTINGS + +process { + withName: 'FGBIO_FASTQTOBAM' { + ext.prefix = { "${meta.id}_ubam" } + } + withName: 'FGBIO_COPYUMIFROMREADNAME' { + ext.prefix = { "${meta.id}_copyumifromreadname" } + } +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 6a2a3685..45322b3a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -47,6 +47,43 @@ "description": "", "default": "", "properties": { + "umi_group_strategy": { + "type": "string", + "description": "Strategy for Mapped Bam => Grouped BAM ('identity', 'edit', 'adjacency', 'paired')", + "default": "adjacency", + "hidden": true, + "enum": ["identity", "edit", "adjacency", "paired"] + }, + "callmolecularconsensusreads_min_reads": { + "type": "integer", + "default": 3, + "description": "Minimum reads for callmolecularconsensusreads", + "hidden": true + }, + "callmolecularconsensusreads_min_baseq": { + "type": "integer", + "default": 20, + "description": "Minimum base quality for callmolecularconsensusreads", + "hidden": true + }, + "filterconsensusreads_min_reads": { + "type": "integer", + "default": 3, + "hidden": true, + "description": "Minimum reads for filterconsensusreads" + }, + "filterconsensusreads_min_baseq": { + "type": "integer", + "default": 45, + "hidden": true, + "description": "Minimum base quality for filterconsensusreads" + }, + "filterconsensusreads_min_base_error_rate": { + "type": "number", + "default": 0.2, + "hidden": true, + "description": "Maximum base error rate for
filterconsensusreads" + }, "aligner": { "type": "string", "default": "bowtie2", @@ -61,7 +98,6 @@ }, "umi_aware": { "type": "boolean", - "default": "false", "description": "Run markdup in UMI-aware mode. This applies to Samtools only and requires the UMI to be in the read name." }, "run_coverage": { @@ -71,8 +107,7 @@ }, "skip_trimming": { "type": "boolean", - "description": "Skip adapter trimming", - "default": false + "description": "Skip adapter trimming" }, "trim_front": { "type": "integer", @@ -86,27 +121,22 @@ }, "adapter_R1": { "type": "string", - "default": null, "description": "Adapter sequence to be trimmed" }, "adapter_R2": { "type": "string", - "default": null, "description": "Adapter sequence to be trimmed" }, "disable_picard_metrics": { "type": "boolean", - "default": false, "description": "Disable the calculation of (slow) Picard metrics" }, "roi": { "type": "string", - "default": null, "description": "Region of interest for coverage analysis to be applied to all samples" }, "genelists": { "type": "string", - "default": null, "exists": true, "format": "directory-path", "description": "Directory containing gene list bed files for granular coverage analysis" diff --git a/nf-test.config b/nf-test.config index 0688f302..8829a169 100644 --- a/nf-test.config +++ b/nf-test.config @@ -1,9 +1,11 @@ config { + plugins { + load "nft-bam@0.6.0" + } testsDir "tests" workDir ".nf-test" configFile "tests/config/nf-test.config" profile "docker" options "-dump-channels" - } diff --git a/subworkflows/local/consensus/main.nf b/subworkflows/local/consensus/main.nf new file mode 100644 index 00000000..5a3e6189 --- /dev/null +++ b/subworkflows/local/consensus/main.nf @@ -0,0 +1,214 @@ +#!/usr/bin/env nextflow + +include { FGBIO_COPYUMIFROMREADNAME } from '../../../modules/nf-core/fgbio/copyumifromreadname/main' +include { FGBIO_CALLMOLECULARCONSENSUSREADS } from '../../../modules/nf-core/fgbio/callmolecularconsensusreads/main' +include { FGBIO_FASTQTOBAM as 
FASTQTOBAM_READNAME } from '../../../modules/nf-core/fgbio/fastqtobam/main' +include { FGBIO_FASTQTOBAM as FASTQTOBAM_SEQ } from '../../../modules/nf-core/fgbio/fastqtobam/main' +include { FGBIO_FILTERCONSENSUSREADS } from '../../../modules/nf-core/fgbio/filterconsensusreads/main' +include { FGBIO_GROUPREADSBYUMI } from '../../../modules/nf-core/fgbio/groupreadsbyumi/main' +include { FGBIO_SORTBAM } from '../../../modules/nf-core/fgbio/sortbam/main' +include { FGBIO_ZIPPERBAMS as FGBIO_ZIPPERBAMS_RAW } from '../../../modules/nf-core/fgbio/zipperbams/main' +include { FGBIO_ZIPPERBAMS as FGBIO_ZIPPERBAMS_CONS } from '../../../modules/nf-core/fgbio/zipperbams/main' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_RAW } from '../../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_CONS } from '../../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_UMI } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_CONSENSUS } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_CONVERT as SAMTOOLS_CONVERT } from '../../../modules/nf-core/samtools/convert/main' +include { FASTQ_ALIGN_DNA as FASTQ_ALIGN_DNA_RAW } from '../../../subworkflows/nf-core/fastq_align_dna/main' +include { FASTQ_ALIGN_DNA as FASTQ_ALIGN_DNA_CONS } from '../../../subworkflows/nf-core/fastq_align_dna/main' + +workflow CONSENSUS { + take: + ch_umi_fastq // channel: [meta, fastq1, fastq2] + + main: + def ch_versions = Channel.empty() + def ch_ubam = Channel.empty() + + // 1.1: FASTQ => uBAM + + def ch_fastq = ch_umi_fastq + .map { meta, r1, r2 -> tuple(meta, [r1, r2]) } + .branch { meta, _fastq -> + readname: meta['umi_type'] == 'readname' + seq: meta['umi_type'] == 'seq' + } + + // Case 1: UMI_in_readname + if (ch_fastq.readname) { + FASTQTOBAM_READNAME(ch_fastq.readname) + ch_versions = ch_versions.mix(FASTQTOBAM_READNAME.out.versions) + + 
SAMTOOLS_INDEX_UMI(FASTQTOBAM_READNAME.out.bam) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_UMI.out.versions) + + FASTQTOBAM_READNAME.out.bam + .join(SAMTOOLS_INDEX_UMI.out.bai, by: 0) + .map { meta, bam, bai -> tuple(meta, bam, bai) } + .set { ch_ubam_with_bai } + + FGBIO_COPYUMIFROMREADNAME(ch_ubam_with_bai) + ch_versions = ch_versions.mix(FGBIO_COPYUMIFROMREADNAME.out.versions) + + ch_ubam = ch_ubam.mix(FGBIO_COPYUMIFROMREADNAME.out.bam) + } + + // Case 2: UMI_in_sequence + + if (ch_fastq.seq) { + FASTQTOBAM_SEQ(ch_fastq.seq) + ch_versions = ch_versions.mix(FASTQTOBAM_SEQ.out.versions) + + ch_ubam = ch_ubam.mix(FASTQTOBAM_SEQ.out.bam) + } + + // 1.2: uBAM => Mapped BAM + + SAMTOOLS_FASTQ_RAW(ch_ubam, true) + + ch_versions = ch_versions.mix(SAMTOOLS_FASTQ_RAW.out.versions) + + def ch_reads_aligner_index_fasta = SAMTOOLS_FASTQ_RAW.out.interleaved.map { meta, reads -> + def gd = (meta.genome_data instanceof Map) ? meta.genome_data : [:] + def alg = (meta.aligner ?: 'bwamem') + def fasta = file(gd.fasta, checkIfExists: true) + def index = file(gd[alg], checkIfExists: true) + tuple(meta, reads, alg, index, fasta) + } + + FASTQ_ALIGN_DNA_RAW(ch_reads_aligner_index_fasta, false) + ch_versions = ch_versions.mix(FASTQ_ALIGN_DNA_RAW.out.versions) + + def ch_mapped_bam = FASTQ_ALIGN_DNA_RAW.out.bam + + def ch_fasta_by_meta = ch_reads_aligner_index_fasta.map { meta, _r, _a, _i, fasta -> tuple(meta, fasta) } + + def ch_dict_by_meta = ch_reads_aligner_index_fasta.map { meta, _r, _a, _i, _fasta -> + def dict = file(meta.genome_data.dict, checkIfExists: true) + tuple(meta, dict) + } + ch_ubam + .join(ch_mapped_bam, by:0) + .join(ch_fasta_by_meta, by:0) + .join(ch_dict_by_meta, by:0) + .map { meta, ubam, mapped_bam, fasta, dict -> tuple(meta, ubam, mapped_bam, fasta, dict) } + .set { ch_zipperbam } + + FGBIO_ZIPPERBAMS_RAW(ch_zipperbam) + + ch_versions = ch_versions.mix(FGBIO_ZIPPERBAMS_RAW.out.versions) + + // 1.3: Mapped BAM => Grouped BAM + + + def ch_strategy = 
Channel.value(params.umi_group_strategy) + + FGBIO_GROUPREADSBYUMI( + FGBIO_ZIPPERBAMS_RAW.out.bam, + ch_strategy + ) + + ch_versions = ch_versions.mix(FGBIO_GROUPREADSBYUMI.out.versions) + def ch_grouped_bam = FGBIO_GROUPREADSBYUMI.out.bam + + // 2(b).1: GroupedBam -> Filtered Consensus uBam + def call_min_reads = Channel.value(params.callmolecularconsensusreads_min_reads) + def call_min_baseq = Channel.value(params.callmolecularconsensusreads_min_baseq) + + FGBIO_CALLMOLECULARCONSENSUSREADS( + ch_grouped_bam, + call_min_reads, + call_min_baseq + ) + + ch_versions = ch_versions.mix(FGBIO_CALLMOLECULARCONSENSUSREADS.out.versions) + + def ch_input_filterconsensusreads = FGBIO_CALLMOLECULARCONSENSUSREADS.out.bam.map {meta, bam -> + def fasta = file(meta.genome_data.fasta, checkIfExists: true) + tuple(meta, bam, fasta) + } + + def filter_min_reads = Channel.value(params.filterconsensusreads_min_reads) + def filter_min_baseq = Channel.value(params.filterconsensusreads_min_baseq) + def filter_min_base_error_rate = Channel.value(params.filterconsensusreads_min_base_error_rate) + + FGBIO_FILTERCONSENSUSREADS( + ch_input_filterconsensusreads, + filter_min_reads, + filter_min_baseq, + filter_min_base_error_rate + ) + + ch_versions = ch_versions.mix(FGBIO_FILTERCONSENSUSREADS.out.versions) + + ch_filtered_uBam = FGBIO_FILTERCONSENSUSREADS.out.bam + + // 2(b).2: Consensus Filtered uBam -> Consensus Mapped & Filtered BAM + + SAMTOOLS_FASTQ_CONS(ch_filtered_uBam, true) + ch_versions = ch_versions.mix(SAMTOOLS_FASTQ_CONS.out.versions) + + def ch_cons_reads_aligner_index_fasta = SAMTOOLS_FASTQ_CONS.out.interleaved.map { meta, reads -> + def gd = (meta.genome_data instanceof Map) ? 
meta.genome_data : [:] + def alg = (meta.aligner ?: 'bwamem') + def fasta = file(gd.fasta, checkIfExists: true) + def index = file(gd[alg], checkIfExists: true) + tuple(meta, reads, alg, index, fasta) + } + + FASTQ_ALIGN_DNA_CONS(ch_cons_reads_aligner_index_fasta, false) + ch_versions = ch_versions.mix(FASTQ_ALIGN_DNA_CONS.out.versions) + + def ch_cons_mapped_bam = FASTQ_ALIGN_DNA_CONS.out.bam + def ch_cons_fasta_by_meta = ch_cons_reads_aligner_index_fasta.map { meta, _r, _a, _i, fasta -> tuple(meta, fasta) } + def ch_cons_dict_by_meta = ch_cons_reads_aligner_index_fasta.map { meta, _r, _a, _i, _fasta -> + def dict = file(meta.genome_data.dict, checkIfExists: true) + tuple(meta, dict) + } + + ch_filtered_uBam + .join(ch_cons_mapped_bam, by: 0) + .join(ch_cons_fasta_by_meta, by: 0) + .join(ch_cons_dict_by_meta, by: 0) + .map { meta, ubam, mapped_bam, fasta, dict -> tuple(meta, ubam, mapped_bam, fasta, dict) } + .set { ch_cons_zipperbam } + + FGBIO_ZIPPERBAMS_CONS(ch_cons_zipperbam) + ch_versions = ch_versions.mix(FGBIO_ZIPPERBAMS_CONS.out.versions) + + FGBIO_SORTBAM(FGBIO_ZIPPERBAMS_CONS.out.bam) + ch_versions = ch_versions.mix(FGBIO_SORTBAM.out.versions) + + def ch_consensus_filtered_bam = FGBIO_SORTBAM.out.bam + + // Consensus_filtered_bam into CRAM (integration into pipeline) + + SAMTOOLS_INDEX_CONSENSUS(ch_consensus_filtered_bam) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_CONSENSUS.out.versions) + + def ch_sam_convert_bai_fasta_fai = SAMTOOLS_INDEX_CONSENSUS.out.bai.map {meta, bai -> + def gd = (meta.genome_data instanceof Map) ? 
meta.genome_data : [:] + def fasta = file(gd.fasta, checkIfExists: true) + def fai = file(gd.fai, checkIfExists: true) + tuple(meta, bai, fasta, fai) + } + + def ch_consensus_bam_convert = ch_consensus_filtered_bam + .join(ch_sam_convert_bai_fasta_fai, by: 0) + + SAMTOOLS_CONVERT(ch_consensus_bam_convert) + ch_versions = ch_versions.mix(SAMTOOLS_CONVERT.out.versions) + + SAMTOOLS_CONVERT.out.cram + .join(SAMTOOLS_CONVERT.out.crai, by: 0) + .map { meta, cram, crai -> tuple(meta, cram, crai) } + .set { ch_consensus_cram_crai } + + emit: + ubam = ch_ubam + consensus_bam = ch_consensus_filtered_bam + grouped_bam = ch_grouped_bam + filtered_ubam = ch_filtered_uBam + consensus_cram_crai = ch_consensus_cram_crai + versions = ch_versions +} diff --git a/tests/config/nf-test.config b/tests/config/nf-test.config index 68e597f9..c429ebe1 100644 --- a/tests/config/nf-test.config +++ b/tests/config/nf-test.config @@ -14,3 +14,16 @@ aws { connectionTimeout = 60000 } } + +params { + genomes { + GRCh38 { + bwamem = "s3://test-data/genomics/homo_sapiens/genome/bwa/" + dict = "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict" + fai = "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai" + fasta = "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna" + star = "s3://test-data/genomics/homo_sapiens/genome/star/" + gtf = "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.gtf" + } + } +} diff --git a/tests/subworkflows/local/consensus/main.nf.test b/tests/subworkflows/local/consensus/main.nf.test new file mode 100644 index 00000000..67be60b7 --- /dev/null +++ b/tests/subworkflows/local/consensus/main.nf.test @@ -0,0 +1,54 @@ +nextflow_workflow { + name "Test Subworkflow CONSENSUS" + script "subworkflows/local/consensus/main.nf" + workflow 
"CONSENSUS" + + tag "subworkflows" + tag "consensus" + tag "fgbio" + tag "umi" + tag "bwamem" + + test("test_consensus_reads_md5") { + + when { + workflow { + """ + input[0] = Channel.of([ + [ + id : 'umi_sample1', + samplename: 'umi_test1', + library : 'test', + organism : 'Homo sapiens', + umi_type : 'seq', + tag : 'WES', + aligner : 'bwamem', + genome_data: [ + fasta : params.genomes.GRCh38.fasta, + dict : params.genomes.GRCh38.dict, + bwamem: params.genomes.GRCh38.bwamem, + fai : params.genomes.GRCh38.fai + ] + ], + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz'), + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz') + ]) + """ + } + } + + then { + assertAll { + assert workflow.success + assert snapshot( + workflow.out.ubam.collect { bam(it[1]).getReadsMD5() }, + workflow.out.grouped_bam.collect { bam(it[1]).getReadsMD5() }, + workflow.out.filtered_ubam.collect { bam(it[1]).getReadsMD5() }, + workflow.out.consensus_bam.collect { bam(it[1]).getReadsMD5() }, + workflow.out.consensus_cram_crai.collect { bam(it[1]).getReadsMD5() }, + workflow.out.versions + ).match() + } + } + } +} diff --git a/tests/subworkflows/local/consensus/main.nf.test.snap b/tests/subworkflows/local/consensus/main.nf.test.snap new file mode 100644 index 00000000..166b52a1 --- /dev/null +++ b/tests/subworkflows/local/consensus/main.nf.test.snap @@ -0,0 +1,41 @@ +{ + "test_consensus_reads_md5": { + "content": [ + [ + "c21b5ba7ecc759f9d729a98420921e1e" + ], + [ + "d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "versions.yml:md5,37ec8cc6d3cdb55f06e84f325f511538", + "versions.yml:md5,383381d42173a31f86cce3ab6cf7299e", + "versions.yml:md5,4af2c1a5032fc1ce5ec09368f956480c", + 
"versions.yml:md5,5d6ccca9089e3268bb387820e8ba8f57", + "versions.yml:md5,5fa2a734f2d547c5de3b14c3a2dc5c02", + "versions.yml:md5,7e52768cb8257260977e38d745e51237", + "versions.yml:md5,92ec3e7cd231a6503321464eb83e6955", + "versions.yml:md5,a21193341a7ae0f02ad5ce8af54c1d4a", + "versions.yml:md5,ae3cd5c66636afc64cedd4e633531f2a", + "versions.yml:md5,b081ec66fb0e82580f58170743e4b910", + "versions.yml:md5,d4e922f9ba8e8b393a2994eed8fd89ae", + "versions.yml:md5,faa4838294afcfe9d81dc85e268903e5", + "versions.yml:md5,faf05dada3834ff602901efc1a334351" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-08-26T11:16:54.790224281" + } +} \ No newline at end of file diff --git a/workflows/preprocessing.nf b/workflows/preprocessing.nf index d23a17b1..bb054bb9 100644 --- a/workflows/preprocessing.nf +++ b/workflows/preprocessing.nf @@ -20,6 +20,8 @@ include { COVERAGE } from '../subworkflows/local/coverage/main' include { FASTQ_TO_UCRAM } from '../subworkflows/local/fastq_to_unaligned_cram/main' include { FASTQ_TO_CRAM } from '../subworkflows/local/fastq_to_aligned_cram/main' +include { CONSENSUS } from '../subworkflows/local/consensus/main' + // Functions include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -126,13 +128,14 @@ workflow PREPROCESSING { rg = rg + [ 'SM': samplename, 'LB': meta.library ?: "", 'PL': meta.platform ?: rg.PL, - 'ID': meta.readgroup ?: rg.ID + 'ID': rg.ID ] def meta_with_readgroup = meta + ['single_end': single_end, 'readgroup': rg] return [meta_with_readgroup, fastq] } .set {ch_input_fastq} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // ASSOCIATE CORRECT GENOME AND COUNT SAMPLE REPLICATES @@ -192,6 +195,26 @@ workflow PREPROCESSING { ch_fastq_per_sample.dump(tag:"FASTQ per sample", pretty: true) +/* 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// STEP: UMI CONSENSUS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + + // Convert to [meta, r1, r2] for the UMI subworkflow + def ch_umi_fastq = ch_fastq_per_sample.map { meta, reads -> + def r1 = (reads instanceof List) ? reads[0] : reads + def r2 = (reads instanceof List && reads.size() > 1) ? reads[1] : [] + return [meta, r1, r2] + } + .filter {meta, _r1, _r2 -> meta.umi_type != "none"} + + CONSENSUS(ch_umi_fastq) + ch_versions = ch_versions.mix(CONSENSUS.out.versions) + + def ch_consensus_cram_crai = CONSENSUS.out.consensus_cram_crai + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // FASTQ TRIMMING AND QC @@ -244,7 +267,6 @@ workflow PREPROCESSING { FASTQ_TO_UCRAM(ch_trimmed_reads.other) ch_versions = ch_versions.mix(FASTQ_TO_UCRAM.out.versions) -/* /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -281,6 +303,7 @@ workflow PREPROCESSING { */ FASTQ_TO_CRAM.out.cram_crai + .mix(ch_consensus_cram_crai) .filter{ meta, cram, crai -> meta.tag != "SNP" } @@ -438,7 +461,12 @@ def readgroup_from_fastq(path) { } assert line.startsWith('@') line = line.substring(1) - def fields = line.split(':') + + def parts = line.split(' ') + def left = parts[0] + def right = parts.size() > 1 ? parts[1] : "" + + def fields = left.split(':') def rg = [:] rg.CN = "CMGG" @@ -449,7 +477,11 @@ def readgroup_from_fastq(path) { def run_nubmer = fields[1] def fcid = fields[2] def lane = fields[3] - def index = fields[-1] =~ /[GATC+-]/ ? 
fields[-1] : "" + def index = "" + if (right) { + def r = right.split(':') + if (r && (r[-1] ==~ /[ACGTN+\-]+/)) index = r[-1] + } rg.ID = [fcid,lane].join(".") rg.PU = [fcid, lane, index].findAll().join(".") @@ -457,6 +489,15 @@ def readgroup_from_fastq(path) { } else if (fields.size() == 5) { def fcid = fields[0] rg.ID = fcid + rg.PU = fcid + rg.PL = "ILLUMINA" + } + else { + // fallback for non-CASAVA headers: use the first field if it is non-empty, otherwise "unknown" + def fallback = (fields && fields[0]) ? fields[0] : "unknown" + rg.ID = fallback + rg.PU = fallback + rg.PL = rg.PL ?: "ILLUMINA" + } return rg }