From 1877638cffdc164be20a05140219c2094833b76b Mon Sep 17 00:00:00 2001 From: Matthias De Smet <11850640+matthdsm@users.noreply.github.com> Date: Wed, 18 Feb 2026 09:43:57 +0100 Subject: [PATCH 1/3] WIP: steal fastq utils from sarek --- .vscode/settings.json | 3 + build.gradle | 6 + .../groovy/nfcmgg/plugin/CmggExtension.groovy | 22 +++ .../nfcmgg/plugin/utils/FastqUtils.groovy | 122 +++++++++++++ .../nfcmgg/plugin/utils/FastqUtilsTest.groovy | 162 ++++++++++++++++++ 5 files changed, 315 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 src/main/groovy/nfcmgg/plugin/utils/FastqUtils.groovy create mode 100644 src/test/groovy/nfcmgg/plugin/utils/FastqUtilsTest.groovy diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..849f79e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "java.compile.nullAnalysis.mode": "automatic" +} diff --git a/build.gradle b/build.gradle index bb13957..5346d0a 100644 --- a/build.gradle +++ b/build.gradle @@ -2,6 +2,12 @@ plugins { id 'io.nextflow.nextflow-plugin' version '1.0.0-beta.12' } +java { + toolchain { + languageVersion = JavaLanguageVersion.of(17) + } +} + dependencies { implementation 'com.squareup.okhttp3:okhttp:5.3.2' } diff --git a/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy b/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy index a58fc14..d8f1d0f 100644 --- a/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy +++ b/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy @@ -18,6 +18,7 @@ package nfcmgg.plugin import groovy.transform.CompileStatic import nextflow.Session import nextflow.plugin.extension.PluginExtensionPoint +import java.nio.file.Path /** * Implements a custom function which can be imported by @@ -26,6 +27,27 @@ import nextflow.plugin.extension.PluginExtensionPoint @CompileStatic class CmggExtension extends PluginExtensionPoint { + /* + * Parse first line of a FASTQ file, return the flowcell id and lane number. + */ + String flowcellLaneFromFastq(Path path) { + return nfcmgg.plugin.utils.FastqUtils.flowcellLaneFromFastq(path) + } + + /* + * Get first line of a FASTQ file + */ + String readFirstLineOfFastq(Path path) { + return nfcmgg.plugin.utils.FastqUtils.readFirstLineOfFastq(path) + } + + /* + * Add readgroup to meta and remove lane + */ + List addReadgroupToMeta(Map meta, List files, Map params) { + return nfcmgg.plugin.utils.FastqUtils.addReadgroupToMeta(meta, files, params) + } + @Override protected void init(Session session) { } diff --git a/src/main/groovy/nfcmgg/plugin/utils/FastqUtils.groovy b/src/main/groovy/nfcmgg/plugin/utils/FastqUtils.groovy new file mode 100644 index 0000000..bd6258d --- /dev/null +++ b/src/main/groovy/nfcmgg/plugin/utils/FastqUtils.groovy @@ -0,0 +1,122 @@ +/* groovylint-disable DuplicateNumberLiteral, LineLength */ +/* + * Copyright 2026, Center for Medical Genetics Ghent + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package nfcmgg.plugin.utils + +import groovy.transform.CompileStatic +import groovy.util.logging.Slf4j +import java.nio.file.Path +import java.nio.file.Files +import java.util.zip.GZIPInputStream + +/** + * Utility functions for FastQ file processing + */ +@Slf4j +@CompileStatic +class FastqUtils { + + /** + * Parse first line of a FASTQ file, return the flowcell id and lane number. + * + * @param path Path to the FastQ file + * @return Flowcell ID or null if not found + */ + static String flowcellLaneFromFastq(Path path) { + // First line of FASTQ file contains sequence identifier plus optional description + String firstLine = readFirstLineOfFastq(path) + String flowcellId = null + + // Expected format from ILLUMINA + // cf https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers + // Five fields: + // @::::... + // Seven fields or more (from CASAVA 1.8+): + // "@::::::..." + + String[] fields = firstLine ? firstLine.split(':') : new String[0] + if (fields.size() == 5) { + // Get the instrument name as flowcell ID + flowcellId = fields[0].substring(1) + } else if (fields.size() >= 7) { + // Get the actual flowcell ID + flowcellId = fields[2] + } else if (fields.size() != 0) { + log.warn("FASTQ file(${path}): Cannot extract flowcell ID from ${firstLine}") + } + return flowcellId + } + + /** + * Get first line of a FASTQ file + * + * @param path Path to the FastQ file + * @return The first line of the file + */ + static String readFirstLineOfFastq(Path path) { + String line = null + try { + InputStream is = Files.newInputStream(path) + is.withCloseable { InputStream wrapped -> + InputStream gzipStream = new GZIPInputStream(wrapped) + Reader decoder = new InputStreamReader(gzipStream, 'ASCII') + BufferedReader buffered = new BufferedReader(decoder) + line = buffered.readLine() + if (line && !line.startsWith('@')) { + log.warn("FASTQ file(${path}): First line does not start with '@'") + } + } + } catch (IOException e) { + log.warn("FASTQ file(${path}): Error streaming: ${e.message}") + } + return line + } + + /** + * Add readgroup to meta and remove lane + * + * @param meta Map containing sample metadata + * @param files List of FastQ files + * @param params Map containing pipeline parameters + * @return List containing updated meta map and files + */ + static List addReadgroupToMeta(Map meta, List files, Map params) { + String cn = params.seq_center ? "CN:${params.seq_center}\\t" : '' + String flowcell = flowcellLaneFromFastq(files[0]) + + // Check if flowcell ID matches + if (flowcell && files.size() > 1 && flowcell != flowcellLaneFromFastq(files[1])) { + throw new IllegalStateException("Flowcell ID does not match for paired reads of sample ${meta.id} - ${files}") + } + + // If we cannot read the flowcell ID from the fastq file, then we don't use it + String sampleLaneId = flowcell ? "${flowcell}.${meta.sample}.${meta.lane}" : "${meta.sample}.${meta.lane}" + + // Don't use a random element for ID, it breaks resuming + String readGroup = params.umi_read_structure + ? "\"@RG\\tID:${meta.sample}\\t${cn}PU:consensus\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + : "\"@RG\\tID:${sampleLaneId}\\t${cn}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + + // Create new meta map removing 'lane' and adding 'read_group' and 'sample_lane_id' + Map newMeta = new LinkedHashMap(meta) + newMeta.remove('lane') + newMeta.put('read_group', readGroup.toString()) + newMeta.put('sample_lane_id', sampleLaneId.toString()) + + return [newMeta, files] + } + +} diff --git a/src/test/groovy/nfcmgg/plugin/utils/FastqUtilsTest.groovy b/src/test/groovy/nfcmgg/plugin/utils/FastqUtilsTest.groovy new file mode 100644 index 0000000..8584a7e --- /dev/null +++ b/src/test/groovy/nfcmgg/plugin/utils/FastqUtilsTest.groovy @@ -0,0 +1,162 @@ +/* groovylint-disable JUnitPublicNonTestMethod, MethodName, JUnitPublicProperty, MethodReturnTypeRequired */ +/* + * Copyright 2026, Center for Medical Genetics Ghent + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package nfcmgg.plugin.utils + +import groovy.transform.CompileDynamic +import spock.lang.Specification +import spock.lang.TempDir +import java.nio.file.Path +import java.nio.file.Files +import java.util.zip.GZIPOutputStream + +/** + * Unit tests for FastqUtils + */ +@CompileDynamic +class FastqUtilsTest extends Specification { + + @TempDir + Path tempDir + + def "readFirstLineOfFastq should return first line of gzipped fastq"() { + given: + Path fastq = tempDir.resolve('test.fastq.gz') + String content = '@SEQ_ID\nAGCT' + GZIPOutputStream gzipOut = new GZIPOutputStream(Files.newOutputStream(fastq)) + gzipOut.write(content.bytes) + gzipOut.close() + + when: + String line = FastqUtils.readFirstLineOfFastq(fastq) + + then: + line == '@SEQ_ID' + } + + def "flowcellLaneFromFastq should extract flowcell from 5-field header"() { + given: + Path fastq = tempDir.resolve('test.fastq.gz') + // @:::: + String content = '@INSTRUMENT:1:1101:1234:5678\nAGCT' + GZIPOutputStream gzipOut = new GZIPOutputStream(Files.newOutputStream(fastq)) + gzipOut.write(content.bytes) + gzipOut.close() + + when: + String flowcell = FastqUtils.flowcellLaneFromFastq(fastq) + + then: + flowcell == 'INSTRUMENT' + } + + def "flowcellLaneFromFastq should extract flowcell from 7-field header"() { + given: + Path fastq = tempDir.resolve('test.fastq.gz') + // @:::::: + String content = '@INSTRUMENT:123:FLOWCELL_ID:1:1101:1234:5678\nAGCT' + GZIPOutputStream gzipOut = new GZIPOutputStream(Files.newOutputStream(fastq)) + gzipOut.write(content.bytes) + gzipOut.close() + + when: + String flowcell = FastqUtils.flowcellLaneFromFastq(fastq) + + then: + flowcell == 'FLOWCELL_ID' + } + + def "addReadgroupToMeta should add readgroup info"() { + given: + Path fastq1 = tempDir.resolve('sample_R1.fastq.gz') + String content = '@INSTRUMENT:1:1101:1234:5678\nAGCT' + GZIPOutputStream gzipOut1 = new GZIPOutputStream(Files.newOutputStream(fastq1)) + gzipOut1.write(content.bytes) + gzipOut1.close() + + List files = [fastq1] + Map meta = [id: 'sample1', sample: 'sample1', patient: 'patient1', lane: '1'] + Map params = [ + seq_center: 'CENTER', + fasta: 'ref.fa', + seq_platform: 'ILLUMINA', + umi_read_structure: false + ] + + when: + List result = FastqUtils.addReadgroupToMeta(meta, files, params) + Map newMeta = result[0] + + then: + newMeta.sample_lane_id == 'INSTRUMENT.sample1.1' + newMeta.read_group.contains('ID:INSTRUMENT.sample1.1') + newMeta.read_group.contains('CN:CENTER') + newMeta.read_group.contains('LB:sample1') + !newMeta.containsKey('lane') + } + + def "addReadgroupToMeta should handle missing flowcell"() { + given: + Path fastq1 = tempDir.resolve('sample_R1.fastq.gz') + String content = '@BAD_HEADER\nAGCT' + GZIPOutputStream gzipOut1 = new GZIPOutputStream(Files.newOutputStream(fastq1)) + gzipOut1.write(content.bytes) + gzipOut1.close() + + List files = [fastq1] + Map meta = [id: 'sample1', sample: 'sample1', patient: 'patient1', lane: '1'] + Map params = [ + seq_center: 'CENTER', + fasta: 'ref.fa', + seq_platform: 'ILLUMINA', + umi_read_structure: false + ] + + when: + List result = FastqUtils.addReadgroupToMeta(meta, files, params) + Map newMeta = result[0] + + then: + newMeta.sample_lane_id == 'sample1.1' + newMeta.read_group.contains('ID:sample1.1') + } + + def "addReadgroupToMeta should fail on mismatched flowcells"() { + given: + Path fastq1 = tempDir.resolve('sample_R1.fastq.gz') + String content1 = '@FC1:1:1101:1234:5678\nAGCT' + GZIPOutputStream gzipOut1 = new GZIPOutputStream(Files.newOutputStream(fastq1)) + gzipOut1.write(content1.bytes) + gzipOut1.close() + + Path fastq2 = tempDir.resolve('sample_R2.fastq.gz') + String content2 = '@FC2:1:1101:1234:5678\nAGCT' + GZIPOutputStream gzipOut2 = new GZIPOutputStream(Files.newOutputStream(fastq2)) + gzipOut2.write(content2.bytes) + gzipOut2.close() + + List files = [fastq1, fastq2] + Map meta = [id: 'sample1', sample: 'sample1', patient: 'patient1', lane: '1'] + Map params = [:] + + when: + FastqUtils.addReadgroupToMeta(meta, files, params) + + then: + thrown(IllegalStateException) + } + +} From fe83d0877dc3b49dc5745e6640224b83ad7f065c Mon Sep 17 00:00:00 2001 From: Matthias De Smet <11850640+matthdsm@users.noreply.github.com> Date: Wed, 18 Feb 2026 09:57:52 +0100 Subject: [PATCH 2/3] Update src/main/groovy/nfcmgg/plugin/CmggExtension.groovy Co-authored-by: Nicolas Vannieuwkerke <101190534+nvnieuwk@users.noreply.github.com> --- src/main/groovy/nfcmgg/plugin/CmggExtension.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy b/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy index d8f1d0f..75c6fda 100644 --- a/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy +++ b/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy @@ -19,6 +19,7 @@ import groovy.transform.CompileStatic import nextflow.Session import nextflow.plugin.extension.PluginExtensionPoint import java.nio.file.Path +import nextflow.plugin.extension.Function /** * Implements a custom function which can be imported by From e3e9fa319f01bc628327eb41d644bc2e2e28abdb Mon Sep 17 00:00:00 2001 From: Matthias De Smet <11850640+matthdsm@users.noreply.github.com> Date: Wed, 18 Feb 2026 09:58:14 +0100 Subject: [PATCH 3/3] Update src/main/groovy/nfcmgg/plugin/CmggExtension.groovy Co-authored-by: Nicolas Vannieuwkerke <101190534+nvnieuwk@users.noreply.github.com> --- src/main/groovy/nfcmgg/plugin/CmggExtension.groovy | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy b/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy index 75c6fda..7b06dfd 100644 --- a/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy +++ b/src/main/groovy/nfcmgg/plugin/CmggExtension.groovy @@ -31,6 +31,7 @@ class CmggExtension extends PluginExtensionPoint { /* * Parse first line of a FASTQ file, return the flowcell id and lane number. */ + @Function String flowcellLaneFromFastq(Path path) { return nfcmgg.plugin.utils.FastqUtils.flowcellLaneFromFastq(path) } @@ -38,6 +39,7 @@ class CmggExtension extends PluginExtensionPoint { /* * Get first line of a FASTQ file */ + @Function String readFirstLineOfFastq(Path path) { return nfcmgg.plugin.utils.FastqUtils.readFirstLineOfFastq(path) } @@ -45,6 +47,7 @@ class CmggExtension extends PluginExtensionPoint { /* * Add readgroup to meta and remove lane */ + @Function List addReadgroupToMeta(Map meta, List files, Map params) { return nfcmgg.plugin.utils.FastqUtils.addReadgroupToMeta(meta, files, params) }