Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"java.compile.nullAnalysis.mode": "automatic"
}
6 changes: 6 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@ plugins {
id 'io.nextflow.nextflow-plugin' version '1.0.0-beta.12'
}

java {
toolchain {
languageVersion = JavaLanguageVersion.of(17)
}
}

Comment on lines +5 to +10
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this needed? The nextflow gradle plugin already handles this

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do I know 🤷🏻 I just vibe coded this with a single prompt

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 let's remove it then

dependencies {
implementation 'com.squareup.okhttp3:okhttp:5.3.2'
}
Expand Down
26 changes: 26 additions & 0 deletions src/main/groovy/nfcmgg/plugin/CmggExtension.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ package nfcmgg.plugin
import groovy.transform.CompileStatic
import nextflow.Session
import nextflow.plugin.extension.PluginExtensionPoint
import java.nio.file.Path
import nextflow.plugin.extension.Function

/**
* Implements a custom function which can be imported by
Expand All @@ -26,6 +28,30 @@ import nextflow.plugin.extension.PluginExtensionPoint
@CompileStatic
class CmggExtension extends PluginExtensionPoint {

/*
* Parse first line of a FASTQ file, return the flowcell id and lane number.
*/
@Function
String flowcellLaneFromFastq(Path path) {
return nfcmgg.plugin.utils.FastqUtils.flowcellLaneFromFastq(path)
}

/*
* Get first line of a FASTQ file
*/
@Function
String readFirstLineOfFastq(Path path) {
return nfcmgg.plugin.utils.FastqUtils.readFirstLineOfFastq(path)
}

/*
* Add readgroup to meta and remove lane
*/
@Function
List addReadgroupToMeta(Map meta, List<Path> files, Map params) {
return nfcmgg.plugin.utils.FastqUtils.addReadgroupToMeta(meta, files, params)
}

@Override
protected void init(Session session) {
}
Expand Down
122 changes: 122 additions & 0 deletions src/main/groovy/nfcmgg/plugin/utils/FastqUtils.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/* groovylint-disable DuplicateNumberLiteral, LineLength */
/*
* Copyright 2026, Center for Medical Genetics Ghent
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nfcmgg.plugin.utils

import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import java.nio.file.Path
import java.nio.file.Files
import java.util.zip.GZIPInputStream

/**
* Utility functions for FastQ file processing
*/
@Slf4j
@CompileStatic
class FastqUtils {

/**
* Parse first line of a FASTQ file, return the flowcell id and lane number.
*
* @param path Path to the FastQ file
* @return Flowcell ID or null if not found
*/
static String flowcellLaneFromFastq(Path path) {
// First line of FASTQ file contains sequence identifier plus optional description
String firstLine = readFirstLineOfFastq(path)
String flowcellId = null

// Expected format from ILLUMINA
// cf https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers
// Five fields:
// @<instrument>:<lane>:<tile>:<x-pos>:<y-pos>...
// Seven fields or more (from CASAVA 1.8+):
// "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>..."

String[] fields = firstLine ? firstLine.split(':') : new String[0]
if (fields.size() == 5) {
// Get the instrument name as flowcell ID
flowcellId = fields[0].substring(1)
} else if (fields.size() >= 7) {
// Get the actual flowcell ID
flowcellId = fields[2]
} else if (fields.size() != 0) {
log.warn("FASTQ file(${path}): Cannot extract flowcell ID from ${firstLine}")
}
return flowcellId
}

/**
* Get first line of a FASTQ file
*
* @param path Path to the FastQ file
* @return The first line of the file
*/
static String readFirstLineOfFastq(Path path) {
String line = null
try {
InputStream is = Files.newInputStream(path)
is.withCloseable { InputStream wrapped ->
InputStream gzipStream = new GZIPInputStream(wrapped)
Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
BufferedReader buffered = new BufferedReader(decoder)
line = buffered.readLine()
if (line && !line.startsWith('@')) {
log.warn("FASTQ file(${path}): First line does not start with '@'")
}
}
} catch (IOException e) {
log.warn("FASTQ file(${path}): Error streaming: ${e.message}")
}
return line
}

/**
* Add readgroup to meta and remove lane
*
* @param meta Map containing sample metadata
* @param files List of FastQ files
* @param params Map containing pipeline parameters
* @return List containing updated meta map and files
*/
static List addReadgroupToMeta(Map meta, List<Path> files, Map params) {
String cn = params.seq_center ? "CN:${params.seq_center}\\t" : ''
String flowcell = flowcellLaneFromFastq(files[0])

// Check if flowcell ID matches
if (flowcell && files.size() > 1 && flowcell != flowcellLaneFromFastq(files[1])) {
throw new IllegalStateException("Flowcell ID does not match for paired reads of sample ${meta.id} - ${files}")
}

// If we cannot read the flowcell ID from the fastq file, then we don't use it
String sampleLaneId = flowcell ? "${flowcell}.${meta.sample}.${meta.lane}" : "${meta.sample}.${meta.lane}"

// Don't use a random element for ID, it breaks resuming
String readGroup = params.umi_read_structure
? "\"@RG\\tID:${meta.sample}\\t${cn}PU:consensus\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\""
: "\"@RG\\tID:${sampleLaneId}\\t${cn}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\""

// Create new meta map removing 'lane' and adding 'read_group' and 'sample_lane_id'
Map newMeta = new LinkedHashMap(meta)
newMeta.remove('lane')
newMeta.put('read_group', readGroup.toString())
newMeta.put('sample_lane_id', sampleLaneId.toString())

return [newMeta, files]
}

}
162 changes: 162 additions & 0 deletions src/test/groovy/nfcmgg/plugin/utils/FastqUtilsTest.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/* groovylint-disable JUnitPublicNonTestMethod, MethodName, JUnitPublicProperty, MethodReturnTypeRequired */
/*
* Copyright 2026, Center for Medical Genetics Ghent
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nfcmgg.plugin.utils

import groovy.transform.CompileDynamic
import spock.lang.Specification
import spock.lang.TempDir
import java.nio.file.Path
import java.nio.file.Files
import java.util.zip.GZIPOutputStream

/**
* Unit tests for FastqUtils
*/
@CompileDynamic
class FastqUtilsTest extends Specification {

@TempDir
Path tempDir

def "readFirstLineOfFastq should return first line of gzipped fastq"() {
given:
Path fastq = tempDir.resolve('test.fastq.gz')
String content = '@SEQ_ID\nAGCT'
GZIPOutputStream gzipOut = new GZIPOutputStream(Files.newOutputStream(fastq))
gzipOut.write(content.bytes)
gzipOut.close()

when:
String line = FastqUtils.readFirstLineOfFastq(fastq)

then:
line == '@SEQ_ID'
}

def "flowcellLaneFromFastq should extract flowcell from 5-field header"() {
given:
Path fastq = tempDir.resolve('test.fastq.gz')
// @<instrument>:<lane>:<tile>:<x-pos>:<y-pos>
String content = '@INSTRUMENT:1:1101:1234:5678\nAGCT'
GZIPOutputStream gzipOut = new GZIPOutputStream(Files.newOutputStream(fastq))
gzipOut.write(content.bytes)
gzipOut.close()

when:
String flowcell = FastqUtils.flowcellLaneFromFastq(fastq)

then:
flowcell == 'INSTRUMENT'
}

def "flowcellLaneFromFastq should extract flowcell from 7-field header"() {
given:
Path fastq = tempDir.resolve('test.fastq.gz')
// @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>
String content = '@INSTRUMENT:123:FLOWCELL_ID:1:1101:1234:5678\nAGCT'
GZIPOutputStream gzipOut = new GZIPOutputStream(Files.newOutputStream(fastq))
gzipOut.write(content.bytes)
gzipOut.close()

when:
String flowcell = FastqUtils.flowcellLaneFromFastq(fastq)

then:
flowcell == 'FLOWCELL_ID'
}

def "addReadgroupToMeta should add readgroup info"() {
given:
Path fastq1 = tempDir.resolve('sample_R1.fastq.gz')
String content = '@INSTRUMENT:1:1101:1234:5678\nAGCT'
GZIPOutputStream gzipOut1 = new GZIPOutputStream(Files.newOutputStream(fastq1))
gzipOut1.write(content.bytes)
gzipOut1.close()

List<Path> files = [fastq1]
Map meta = [id: 'sample1', sample: 'sample1', patient: 'patient1', lane: '1']
Map params = [
seq_center: 'CENTER',
fasta: 'ref.fa',
seq_platform: 'ILLUMINA',
umi_read_structure: false
]

when:
List result = FastqUtils.addReadgroupToMeta(meta, files, params)
Map newMeta = result[0]

then:
newMeta.sample_lane_id == 'INSTRUMENT.sample1.1'
newMeta.read_group.contains('ID:INSTRUMENT.sample1.1')
newMeta.read_group.contains('CN:CENTER')
newMeta.read_group.contains('LB:sample1')
!newMeta.containsKey('lane')
}

def "addReadgroupToMeta should handle missing flowcell"() {
given:
Path fastq1 = tempDir.resolve('sample_R1.fastq.gz')
String content = '@BAD_HEADER\nAGCT'
GZIPOutputStream gzipOut1 = new GZIPOutputStream(Files.newOutputStream(fastq1))
gzipOut1.write(content.bytes)
gzipOut1.close()

List<Path> files = [fastq1]
Map meta = [id: 'sample1', sample: 'sample1', patient: 'patient1', lane: '1']
Map params = [
seq_center: 'CENTER',
fasta: 'ref.fa',
seq_platform: 'ILLUMINA',
umi_read_structure: false
]

when:
List result = FastqUtils.addReadgroupToMeta(meta, files, params)
Map newMeta = result[0]

then:
newMeta.sample_lane_id == 'sample1.1'
newMeta.read_group.contains('ID:sample1.1')
}

def "addReadgroupToMeta should fail on mismatched flowcells"() {
given:
Path fastq1 = tempDir.resolve('sample_R1.fastq.gz')
String content1 = '@FC1:1:1101:1234:5678\nAGCT'
GZIPOutputStream gzipOut1 = new GZIPOutputStream(Files.newOutputStream(fastq1))
gzipOut1.write(content1.bytes)
gzipOut1.close()

Path fastq2 = tempDir.resolve('sample_R2.fastq.gz')
String content2 = '@FC2:1:1101:1234:5678\nAGCT'
GZIPOutputStream gzipOut2 = new GZIPOutputStream(Files.newOutputStream(fastq2))
gzipOut2.write(content2.bytes)
gzipOut2.close()

List<Path> files = [fastq1, fastq2]
Map meta = [id: 'sample1', sample: 'sample1', patient: 'patient1', lane: '1']
Map params = [:]

when:
FastqUtils.addReadgroupToMeta(meta, files, params)

then:
thrown(IllegalStateException)
}

}
Loading