diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..891debe --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,40 @@ +name: Bug report +description: Report something that is broken or incorrect, please ensure you have checked existing issues before submitting a new issue. +labels: [bug] +body: + - type: textarea + id: description + attributes: + label: Description of the bug + description: A clear and concise description of what the bug is. + validations: + required: true + + - type: textarea + id: command_used + attributes: + label: Command used and terminal output + description: Steps to reproduce the behaviour. Please paste the command you used to launch the pipeline and the output from your terminal. + render: console + placeholder: | + $ align_trim ... + + Some output where something broke + + - type: textarea + id: files + attributes: + label: Relevant files + description: | + Please drag and drop the relevant files here. Create a `.zip` archive if the extension is not allowed. + Your verbose log file (from stderr with --verbose enabled), the report TSV generated with `--report`, and your primer scheme bedfile are all helpful. + + - type: textarea + id: system + attributes: + label: System information + description: | + * Hardware _(eg. HPC, Desktop, Cloud)_ + * OS _(eg. CentOS Linux, macOS, Linux Mint)_ + * Install method _(eg. pip, conda, source)_ + * Version of align_trim _(eg. 1.1, 1.5, 1.8.2)_ diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..5180f20 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,11 @@ +name: Feature request +description: Suggest an idea for the align_trim, please ensure you have checked existing feature requests before submitting a new suggestion to avoid duplicates. +labels: enhancement +body: + - type: textarea + id: description + attributes: + label: Description of feature + description: Please describe your suggestion for a new feature. It might help to describe a problem or use case, plus any alternatives that you have considered. + validations: + required: true diff --git a/align_trim/main.py b/align_trim/main.py index 93aa997..4771684 100644 --- a/align_trim/main.py +++ b/align_trim/main.py @@ -182,7 +182,7 @@ def trim(segment, primer_pos, end, verbose=False): # softmask the left primer if not end: - # update the position of the leftmost mappinng base + # update the position of the leftmost mapping base segment.pos = pos - extra if verbose: print( @@ -342,19 +342,34 @@ def handle_segments( ) else: # locate the nearest primers to this alignment segment pair - p1 = find_primer_with_lookup( - lookup=lookup, - pos=segment1.reference_start, - direction="+", - chrom=segment1.reference_name, - ) - - p2 = find_primer_with_lookup( - lookup=lookup, - pos=segment2.reference_end, - direction="-", - chrom=segment2.reference_name, - ) + if segment1.reference_start < segment2.reference_start: + # if segment1 starts before segment2, then segment1 is the left segment relative to the reference + p1 = find_primer_with_lookup( + lookup=lookup, + pos=segment1.reference_start, + direction="+", + chrom=segment1.reference_name, + ) + p2 = find_primer_with_lookup( + lookup=lookup, + pos=segment2.reference_end, + direction="-", + chrom=segment2.reference_name, + ) + else: + # otherwise then segment2 is the left segment relative to the reference + p1 = find_primer_with_lookup( + lookup=lookup, + pos=segment2.reference_start, + direction="+", + chrom=segment2.reference_name, + ) + p2 = find_primer_with_lookup( + lookup=lookup, + pos=segment1.reference_end, + direction="-", + chrom=segment1.reference_name, + ) if not p1 or not p2: segment = segment1 if segment1 else segment2 @@ -366,8 +381,6 @@ def handle_segments( return False # check if primers are correctly paired and then assign read group - # NOTE: removed this as a function as only called once - # TODO: will try improving this / moving it to the primer scheme processing code correctly_paired = p1.amplicon_number == p2.amplicon_number if not paired: @@ -469,15 +482,14 @@ def handle_segments( return False # Check require-full-length - if not paired: - if args.require_full_length: - if segment.reference_start > p1.end or segment.reference_end < p2.start: - if args.verbose: - print( - f"{segment.query_name}: ref_start {segment.reference_start} > p1.end {p1.end} or ref_end {segment.reference_end} < p2.start {p2.start}, does not span a full amplicon, skipping", - file=sys.stderr, - ) - return False + if args.require_full_length: + if segment.reference_start > p1.end or segment.reference_end < p2.start: + if args.verbose: + print( + f"{segment.query_name}: ref_start {segment.reference_start} > p1.end {p1.end} or ref_end {segment.reference_end} < p2.start {p2.start}, does not span a full amplicon, skipping", + file=sys.stderr, + ) + return False # If not normalising, write the segment to the output file and add it to amplicon depth numpy array if not args.normalise: @@ -496,65 +508,46 @@ def handle_segments( return (amplicon, segment) else: - if segment1.reference_start < p1_position: - try: - trim(segment1, p1_position, False, args.verbose) - if args.verbose: - print( - f"{segment1.query_name}: ref start {segment1.reference_start} >= primer_position {p1_position}", - file=sys.stderr, + for segment_of_pair in (segment1, segment2): + if segment_of_pair.reference_start < p1_position: + try: + trim( + segment=segment_of_pair, + primer_pos=p1_position, + end=False, + verbose=args.verbose, ) - except Exception as e: - print( - f"{segment1.query_name}: Problem soft masking left primer (error: {e}), skipping", - file=sys.stderr, - ) - return False - - elif segment1.reference_end > p2_position: # type: ignore - try: - trim(segment1, p2_position, True, args.verbose) - if args.verbose: + if args.verbose: + print( + f"{segment_of_pair.query_name}: ref start {segment_of_pair.reference_start} >= primer_position {p1_position}", + file=sys.stderr, + ) + except Exception as e: print( - f"{segment1.query_name}: ref_end {segment1.reference_end} >= primer_position {p2_position}", + f"{segment_of_pair.query_name}: Problem soft masking left primer (error: {e}), skipping", file=sys.stderr, ) - except Exception as e: - print( - f"{segment1.query_name}: Problem soft masking right primer (error: {e}), skipping", - file=sys.stderr, - ) - return False + return False - # softmask the alignment if right primer start/end inside alignment - if segment2.reference_end > p2_position: # type: ignore - try: - trim(segment2, p2_position, True, args.verbose) - if args.verbose: - print( - f"{segment1.query_name}: ref_start {segment2.reference_start} >= primer_position {p2_position}", - file=sys.stderr, + if segment_of_pair.reference_end > p2_position: # type: ignore + try: + trim( + segment=segment_of_pair, + primer_pos=p2_position, + end=True, + verbose=args.verbose, ) - except Exception as e: - print( - f"{segment1.query_name}: Problem soft masking right primer (error: {e}), skipping", - file=sys.stderr, - ) - return False - elif segment2.reference_start < p1_position: - try: - trim(segment2, p1_position, False, args.verbose) - if args.verbose: + if args.verbose: + print( + f"{segment_of_pair.query_name}: ref_end {segment_of_pair.reference_end} >= primer_position {p2_position}", + file=sys.stderr, + ) + except Exception as e: print( - f"{segment1.query_name}: ref_end {segment2.reference_end} >= primer_position {p1_position}", + f"{segment_of_pair.query_name}: Problem soft masking right primer (error: {e}), skipping", file=sys.stderr, ) - except Exception as e: - print( - f"{segment1.query_name}: Problem soft masking left primer (error: {e}), skipping", - file=sys.stderr, - ) - return False + return False # check the the alignment still contains bases matching the reference if "M" not in segment1.cigarstring or "M" not in segment2.cigarstring: # type: ignore @@ -566,33 +559,40 @@ def handle_segments( return False if args.require_full_length: - if segment1.reference_start > p1.end or segment2.reference_end < p2.start: - if args.verbose: - print( - f"{segment1.query_name}: ref_start {segment1.reference_start} > p1.end {p1.end} or ref_end {segment2.reference_end} < p2.start {p2.start}, does not span a full amplicon, skipping", - file=sys.stderr, - ) - return False + if segment1.reference_start < segment2.reference_start: + if ( + segment1.reference_start > p1.end + or segment2.reference_end < p2.start + ): + if args.verbose: + print( + f"{segment1.query_name}: ref_start {segment1.reference_start} > p1.end {p1.end} or ref_end {segment2.reference_end} < p2.start {p2.start}, does not span a full amplicon, skipping", + file=sys.stderr, + ) + return False + else: + if ( + segment2.reference_start > p1.end + or segment1.reference_end < p2.start + ): + if args.verbose: + print( + f"{segment1.query_name}: ref_end {segment1.reference_end} < p2.start {p2.start} or ref_start {segment2.reference_start} > p1.end {p1.end}, does not span a full amplicon, skipping", + file=sys.stderr, + ) + return False # If not normalising, write the segments to the output file and add them to amplicon depth numpy array if not args.normalise: outfile_writer.write(segment1) outfile_writer.write(segment2) - segment1_amp_relative_start = segment1.reference_start - p1.start - segment1_amp_relative_end = segment1.reference_end - p1.start - if segment1_amp_relative_start < 0: - segment1_amp_relative_start = 0 - - segment2_amp_relative_start = segment2.reference_start - p1.start - segment2_amp_relative_end = segment2.reference_end - p1.start - if segment2_amp_relative_start < 0: - segment2_amp_relative_start = 0 - + for segment_in_pair in (segment1, segment2): + segment_amp_relative_start = segment_in_pair.reference_start - p1.start + segment_amp_relative_end = segment_in_pair.reference_end - p1.start + if segment_amp_relative_start < 0: + segment_amp_relative_start = 0 amp_depths[segment1.reference_name][amplicon][ - segment1_amp_relative_start:segment1_amp_relative_end - ] += 1 - amp_depths[segment2.reference_name][amplicon][ - segment2_amp_relative_start:segment2_amp_relative_end + segment_amp_relative_start:segment_amp_relative_end ] += 1 return (amplicon, False) @@ -813,8 +813,8 @@ def go(args): pools_str.add("unmatched") # open the input samfile and process read groups - if args.bamfile and args.bamfile != "-": - infile = pysam.AlignmentFile(args.bamfile, "rb") + if args.samfile and args.samfile != "-": + infile = pysam.AlignmentFile(args.samfile, "rb") else: infile = pysam.AlignmentFile("-", "rb") @@ -1034,7 +1034,7 @@ def go(args): def main(): parser = argparse.ArgumentParser( - description="Trim alignments from an amplicon scheme. Bam (input) can be provided by --bamfile or stdin" + description="Trim alignments from an amplicon scheme. Bam (input) can be provided by --samfile or stdin" ) parser.add_argument( "bedfile", @@ -1043,9 +1043,9 @@ def main(): metavar="BEDFILE", ) parser.add_argument( - "--bamfile", - "-b", - help="Sorted BAM file containing the aligned reads, if this is not provided (or '-') then 'align_trim' will read from stdin.", + "--samfile", + "-s", + help="Sorted SAM/BAM file containing the aligned reads, if this is not provided (or '-') then 'align_trim' will read from stdin.", required=False, ) parser.add_argument( diff --git a/tests/test_integration.py b/tests/test_integration.py index 74d555b..9fb48b7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,4 +1,5 @@ import pathlib +import sys import unittest import argparse from primalbedtools.scheme import Scheme @@ -7,6 +8,7 @@ import tempfile import pysam from align_trim.main import go, create_primer_lookup, find_primer_with_lookup +from collections import defaultdict BED_PATH_V5_3_2 = pathlib.Path(__file__).parent / "test_data/v5.3.2.primer.bed" BAM_PATH_V5_3_2 = pathlib.Path(__file__).parent / "test_data/sars-cov-2_v5.3.2.bam" @@ -16,11 +18,34 @@ ) +def read_pair_generator(bam, region_string=None): + """ + Generate read pairs in a BAM file or within a region string. + Reads are added to read_dict until a pair is found. + """ + read_dict = defaultdict(lambda: [None, None]) + for read in bam: + if not read.is_proper_pair: + continue + qname = read.query_name + if qname not in read_dict: + if read.is_read1: + read_dict[qname][0] = read + else: + read_dict[qname][1] = read + else: + if read.is_read1: + yield read, read_dict[qname][1] + else: + yield read_dict[qname][0], read + del read_dict[qname] + + def create_args(**kwargs): """Create a fake args object with default values matching main.py argument parser""" defaults = { "bedfile": str(BED_PATH_V5_3_2.absolute()), - "bamfile": str(BAM_PATH_V5_3_2.absolute()), + "samfile": str(BAM_PATH_V5_3_2.absolute()), "normalise": 0, "min_mapq": 20, "primer_match_threshold": 35, @@ -39,10 +64,10 @@ def create_args(**kwargs): class TestIntegration(unittest.TestCase): - def test_align_trim_trim_primers(self): + def test_align_trim_trim_se_no_norm(self): """Tests primers are trimmed correctly""" with tempfile.TemporaryDirectory( - dir="tests", suffix="-trim_primers" + dir="tests", suffix="-trim_primers_se_no_norm" ) as tempdir: tempdir_path = pathlib.Path(tempdir) output_sam = tempdir_path / "output.sam" @@ -89,6 +114,56 @@ def test_align_trim_trim_primers(self): rg = "unmatched" self.assertEqual(record.get_tag("RG"), rg) + def test_align_trim_trim_se_norm(self): + """Tests primers are trimmed correctly""" + with tempfile.TemporaryDirectory( + dir="tests", suffix="-trim_primers_se_norm" + ) as tempdir: + tempdir_path = pathlib.Path(tempdir) + output_sam = tempdir_path / "output.sam" + + # Create args with test-specific values + args = create_args(output=output_sam.absolute(), normalise=200) + + # Run + go(args) + + # Read in scheme, create look ups + scheme = Scheme.from_file(args.bedfile) + scheme.bedlines = merge_primers(scheme.bedlines) + amplicons = create_amplicons(scheme.bedlines) + pools = set([bl.pool for bl in scheme.bedlines]) + + ref_lengths = [("MN908947.3", 29903)] + primer_lookup = create_primer_lookup(ref_lengths, pools, amplicons, 35) + + # Check the out sam is as expected + for record in pysam.AlignmentFile(str(output_sam), "r"): + # Find the left primer + lp = find_primer_with_lookup( + primer_lookup, record.reference_start, "+", "MN908947.3" + ) + assert lp is not None + self.assertTrue( + record.reference_start >= lp.end + ) # lp.end is non inclusive + + # Find the right primer + rp = find_primer_with_lookup( + primer_lookup, record.reference_end, "-", "MN908947.3" + ) + assert rp is not None + self.assertTrue( + record.reference_end <= rp.start # type: ignore + ) # record.reference_end is non inclusive + + # Check rg is correct + if lp.amplicon_number == rp.amplicon_number: + rg = str(lp.pool) + else: + rg = "unmatched" + self.assertEqual(record.get_tag("RG"), rg) + def test_align_trim_write_reports(self): with tempfile.TemporaryDirectory( dir="tests", suffix="-write_report" @@ -177,11 +252,61 @@ def test_align_trim_require_full_length(self): "reference_end !>= rp.start", ) # record.reference_end is non inclusive - def test_align_trim_paired(self): + def test_align_trim_se_no_trim(self): + with tempfile.TemporaryDirectory(dir="tests", suffix="-se_no_trim") as tempdir: + tempdir_path = pathlib.Path(tempdir) + output_sam = tempdir_path / "output.sam" + + # Create args with report enabled + args = create_args( + output=output_sam.absolute(), + require_full_length=True, + allow_incorrect_pairs=False, + no_trim_primers=True, + ) + + # Run + go(args) + + # Read in scheme, create look ups + scheme = Scheme.from_file(args.bedfile) + scheme.bedlines = merge_primers(scheme.bedlines) + amplicons = create_amplicons(scheme.bedlines) + pools = set([bl.pool for bl in scheme.bedlines]) + + ref_lengths = [("MN908947.3", 29903)] + primer_lookup = create_primer_lookup(ref_lengths, pools, amplicons, 35) + + # Check the out sam is as expected + for record in pysam.AlignmentFile(str(output_sam), "r"): + # Find the left primer + lp = find_primer_with_lookup( + primer_lookup, record.reference_start, "+", "MN908947.3" + ) + assert lp is not None + self.assertLess( + record.reference_start, + lp.end, + f"{record.query_name} - reference_start !< lp.end", + ) # lp.end is non inclusive + + # Find the right primer + rp = find_primer_with_lookup( + primer_lookup, record.reference_end, "-", "MN908947.3" + ) + assert rp is not None + self.assertGreaterEqual( + record.reference_end, + rp.start, + f"{record.query_name} - reference_end !>= rp.start", + ) # record.reference_end is non inclusive + + def test_align_trim_paired_norm(self): with tempfile.TemporaryDirectory(dir="tests", suffix="-paired") as tempdir: tempdir_path = pathlib.Path(tempdir) output_bam = tempdir_path / "output.bam" amp_depths = tempdir_path / "amp_depths.tsv" + report = tempdir_path / "report.tsv" # Create args with report enabled args = create_args( @@ -189,8 +314,9 @@ def test_align_trim_paired(self): allow_incorrect_pairs=False, amp_depth_report=amp_depths, bedfile=BED_PATH_V3_0_0.absolute(), - bamfile=BAM_PATH_PAIRED_V3_0_0.absolute(), + samfile=BAM_PATH_PAIRED_V3_0_0.absolute(), normalise=200, + report=report, ) # Run @@ -221,6 +347,123 @@ def test_align_trim_paired(self): else: self.assertTrue(float(mean_depth) > 0) + def test_align_trim_paired_no_norm(self): + with tempfile.TemporaryDirectory( + dir="tests", suffix="-paired-no-norm" + ) as tempdir: + tempdir_path = pathlib.Path(tempdir) + output_bam = tempdir_path / "output.bam" + amp_depths = tempdir_path / "amp_depths.tsv" + + # Create args with report enabled + args = create_args( + output=output_bam.absolute(), + allow_incorrect_pairs=False, + amp_depth_report=amp_depths, + bedfile=BED_PATH_V3_0_0.absolute(), + samfile=BAM_PATH_PAIRED_V3_0_0.absolute(), + ) + + # Run + go(args) + + # Read in scheme, create look ups + scheme = Scheme.from_file(args.bedfile) + scheme.bedlines = merge_primers(scheme.bedlines) + + zero_depth_amps = [15, 29] + + with open(amp_depths, "r") as f: + next(f) # Skip header + for line in f: + chrom, amplicon, mean_depth = line.strip().split("\t") + self.assertEqual(chrom, "MN908947.3") + self.assertIn( + amplicon, + [ + str(a.amplicon_number) + for a in create_amplicons( + Scheme.from_file(args.bedfile).bedlines + ) + ], + ) + if int(amplicon) in zero_depth_amps: + self.assertEqual(float(mean_depth), 0.0) + else: + self.assertTrue(float(mean_depth) > 0) + + def test_align_trim_paired_full_length(self): + with tempfile.TemporaryDirectory( + dir="tests", suffix="-paired-full-length" + ) as tempdir: + tempdir_path = pathlib.Path(tempdir) + output_bam = tempdir_path / "output.bam" + amp_depths = tempdir_path / "amp_depths.tsv" + + # Create args with report enabled + args = create_args( + output=output_bam.absolute(), + allow_incorrect_pairs=False, + amp_depth_report=amp_depths, + bedfile=BED_PATH_V3_0_0.absolute(), + samfile=BAM_PATH_PAIRED_V3_0_0.absolute(), + require_full_length=True, + ) + + # Run + go(args) + + # Read in scheme, create look ups + scheme = Scheme.from_file(args.bedfile) + scheme.bedlines = merge_primers(scheme.bedlines) + amplicons = create_amplicons(scheme.bedlines) + pools = set([bl.pool for bl in scheme.bedlines]) + + ref_lengths = [("MN908947.3", 29903)] + primer_lookup = create_primer_lookup(ref_lengths, pools, amplicons, 35) + + # Check the out sam is as expected + for segment1, segment2 in read_pair_generator( + pysam.AlignmentFile(str(output_bam), "r") + ): + if segment1.reference_start < segment2.reference_start: + # Find the primers for the segments + lp = find_primer_with_lookup( + primer_lookup, segment1.reference_start, "+", "MN908947.3" + ) + rp = find_primer_with_lookup( + primer_lookup, segment2.reference_end, "-", "MN908947.3" + ) + self.assertLessEqual( + segment1.reference_start, + lp.end, + "forward segment reference_start !<= lp.end", + ) + self.assertGreaterEqual( + segment2.reference_end, + rp.start, + "reverse segment reference_end !>= rp.start", + ) + + else: + # Find the primers for the segments + lp = find_primer_with_lookup( + primer_lookup, segment2.reference_start, "+", "MN908947.3" + ) + rp = find_primer_with_lookup( + primer_lookup, segment1.reference_end, "-", "MN908947.3" + ) + self.assertLessEqual( + segment2.reference_start, + lp.end, + f"{segment2.query_name} - forward segment reference_start !<= lp.end", + ) + self.assertGreaterEqual( + segment1.reference_end, + rp.start, + f"{segment1.query_name} - reverse segment reference_end !>= rp.start", + ) + if __name__ == "__main__": unittest.main()