From 42b585f4b4b8e5cd31bde62576bbf6d4085c2baf Mon Sep 17 00:00:00 2001 From: mjung Date: Wed, 12 Jun 2019 17:17:41 -0700 Subject: [PATCH 1/5] Added support for Top Alleles, added more report options for the gtc_final_report script and added a new example script to support building matrix reports`x --- README.md | 2 +- examples/gtc_final_report.py | 14 +-- examples/gtc_final_report_matrix.py | 146 ++++++++++++++++++++++++++++ module/BeadPoolManifest.py | 88 +++++++++++++++-- module/GenotypeCalls.py | 16 ++- 5 files changed, 251 insertions(+), 15 deletions(-) create mode 100644 examples/gtc_final_report_matrix.py diff --git a/README.md b/README.md index c9246bf..004d500 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ The library depends on the availability of the numpy package in the python insta > for (locus, genotype) in zip( names, genotypes ): >   sys.stdout.write( locus + "," + code2genotype[genotype] + "\n" ) -Also, see examples/gtc_final_report.py for an additional example of usage. +Also, see examples/gtc_final_report.py and examples/gtc_final_report_matrix.py for additional examples of usage. ## GTC File Format The specification for the GTC file format is provided in docs/GTC_File_Format_v5.pdf diff --git a/examples/gtc_final_report.py b/examples/gtc_final_report.py index dd4174f..f88ce45 100644 --- a/examples/gtc_final_report.py +++ b/examples/gtc_final_report.py @@ -2,7 +2,7 @@ import argparse import os from datetime import datetime -from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype +from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype delim = "\t" @@ -18,7 +18,7 @@ sys.exit(-1) try: - manifest = BeadPoolManifest(args.manifest) + manifest = BeadPoolManifest.BeadPoolManifest(args.manifest) except: sys.stderr.write("Failed to read data from manifest\n") sys.exit(-1) @@ -39,15 +39,17 @@ output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") output_handle.write("[Data]\n") - output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n") + output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward", "Alleles - TOP", "Genotype Score"]) + "\n") for gtc_file in samples: sys.stderr.write("Processing " + gtc_file + "\n") gtc_file = os.path.join(args.gtc_directory, gtc_file) - gtc = GenotypeCalls(gtc_file) + gtc = GenotypeCalls.GenotypeCalls(gtc_file) genotypes = gtc.get_genotypes() plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands) forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands) + top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands) + genotype_scores = gtc.get_genotype_scores() assert len(genotypes) == len(manifest.names) - for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes): - output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n") + for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, genotype_score) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes, top_strand_genotypes, genotype_scores): + output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, str(genotype_score)]) + "\n") diff --git a/examples/gtc_final_report_matrix.py b/examples/gtc_final_report_matrix.py new file mode 100644 index 0000000..ef76311 --- /dev/null +++ b/examples/gtc_final_report_matrix.py @@ -0,0 +1,146 @@ +import sys +import argparse +import os +from datetime import datetime +from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype +from pandas import DataFrame +from numpy import around + +""" +This example will allow the user to create matrix report files identical to the ones exported from GenomeStudio +by chosing the genotype with or without the genotype scores. +""" + +def build_dict(matrix = None, samplename = None, names = None, genotypes = None, genotype_scores = None): + if genotype_scores is not None: + for (name, genotype, genotype_score) in zip(names, genotypes, genotype_scores): + if samplename in matrix: + matrix[samplename][name] = genotype + "|" + str(genotype_score) + else: + matrix[samplename] = {} + matrix[samplename][name] = genotype + "|" + str(genotype_score) + else: + for (name, genotype) in zip(names, genotypes): + if samplename in matrix: + matrix[samplename][name] = genotype + else: + matrix[samplename] = {} + matrix[samplename][name] = genotype + return matrix + +delim = "\t" +NUM_ARGUMENTS = 6 + +parser = argparse.ArgumentParser("Generate a final report from a directory of GTC files") +parser.add_argument("manifest", help="BPM manifest file") +parser.add_argument("gtc_directory", help="Directory containing GTC files") +parser.add_argument("output_file", help="Location to write report") +parser.add_argument("--forward", help="python gtc_final_report_matrix.py --forward 1, print matrix with forward alleles") +parser.add_argument("--forward_GC", help="python gtc_final_report_matrix.py --forward_GC 1, print matrix with forward alleles including genotype scores") +parser.add_argument("--top", help="python gtc_final_report_matrix.py --top 1, print matrix with top alleles") +parser.add_argument("--top_GC", help="python gtc_final_report_matrix.py --top_GC 1, print matrix with top alleles including genotype scores") +parser.add_argument("--AB", help="python gtc_final_report_matrix.py --forward 1, print matrix with forward alleles") +parser.add_argument("--AB_GC", help="python gtc_final_report_matrix.py --forward_GC 1, print matrix with forward alleles including genotype scores") +parser.add_argument("--plus", help="python gtc_final_report_matrix.py --top 1, print matrix with top alleles") +parser.add_argument("--plus_GC", help="python gtc_final_report_matrix.py --top_GC 1, print matrix with top alleles including genotype scores") + +args = parser.parse_args() + +if len(sys.argv) != NUM_ARGUMENTS: + sys.stderr.write("For matrix report user needs to provide either forward or top strand parameter with or without genotype score, can only build one report!\n") + sys.exit(-1) + +if os.path.isfile(args.output_file): + sys.stderr.write("Output file already exists, please delete and re-run\n") + sys.exit(-1) + +try: + manifest = BeadPoolManifest.BeadPoolManifest(args.manifest) +except: + sys.stderr.write("Failed to read data from manifest\n") + sys.exit(-1) + +with open(args.output_file, "w") as output_handle: + output_handle.write("[Header]\n") + output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n") + output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n") + output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n") + output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n") + + samples = [] + for gtc_file in os.listdir(args.gtc_directory): + if gtc_file.lower().endswith(".gtc"): + samples.append(gtc_file) + + output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n") + output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") + output_handle.write("[Data]\n") + + matrix_forward = {} + matrix_forward_GC = {} + matrix_top = {} + matrix_top_GC = {} + matrix_plus = {} + matrix_plus_GC = {} + matrix_AB = {} + matrix_AB_GC = {} + + for gtc_file in samples: + samplename = os.path.basename(gtc_file)[:-4] + sys.stderr.write("Processing " + gtc_file + "\n") + gtc_file = os.path.join(args.gtc_directory, gtc_file) + gtc = GenotypeCalls.GenotypeCalls(gtc_file) + genotypes = [code2genotype[genotype] for genotype in gtc.get_genotypes()] + assert len(genotypes) == len(manifest.names) + if args.plus or args.plus_GC: + plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands) + if args.forward or args.forward_GC: + forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands) + if args.top or args.top_GC: + top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands) + if args.forward_GC or args.top_GC or args.plus_GC or args.AB_GC: + genotype_scores = around(gtc.get_genotype_scores(), decimals = 4) + #build dictionary for pandas + if args.forward_GC: + matrix_forward_GC = build_dict(matrix = matrix_forward_GC, samplename = samplename, \ + names = manifest.names, genotypes = forward_strand_genotypes, genotype_scores = genotype_scores) + elif args.forward: + matrix_forward = build_dict(matrix = matrix_forward, samplename = samplename, \ + names = manifest.names, genotypes = forward_strand_genotypes) + elif args.top: + matrix_top = build_dict(matrix = matrix_top, samplename = samplename, \ + names = manifest.names, genotypes = top_strand_genotypes) + elif args.top_GC: + matrix_top_GC = build_dict(matrix = matrix_top_GC, samplename = samplename, \ + names = manifest.names, genotypes = top_strand_genotypes, genotype_scores = genotype_scores) + elif args.plus_GC: + matrix_plus_GC = build_dict(matrix = matrix_plus_GC, samplename = samplename, \ + names = manifest.names, genotypes = plus_strand_genotypes, genotype_scores = genotype_scores) + elif args.plus: + matrix_plus = build_dict(matrix = matrix_plus, samplename = samplename, \ + names = manifest.names, genotypes = plus_strand_genotypes) + elif args.AB: + matrix_AB = build_dict(matrix = matrix_AB, samplename = samplename, \ + names = manifest.names, genotypes = genotypes) + elif args.AB_GC: + matrix_AB_GC = build_dict(matrix = matrix_AB_GC, samplename = samplename, \ + names = manifest.names, genotypes = genotypes, genotype_scores = genotype_scores) + #create pandas dataframe from dictionaryand append to file + if args.forward_GC: + df = DataFrame.from_dict(matrix_forward_GC) + elif args.forward: + df = DataFrame.from_dict(matrix_forward) + elif args.top: + df = DataFrame.from_dict(matrix_top) + elif args.top_GC: + df = DataFrame.from_dict(matrix_top_GC) + elif args.plus_GC: + df = DataFrame.from_dict(matrix_plus_GC) + elif args.plus: + df = DataFrame.from_dict(matrix_plus) + elif args.AB: + df = DataFrame.from_dict(matrix_AB) + elif args.AB_GC: + df = DataFrame.from_dict(matrix_AB_GC) + df = df.reindex(manifest.names) + df.to_csv(output_handle, sep = delim) diff --git a/module/BeadPoolManifest.py b/module/BeadPoolManifest.py index dbf6bba..fadeca4 100644 --- a/module/BeadPoolManifest.py +++ b/module/BeadPoolManifest.py @@ -1,5 +1,6 @@ from .BeadArrayUtility import read_int, read_string, read_byte + class BeadPoolManifest(object): """ Class for parsing binary (BPM) manifest file. @@ -14,6 +15,7 @@ class BeadPoolManifest(object): list of normalization transforms read from GTC file ref_strands (list(int)): Reference strand annotation for loci (see RefStrand class) source_strands (list(int)): Source strand annotations for loci (see SourceStrand class) + ilmn_strands (list(int)): Ilumina strand annotations for loci (see IlmnStrand class) num_loci (int): Number of loci in manifest manifest_name (string): Name of manifest control_config (string): Control description from manifest @@ -39,6 +41,7 @@ def __init__(self, filename): self.normalization_lookups = [] self.ref_strands = [] self.source_strands = [] + self.ilmn_strands = [] self.num_loci = 0 self.manifest_name = "" self.control_config = "" @@ -100,6 +103,7 @@ def __parse_file(self, manifest_file): self.map_infos = [0] * self.num_loci self.ref_strands = [RefStrand.Unknown] * self.num_loci self.source_strands = [SourceStrand.Unknown] * self.num_loci + self.ilmn_strands = [IlmnStrand.Unknown] * self.num_loci for idx in xrange(self.num_loci): locus_entry = LocusEntry(manifest_handle) self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type @@ -109,6 +113,7 @@ def __parse_file(self, manifest_file): self.map_infos[name_lookup[locus_entry.name]] = locus_entry.map_info self.ref_strands[name_lookup[locus_entry.name]] = locus_entry.ref_strand self.source_strands[name_lookup[locus_entry.name]] = locus_entry.source_strand + self.ilmn_strands[name_lookup[locus_entry.name]] = locus_entry.ilmn_strand if len(self.normalization_ids) != len(self.assay_types): raise Exception( @@ -179,6 +184,71 @@ def from_string(source_strand): raise ValueError( "Unexpected value for source strand " + source_strand) + +class IlmnStrand(object): + Unknown = 0 + TOP = 1 + BOT = 2 + PLUS = 1 + MINUS = 2 + + @staticmethod + def to_string(ilmn_strand): + """Get an integer representation of Illumina strand annotation + + Args: + ilmn_strand (str) : string representation of Illumina strand annotation (e.g., "T") + + Returns: + int : int representation of Illumina strand annotation (e.g. IlmnStrand.TOP) + + Raises: + ValueError: Unexpected value for Illumina strand + """ + if ilmn_strand == IlmnStrand.Unknown: + return "U" + elif ilmn_strand == IlmnStrand.TOP: + return "T" + elif ilmn_strand == IlmnStrand.BOT: + return "B" + elif ilmn_strand == IlmnStrand.PLUS: + return "P" + elif ilmn_strand == IlmnStrand.MINUS: + return "M" + + else: + raise ValueError( + "Unexpected value for ilmn strand " + ilmn_strand) + + @staticmethod + def from_string(ilmn_strand): + """ + Get a string representation of Illumina strand annotation + + Args: + ilmn_strand (int) : int representation of Illumina strand (e.g., IlmnStrand.TOP) + + Returns: + str : string representation of Illumina strand annotation + + Raises: + ValueError: Unexpected value for Illumina strand + """ + if ilmn_strand == "U" or ilmn_strand == "": + return IlmnStrand.Unknown + if ilmn_strand == "T": + return IlmnStrand.TOP + elif ilmn_strand == "B": + return IlmnStrand.BOT + if ilmn_strand == "P": + return IlmnStrand.PLUS + elif ilmn_strand == "M": + return IlmnStrand.MINUS + else: + raise ValueError( + "Unexpected value for ilmn strand " + ilmn_strand) + + class RefStrand(object): Unknown = 0 Plus = 1 @@ -188,13 +258,13 @@ class RefStrand(object): def to_string(ref_strand): """ Get a string reprensetation of ref strand annotation - + Args: ref_strand (int) : int representation of ref strand (e.g., RefStrand.Plus) - + Returns: str : string representation of reference strand annotation - + Raises: ValueError: Unexpected value for reference strand """ @@ -211,13 +281,13 @@ def to_string(ref_strand): @staticmethod def from_string(ref_strand): """Get an integer representation of ref strand annotation - + Args: ref_strand (str) : string representation of reference strand annotation (e.g., "+") - + Returns: int : int representation of reference strand annotation (e.g. RefStrand.Plus) - + Raises: ValueError: Unexpected value for reference strand """ @@ -235,7 +305,7 @@ class LocusEntry(object): """ Helper class representing a locus entry within a bead pool manifest. Current only support version 6,7, and 8. - + Attributes: ilmn_id (string) : IlmnID (probe identifier) of locus name (string): Name (variant identifier) of locus @@ -247,6 +317,7 @@ class LocusEntry(object): address_b (int) : AddressB ID of locus (0 if none) ref_strand (int) : See RefStrand class source_strand (int) : See SourceStrand class + ilmn_strand (int) : See IlmnStrand class """ def __init__(self, handle): @@ -269,6 +340,7 @@ def __init__(self, handle): self.address_b = -1 self.ref_strand = RefStrand.Unknown self.source_strand = SourceStrand.Unknown + self.ilmn_strand = IlmnStrand.Unknown self.__parse_file(handle) def __parse_file(self, handle): @@ -308,6 +380,8 @@ def __parse_locus_version_6(self, handle): self.ilmn_id = read_string(handle) self.source_strand = SourceStrand.from_string( self.ilmn_id.split("_")[-2]) + self.ilmn_strand = IlmnStrand.from_string( + self.ilmn_id.split("_")[-3]) self.name = read_string(handle) for idx in xrange(3): read_string(handle) diff --git a/module/GenotypeCalls.py b/module/GenotypeCalls.py index 80b12ec..053a94b 100644 --- a/module/GenotypeCalls.py +++ b/module/GenotypeCalls.py @@ -1,7 +1,7 @@ import struct from numpy import cos, sin, pi, arctan2, float32, uint16, int32, seterr, frombuffer, dtype from .BeadArrayUtility import read_int, read_string, read_byte, read_float, read_char, read_ushort, complement -from .BeadPoolManifest import RefStrand, SourceStrand +from .BeadPoolManifest import RefStrand, SourceStrand, IlmnStrand seterr(divide='ignore', invalid='ignore') nan = float32('nan') @@ -374,6 +374,20 @@ def get_base_calls_forward_strand(self, snps, forward_strand_annotations): """ return self.get_base_calls_generic(snps, forward_strand_annotations, SourceStrand.Forward, RefStrand.Unknown) + def get_base_calls_TOP_strand(self, snps, ilmn_strand_annotations): + """ + Get base calls on the Illumina strand. + + Args: + snps (list(string)) : A list of string representing the snp on the design strand for the loci (e.g. [A/C]) + ilmn_strand_annotations (list(int)) : A list of strand annotations for the loci (e.g., IlmnStrand.TOP) + + Returns: + The genotype basecalls on the report strand as a list of strings. + The characters are A, C, G, T, or - for a no-call/null. + """ + return self.get_base_calls_generic(snps, ilmn_strand_annotations, IlmnStrand.TOP, RefStrand.Unknown) + def get_base_calls(self): """ Returns: From 515ac5cb7dd68339c1d9729580dd931c1d44a9fd Mon Sep 17 00:00:00 2001 From: mjung Date: Thu, 13 Jun 2019 10:06:58 -0700 Subject: [PATCH 2/5] Added modifications requested by Ryan --- module/BeadPoolManifest.py | 6 +++--- module/GenotypeCalls.py | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/module/BeadPoolManifest.py b/module/BeadPoolManifest.py index fadeca4..354b18f 100644 --- a/module/BeadPoolManifest.py +++ b/module/BeadPoolManifest.py @@ -189,8 +189,8 @@ class IlmnStrand(object): Unknown = 0 TOP = 1 BOT = 2 - PLUS = 1 - MINUS = 2 + PLUS = 3 + MINUS = 4 @staticmethod def to_string(ilmn_strand): @@ -234,7 +234,7 @@ def from_string(ilmn_strand): Raises: ValueError: Unexpected value for Illumina strand """ - if ilmn_strand == "U" or ilmn_strand == "": + if ilmn_strand == "U": return IlmnStrand.Unknown if ilmn_strand == "T": return IlmnStrand.TOP diff --git a/module/GenotypeCalls.py b/module/GenotypeCalls.py index 053a94b..53e0f57 100644 --- a/module/GenotypeCalls.py +++ b/module/GenotypeCalls.py @@ -320,15 +320,20 @@ def get_base_calls_generic(self, snps, strand_annotations, report_strand, unknow Raises: ValueError: Number of SNPs or ref strand annotations not matched to entries in GTC file """ + if type(report_strand) != list: + report_strands = [report_strand] + else: + report_strands = report_strand + genotypes = self.get_genotypes() if len(genotypes) != len(snps): raise ValueError( "The number of SNPs must match the number of loci in the GTC file") - + if len(genotypes) != len(strand_annotations): raise ValueError( "The number of reference strand annotations must match the number of loci in the GTC file") - + result = [] for (genotype, snp, strand_annotation) in zip(genotypes, snps, strand_annotations): ab_genotype = code2genotype[genotype] @@ -340,8 +345,10 @@ def get_base_calls_generic(self, snps, strand_annotations, report_strand, unknow report_strand_nucleotides = [] for ab_allele in ab_genotype: nucleotide_allele = a_nucleotide if ab_allele == "A" else b_nucleotide - report_strand_nucleotides.append( - nucleotide_allele if strand_annotation == report_strand else complement(nucleotide_allele)) + for report_strand in report_strands: + report_strand_nucleotides.append( + nucleotide_allele if strand_annotation == report_strand else complement(nucleotide_allele) + ) result.append("".join(report_strand_nucleotides)) return result @@ -386,7 +393,7 @@ def get_base_calls_TOP_strand(self, snps, ilmn_strand_annotations): The genotype basecalls on the report strand as a list of strings. The characters are A, C, G, T, or - for a no-call/null. """ - return self.get_base_calls_generic(snps, ilmn_strand_annotations, IlmnStrand.TOP, RefStrand.Unknown) + return self.get_base_calls_generic(snps, ilmn_strand_annotations, [IlmnStrand.TOP, IlmnStrand.PLUS], IlmnStrand.Unknown) def get_base_calls(self): """ From 844a5c9048f35c10f4097b1b67d43b8091a755fb Mon Sep 17 00:00:00 2001 From: mjung Date: Thu, 13 Jun 2019 11:19:10 -0700 Subject: [PATCH 3/5] minor text changes --- examples/gtc_final_report_matrix.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/gtc_final_report_matrix.py b/examples/gtc_final_report_matrix.py index ef76311..8e27d0e 100644 --- a/examples/gtc_final_report_matrix.py +++ b/examples/gtc_final_report_matrix.py @@ -31,18 +31,18 @@ def build_dict(matrix = None, samplename = None, names = None, genotypes = None, delim = "\t" NUM_ARGUMENTS = 6 -parser = argparse.ArgumentParser("Generate a final report from a directory of GTC files") +parser = argparse.ArgumentParser("Generate a final matrix report by specifying the report strand from a directory of GTC files") parser.add_argument("manifest", help="BPM manifest file") parser.add_argument("gtc_directory", help="Directory containing GTC files") parser.add_argument("output_file", help="Location to write report") -parser.add_argument("--forward", help="python gtc_final_report_matrix.py --forward 1, print matrix with forward alleles") -parser.add_argument("--forward_GC", help="python gtc_final_report_matrix.py --forward_GC 1, print matrix with forward alleles including genotype scores") -parser.add_argument("--top", help="python gtc_final_report_matrix.py --top 1, print matrix with top alleles") -parser.add_argument("--top_GC", help="python gtc_final_report_matrix.py --top_GC 1, print matrix with top alleles including genotype scores") -parser.add_argument("--AB", help="python gtc_final_report_matrix.py --forward 1, print matrix with forward alleles") -parser.add_argument("--AB_GC", help="python gtc_final_report_matrix.py --forward_GC 1, print matrix with forward alleles including genotype scores") -parser.add_argument("--plus", help="python gtc_final_report_matrix.py --top 1, print matrix with top alleles") -parser.add_argument("--plus_GC", help="python gtc_final_report_matrix.py --top_GC 1, print matrix with top alleles including genotype scores") +parser.add_argument("--forward", help="python gtc_final_report_matrix.py --forward 1; print matrix with forward alleles") +parser.add_argument("--forward_GC", help="python gtc_final_report_matrix.py --forward_GC 1; print matrix with forward alleles including genotype scores") +parser.add_argument("--top", help="python gtc_final_report_matrix.py --top 1; print matrix with top alleles") +parser.add_argument("--top_GC", help="python gtc_final_report_matrix.py --top_GC 1; print matrix with top alleles including genotype scores") +parser.add_argument("--AB", help="python gtc_final_report_matrix.py --forward 1; print matrix with forward alleles") +parser.add_argument("--AB_GC", help="python gtc_final_report_matrix.py --forward_GC 1; print matrix with forward alleles including genotype scores") +parser.add_argument("--plus", help="python gtc_final_report_matrix.py --top 1; print matrix with top alleles") +parser.add_argument("--plus_GC", help="python gtc_final_report_matrix.py --top_GC 1; print matrix with top alleles including genotype scores") args = parser.parse_args() From 37251d8600e2f3ef791e5b447cb0a02d70a6d949 Mon Sep 17 00:00:00 2001 From: mjung Date: Thu, 13 Jun 2019 13:50:58 -0700 Subject: [PATCH 4/5] parser update, removed duplication bug --- examples/gtc_final_report_matrix.py | 19 ++++++++++--------- module/GenotypeCalls.py | 29 ++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/gtc_final_report_matrix.py b/examples/gtc_final_report_matrix.py index 8e27d0e..98533bc 100644 --- a/examples/gtc_final_report_matrix.py +++ b/examples/gtc_final_report_matrix.py @@ -29,20 +29,21 @@ def build_dict(matrix = None, samplename = None, names = None, genotypes = None, return matrix delim = "\t" -NUM_ARGUMENTS = 6 +NUM_ARGUMENTS = 5 parser = argparse.ArgumentParser("Generate a final matrix report by specifying the report strand from a directory of GTC files") parser.add_argument("manifest", help="BPM manifest file") parser.add_argument("gtc_directory", help="Directory containing GTC files") parser.add_argument("output_file", help="Location to write report") -parser.add_argument("--forward", help="python gtc_final_report_matrix.py --forward 1; print matrix with forward alleles") -parser.add_argument("--forward_GC", help="python gtc_final_report_matrix.py --forward_GC 1; print matrix with forward alleles including genotype scores") -parser.add_argument("--top", help="python gtc_final_report_matrix.py --top 1; print matrix with top alleles") -parser.add_argument("--top_GC", help="python gtc_final_report_matrix.py --top_GC 1; print matrix with top alleles including genotype scores") -parser.add_argument("--AB", help="python gtc_final_report_matrix.py --forward 1; print matrix with forward alleles") -parser.add_argument("--AB_GC", help="python gtc_final_report_matrix.py --forward_GC 1; print matrix with forward alleles including genotype scores") -parser.add_argument("--plus", help="python gtc_final_report_matrix.py --top 1; print matrix with top alleles") -parser.add_argument("--plus_GC", help="python gtc_final_report_matrix.py --top_GC 1; print matrix with top alleles including genotype scores") +parser.add_argument("--forward", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward - print matrix with forward alleles") +parser.add_argument("--forward_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward_GC - print matrix with forward alleles including genotype scores") +parser.add_argument("--top", action="store_true", default=False, help="python gtc_final_report_matrix.py --top - print matrix with top alleles") +parser.add_argument("--top_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --top_GC - print matrix with top alleles including genotype scores") +parser.add_argument("--AB", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward - print matrix with forward alleles") +parser.add_argument("--AB_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward_GC - print matrix with forward alleles including genotype scores") +parser.add_argument("--plus", action="store_true", default=False, help="python gtc_final_report_matrix.py --top - print matrix with top alleles") +parser.add_argument("--plus_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --top_GC - print matrix with top alleles including genotype scores") + args = parser.parse_args() diff --git a/module/GenotypeCalls.py b/module/GenotypeCalls.py index 53e0f57..fb10a7f 100644 --- a/module/GenotypeCalls.py +++ b/module/GenotypeCalls.py @@ -333,23 +333,22 @@ def get_base_calls_generic(self, snps, strand_annotations, report_strand, unknow if len(genotypes) != len(strand_annotations): raise ValueError( "The number of reference strand annotations must match the number of loci in the GTC file") - result = [] - for (genotype, snp, strand_annotation) in zip(genotypes, snps, strand_annotations): - ab_genotype = code2genotype[genotype] - a_nucleotide = snp[1] - b_nucleotide = snp[-2] - if a_nucleotide == "N" or b_nucleotide == "N" or strand_annotation == unknown_annotation or ab_genotype == "NC" or ab_genotype == "NULL": - result.append("-") - else: - report_strand_nucleotides = [] - for ab_allele in ab_genotype: - nucleotide_allele = a_nucleotide if ab_allele == "A" else b_nucleotide - for report_strand in report_strands: + for report_strand in report_strands: + for (genotype, snp, strand_annotation) in zip(genotypes, snps, strand_annotations): + ab_genotype = code2genotype[genotype] + a_nucleotide = snp[1] + b_nucleotide = snp[-2] + if a_nucleotide == "N" or b_nucleotide == "N" or strand_annotation == unknown_annotation or ab_genotype == "NC" or ab_genotype == "NULL": + result.append("-") + else: + report_strand_nucleotides = [] + for ab_allele in ab_genotype: + nucleotide_allele = a_nucleotide if ab_allele == "A" else b_nucleotide report_strand_nucleotides.append( - nucleotide_allele if strand_annotation == report_strand else complement(nucleotide_allele) - ) - result.append("".join(report_strand_nucleotides)) + nucleotide_allele if strand_annotation == report_strand else complement(nucleotide_allele) + ) + result.append("".join(report_strand_nucleotides)) return result def get_base_calls_plus_strand(self, snps, ref_strand_annotations): From af03dda0e9d1f14acccdfb0ff1068b516f81988a Mon Sep 17 00:00:00 2001 From: mjung Date: Wed, 24 Jul 2019 10:03:34 -0700 Subject: [PATCH 5/5] another report example --- examples/gtc_custom_report.py | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 examples/gtc_custom_report.py diff --git a/examples/gtc_custom_report.py b/examples/gtc_custom_report.py new file mode 100644 index 0000000..e3f61b0 --- /dev/null +++ b/examples/gtc_custom_report.py @@ -0,0 +1,50 @@ +import sys +import argparse +import os +from datetime import datetime +#import GenotypeCalls +from IlluminaBeadArrayFiles import GenotypeCalls + +delim = "\t" + +parser = argparse.ArgumentParser("Generate a custom report from a directory of GTC files") +parser.add_argument("gtc_directory", help="Directory containing GTC files") +parser.add_argument("output_file", help="Location to write report") + +args = parser.parse_args() + +if os.path.isfile(args.output_file): + sys.stderr.write("Output file already exists, please delete and re-run\n") + sys.exit(-1) + +with open(args.output_file, "w") as output_handle: + output_handle.write("[Header]\n") + output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n") + samples = [] + for gtc_file in os.listdir(args.gtc_directory): + if gtc_file.lower().endswith(".gtc"): + samples.append(gtc_file) + index = 1 + output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n") + output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") + output_handle.write("[Data]\n") + output_handle.write(delim.join(["Index","Sample ID", "Call Rate", "p05 Grn", "p50 Grn", "p90 Grn", "p05 Red", "p50 Red", \ + "p90 Red","p10 GC", "p50 GC"]) + "\n") + for gtc_file in samples: + sys.stderr.write("Processing " + gtc_file + "\n") + gtc_file = os.path.join(args.gtc_directory, gtc_file) + gtc = GenotypeCalls.GenotypeCalls(gtc_file) + call_rate = gtc.get_call_rate() + percentiles_grn = gtc.get_percentiles_x() + p05_grn = percentiles_grn[0] + p50_grn = percentiles_grn[1] + p90_grn = percentiles_grn[2] + percentiles_red = gtc.get_percentiles_y() + p05_red = percentiles_red[0] + p50_red = percentiles_red[1] + p90_red = percentiles_red[2] + p10_GC = gtc.get_gc10() + p50_GC = gtc.get_gc50() + output_handle.write(delim.join([str(index), os.path.basename(gtc_file)[:-4], str(call_rate), str(p05_grn), \ + str(p50_grn), str(p90_grn), str(p05_red), str(p50_red), str(p90_red), str(p10_GC), str(p50_GC)]) + "\n") + index +=1