diff --git a/src/analysis/as_analysis_sc.py b/src/analysis/as_analysis_sc.py index 416daa6b..4efc6fa1 100644 --- a/src/analysis/as_analysis_sc.py +++ b/src/analysis/as_analysis_sc.py @@ -58,7 +58,6 @@ def adata_count_qc( adata.obs["index"] = adata.obs.index # Replace index column - # TODO add options to identify and filter GT errors if gt_error is not None: pass @@ -175,9 +174,6 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16 df_dict[group_name] = df - # Should I return something? - # Maybe compile all of the dataframes? - return df_dict @@ -281,7 +277,6 @@ def get_imbalance_per_group( group_results.append((region, snp_count, mu, null_ll, alt_ll, pval)) # Create allelic imbalance df - # Polars vs pandas?? df: pd.DataFrame = pd.DataFrame( group_results, columns=["region", "num_snps", "mu", "null_ll", "alt_ll", "pval"] ) diff --git a/src/analysis/compare_ai.py b/src/analysis/compare_ai.py index eb4259e5..60822c5e 100644 --- a/src/analysis/compare_ai.py +++ b/src/analysis/compare_ai.py @@ -19,7 +19,6 @@ logger = logging.getLogger(__name__) -# Use these functions to figure out how to optimize per group def get_imbalance_func( ref_count: NDArray[np.integer[Any]], n_count: NDArray[np.integer[Any]], @@ -105,7 +104,6 @@ def get_compared_imbalance( if sample is None: phased = False - # Should I be comparing all combos by default??? Seems like a lot if groups is None: groups = list(adata.var["group"].dropna().unique()) logger.info("Comparing all combinations of available groups") @@ -154,14 +152,12 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16 # process counts on a per group basis to avoid recalculating group_dict: dict[str, Any] = {} - # group_data = namedtuple("group_data", ["ref_counts", "n_counts", "phase_data", "region_snp_dict"]) # Maybe include the gt_array instead of min_idx group_data = namedtuple("group_data", ["ref_counts", "n_counts", "region_snp_df"]) for group_name in groups: # Subset by group adata_sub = adata[:, adata.var["group"] == group_name] - # Create count data per group, should i do pseudocount now or later? ref_counts_group = adata_sub.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount alt_counts_group = adata_sub.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount n_counts_group = ref_counts_group + alt_counts_group @@ -239,7 +235,6 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16 continue - # This sub function name kinda long...find better name maybe? df = compare_imbalance_between_groups( disp, ref_counts1, n_counts1, ref_counts2, n_counts2, region_snp_dict, gt_array ) @@ -368,7 +363,6 @@ def compare_imbalance_between_groups( # Add data to output list - # How should i format this, lots of possible outputs group_results.append( (region, len(snp_list), combined_mu, alt_mu1, alt_mu2, null_ll, alt_ll, pval) ) @@ -412,7 +406,6 @@ def get_compared_imbalance_diff_snps( if sample is None: phased = False - # Should I be comparing all combos by default??? Seems like a lot if groups is None: groups = list(adata.var["group"].dropna().unique()) logger.info("Comparing all combinations of available groups") @@ -469,13 +462,12 @@ def opt_disp_filt( group_dict: dict[str, Any] = {} group_data = namedtuple( "group_data", ["ref_counts", "n_counts", "phase_data", "region_snp_dict"] - ) # Maybe include the gt_array instead of min_idx + ) for group_name in groups: # Subset by group adata_sub = adata[:, adata.var["group"] == group_name] - # Create count data per group, should i do pseudocount now or later? ref_counts_group = adata_sub.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount alt_counts_group = adata_sub.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount n_counts_group = ref_counts_group + alt_counts_group @@ -519,8 +511,6 @@ def opt_disp_filt( df_dict: dict[tuple[str, str], pd.DataFrame] = {} for group1, group2 in group_combos: - # Might be smart to create a cache to prevent repeating calculations - # This sub function name kinda long...find better name maybe? df = compare_imbalance_between_groups_diff_snps( disp, *group_dict[group1], *group_dict[group2] ) @@ -632,7 +622,6 @@ def compare_imbalance_between_groups_diff_snps( # Add data to output list - # How should i format this, lots of possible outputs group_results.append( ( region, diff --git a/src/analysis/run_analysis.py b/src/analysis/run_analysis.py index 39bb2ee9..7dcfff19 100644 --- a/src/analysis/run_analysis.py +++ b/src/analysis/run_analysis.py @@ -59,10 +59,9 @@ def __init__( # User input data self.count_file = count_file self.region_col = region_col - self.groupby = groupby # group by region or parent? + self.groupby = groupby self.out_file = out_file - # TODO parse vcf for phased instead of default unphased self.phased: bool = bool(phased) # Default to single dispersion model @@ -108,15 +107,14 @@ def __init__( "Parent", "parent", }: - self.groupby = count_cols[region_idx + 1] # Set group + self.groupby = count_cols[region_idx + 1] else: - # Maybe throw error instead logger.warning("%s not found in columns %s", self.groupby, count_cols) self.groupby = None # Create default outfile if self.out_file is None: - self.out_file = str(Path.cwd() / "ai_results.tsv") # do this after + self.out_file = str(Path.cwd() / "ai_results.tsv") def run_ai_analysis( @@ -192,7 +190,6 @@ def run_ai_analysis( ) ai_df = pd.DataFrame(results) - # Maybe give option to sort or not sort by pval if "fdr_pval" in ai_df.columns: ai_df = ai_df.sort_values(by="fdr_pval", ascending=True) diff --git a/src/analysis/run_analysis_sc.py b/src/analysis/run_analysis_sc.py index 6eb3255a..3bf98f72 100644 --- a/src/analysis/run_analysis_sc.py +++ b/src/analysis/run_analysis_sc.py @@ -36,10 +36,9 @@ def __init__( self.model = model self.out_file = out_file self.phased = phased - self.z_cutoff = z_cutoff # Should i default to something like 4 or 5? + self.z_cutoff = z_cutoff # Default to single dispersion model - # TODO ADD GROUP DISP and other model types if (self.model is None) or (self.model not in {"single"}): self.model = "single" @@ -48,7 +47,6 @@ def __init__( self.min_count = 10 if self.pseudocount is None: - # self.pseudocount = 0 # either 0 or 1 for default self.pseudocount = 1 # Make sure min and pseudocounts are valid @@ -132,13 +130,10 @@ def process_adata_inputs( f"Please input a sample from list: {sample_list}" ) else: - # We gotta subset to include het genotypes now if not any(i in ["1|0", "0|1", "1/0", "0/1"] for i in adata.obs[sample].unique()): raise ValueError(f"Heterozygous genotypes not found for sample: {sample}.") - # adata = adata[adata.obs[sample].isin(['1|0', '0|1', '1/0', '0/1'])] - - # Using copy instead of view stops implicit mod warning, need to check memory usage + # Copy (not view) avoids implicit-modification warning downstream. adata = adata[adata.obs[sample].isin(["1|0", "0|1", "1/0", "0/1"])].copy() adata.obs = adata.obs.reset_index( drop=True @@ -234,14 +229,9 @@ def run_ai_analysis_sc( adata_inputs = process_adata_inputs(ad.read_h5ad(ai_files.adata_file), ai_files=ai_files) - # print(*vars(ai_files).items(), sep="\n") # For debugging - # print(adata_inputs) # For debugging - # Update class attributes ai_files.update_data(adata_inputs) - # adata = adata_inputs.adata # Hold parsed adata file obj in memory - # Prefilter and hold adata data in memory adata = adata_count_qc(adata_inputs.adata, z_cutoff=ai_files.z_cutoff, gt_error=None) diff --git a/src/analysis/run_compare_ai.py b/src/analysis/run_compare_ai.py index 70c2e645..463564b9 100644 --- a/src/analysis/run_compare_ai.py +++ b/src/analysis/run_compare_ai.py @@ -23,7 +23,6 @@ def run_ai_comparison( out_file: str | Path | None = None, z_cutoff: float | None = None, ) -> None: - # Might be smart to change some of the defaults in the class # Create data class that holds input data ai_files: WaspAnalysisSC = WaspAnalysisSC( adata_file=count_file, diff --git a/src/counting/count_alleles.py b/src/counting/count_alleles.py index c6e1ccee..e18ddd46 100644 --- a/src/counting/count_alleles.py +++ b/src/counting/count_alleles.py @@ -155,13 +155,10 @@ def make_count_df(bam_file: str, df: pl.DataFrame, use_rust: bool = True) -> pl. orient="row", ) - # possibly find better solution df = df.with_columns([pl.col("chrom").cast(chrom_enum)]).join( count_df, on=["chrom", "pos"], how="left" ) - # df = df.join(count_df, on=["chrom", "pos"], how="left") - return df diff --git a/src/counting/count_alleles_sc.py b/src/counting/count_alleles_sc.py index 104f8ef8..437b8b67 100644 --- a/src/counting/count_alleles_sc.py +++ b/src/counting/count_alleles_sc.py @@ -108,10 +108,8 @@ def make_count_matrix( AnnData object with count matrices in layers (ref, alt, other). """ chrom_list = df.get_column("chrom").unique(maintain_order=True) - # chrom_list = chrom_list[:3] # Testing purposes # Add genotypes annotations - # Maybe do this automatically and parse feature col instead? snp_df_cols = ["chrom", "pos", "ref", "alt"] if include_samples is not None: sample_cols = list(include_samples) @@ -121,7 +119,6 @@ def make_count_matrix( df = df.with_columns(pl.col("GT").alias(sample_name)) snp_df_cols.extend(sample_cols) - # Might be more memory efficient to use pandas index instead... snp_df = df.select(snp_df_cols).unique(maintain_order=True).with_row_index() sc_counts = CountStatsSC() # Class that holds total count data @@ -152,7 +149,6 @@ def make_count_matrix( ) # Create sparse matrices - # sparse array is recommended...but doesnt work with adata matrix_shape = (snp_df.shape[0], len(bc_dict)) sparse_ref = _sparse_from_counts(sc_counts.ref_count, matrix_shape) sparse_alt = _sparse_from_counts(sc_counts.alt_count, matrix_shape) @@ -164,8 +160,7 @@ def make_count_matrix( layers={"ref": sparse_ref, "alt": sparse_alt, "other": sparse_other}, ) - # Annotate adata: Figure out what to add to adata here vs later - adata.obs = snp_df.to_pandas() # Maybe just switch to pandas? Should i set no copy? + adata.obs = snp_df.to_pandas() adata.obs["ref_count"] = adata.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1 adata.obs["alt_count"] = adata.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1 diff --git a/src/counting/filter_variant_data.py b/src/counting/filter_variant_data.py index 43bfc21e..21768e7d 100644 --- a/src/counting/filter_variant_data.py +++ b/src/counting/filter_variant_data.py @@ -105,8 +105,6 @@ def gtf_to_bed( .select(["seqname", "start", "end", attribute]) ) - # TODO Extra validation and may want to return some data? - # Write to BED df.write_csv(out_bed, separator="\t", include_header=False) @@ -129,7 +127,6 @@ def intersect_vcf_region( out_file : str | Path Output intersection file path. """ - # Parse region file before or after??? intersect_cmd = ["bedtools", "intersect", "-a", str(vcf_file), "-b", str(region_file), "-wb"] # write intersect out diff --git a/src/counting/parse_gene_data.py b/src/counting/parse_gene_data.py index 682192c1..3df9ac8d 100644 --- a/src/counting/parse_gene_data.py +++ b/src/counting/parse_gene_data.py @@ -129,17 +129,10 @@ def parse_gene_file( elif df.get_column("attribute").str.contains("Name").any(): attribute = "Name" else: - # TODO return an error - # TODO maybe just use region or coords as a feature logger.warning("No 'ID', '%s_id' or 'Name' attribute found. Please include ID", feature) - # TODO: Figure out best way to handle parent attribute - # Parse parent attributes - # Figure out best way to parse and handle this if parent_attribute is None: - # Defaults to gene(possibly transcript???) - if df.get_column("attribute").str.contains("Parent").any(): parent_attribute = "Parent" elif df.get_column("attribute").str.contains("transcript_id").any(): @@ -149,7 +142,6 @@ def parse_gene_file( else: parent_attribute = attribute - # TODO: Allow for count output without parent column assert attribute is not None, "Could not determine attribute from gene file" assert parent_attribute is not None, "Could not determine parent_attribute from gene file" if parent_attribute == attribute: @@ -258,12 +250,8 @@ def parse_intersect_genes( } ) - # AFTER performing gtf_to_bed and intersecting! df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0) - # Should i poossibly consider diff number of cols? - - # Might want to do checks for wrong number of columns subset_cols = [df.columns[i] for i in [0, 2, 3, 4, -2, -1]] new_cols = ["chrom", "pos", "ref", "alt", attribute, parent_attribute] diff --git a/src/counting/run_counting.py b/src/counting/run_counting.py index 651fa479..196b23ed 100644 --- a/src/counting/run_counting.py +++ b/src/counting/run_counting.py @@ -115,7 +115,7 @@ def __init__( self.skip_vcf_to_bed = precomputed_vcf_bed is not None # Parse region file - self.region_type = None # maybe use a boolean flag instead + self.region_type = None if self.region_file is not None: f_ext = "".join(Path(self.region_file).suffixes) @@ -162,7 +162,6 @@ def __init__( ) self.skip_intersect = precomputed_intersect is not None - # TODO UPDATE THIS WHEN I ADD AUTOPARSERS if self.is_gene_file: # Possible edge case of vcf and gtf prefix conflict if self.vcf_bed == self.gtf_bed: @@ -249,15 +248,10 @@ def run_count_variants( precomputed_intersect=precomputed_intersect, ) - # print(*vars(count_files).items(), sep="\n") # For debugging with_gt = False if (count_files.samples is not None) and (len(count_files.samples) == 1): with_gt = True - # temporarily disable for ASE - # if not count_files.is_gene_file: - # with_gt = True - # Create Intermediary Files if not count_files.skip_vcf_to_bed: vcf_to_bed( @@ -268,7 +262,6 @@ def run_count_variants( include_indels=include_indels, ) - # TODO PARSE GENE FEATURES AND ATTRIBUTES region_col_name = None # Defaults to 'region' as region name intersect_genes = False @@ -276,7 +269,6 @@ def run_count_variants( if count_files.region_file is not None: # Check if we need to prepare genes for intersection if count_files.gtf_bed is not None: - # TODO UPDATE THIS WHEN I ADD AUTOPARSERS AND VALIDATORS gene_data = make_gene_data( gene_file=count_files.region_file, out_bed=count_files.gtf_bed, @@ -300,7 +292,6 @@ def run_count_variants( ) # Create Variant Dataframe - # TODO validate if intersect_genes: df = parse_intersect_genes_new( intersect_file=count_files.intersect_file, @@ -321,18 +312,9 @@ def run_count_variants( use_region_names=count_files.use_region_names, region_col=region_col_name, ) - # df = parse_intersect_region( - # intersect_file=count_files.intersect_file, - # use_region_names=count_files.use_region_names, - # region_col=region_col_name) - - # Should I include a filt bam step??? # Count count_df = make_count_df(bam_file=count_files.bam_file, df=df, use_rust=use_rust) # Write counts count_df.write_csv(count_files.out_file, include_header=True, separator="\t") - - # Should i return for use in analysis pipeline? - # return count_df diff --git a/src/counting/run_counting_sc.py b/src/counting/run_counting_sc.py index c9c4c20b..637df8aa 100644 --- a/src/counting/run_counting_sc.py +++ b/src/counting/run_counting_sc.py @@ -43,12 +43,10 @@ def __init__( out_file: str | None = None, temp_loc: str | None = None, ) -> None: - # TODO: ALSO ACCEPT .h5 - # User input files self.bam_file = bam_file self.variant_file = variant_file - self.barcode_file = barcode_file # Maybe could be optional? + self.barcode_file = barcode_file self.feature_file = feature_file self.use_region_names = use_region_names @@ -89,7 +87,7 @@ def __init__( self.vcf_bed = str(Path(self.temp_loc) / f"{variant_prefix}.bed") # Parse feature file - self.feature_type = None # maybe use a boolean flag instead + self.feature_type = None if self.feature_file is not None: f_ext = "".join(Path(self.feature_file).suffixes) @@ -158,7 +156,6 @@ def run_count_variants_sc( ) # Create intermediary files - # Maybe change include_gt based on preparse? vcf_to_bed( vcf_file=count_files.variant_file, out_bed=count_files.vcf_bed, @@ -191,7 +188,6 @@ def run_count_variants_sc( ad.AnnData().write_h5ad(count_files.out_file) return - # TODO: handle case where barcode file contains multiple columns with open(count_files.barcode_file) as file: bc_dict = {line.rstrip(): i for i, line in enumerate(file)} @@ -205,4 +201,3 @@ def run_count_variants_sc( # Write outputs adata.write_h5ad(count_files.out_file) - # TODO: include output options, (ie MTX, dense?) diff --git a/src/mapping/intersect_variant_data.py b/src/mapping/intersect_variant_data.py index 973965c1..db182539 100644 --- a/src/mapping/intersect_variant_data.py +++ b/src/mapping/intersect_variant_data.py @@ -217,9 +217,6 @@ def make_intersect_df( df = df.with_columns(expr_list).unnest([*samples, "read"]).with_columns(cast_list) - # should i remove instead of keep first? - df = df.unique( - ["chrom", "read", "mate", "start", "stop"], keep="first" - ) # Doesnt remove dup snp in pair? + df = df.unique(["chrom", "read", "mate", "start", "stop"], keep="first") return df.collect() diff --git a/src/mapping/remap_utils.py b/src/mapping/remap_utils.py index 416ebde7..f6b7af00 100644 --- a/src/mapping/remap_utils.py +++ b/src/mapping/remap_utils.py @@ -36,7 +36,6 @@ def paired_read_gen_stat( read_dict = {} discard_set = set() - # DO I need multiple iterators??? for read in bam.fetch(chrom, multiple_iterators=False): if not read.is_proper_pair: discard_set.add(read.query_name) diff --git a/src/mapping/run_mapping.py b/src/mapping/run_mapping.py index 6e76382b..b5bf5e7b 100644 --- a/src/mapping/run_mapping.py +++ b/src/mapping/run_mapping.py @@ -282,8 +282,6 @@ def run_make_remap_reads( temp_loc=temp_loc, ) - # print(*vars(wasp_files).items(), sep="\n") - # Create Checks for not integrated options if not wasp_files.is_paired: raise ValueError("Single-End not Implemented") @@ -300,7 +298,6 @@ def run_make_remap_reads( assert isinstance(wasp_files.samples, list), "samples should be normalized to list" assert wasp_files.remap_fq2 is not None, "remap_fq2 should be set when is_paired is True" - # Should I create cache that checks for premade files?? Path(str(wasp_files.out_dir)).mkdir(parents=True, exist_ok=True) # Create Intermediary Files @@ -329,10 +326,6 @@ def run_make_remap_reads( num_samples=len(wasp_files.samples), ) - # print("INTERSECTION COMPLETE") - - # If a tempdir already exists?? - # Create remap fq write_remap_bam( wasp_files.to_remap_bam, @@ -345,10 +338,7 @@ def run_make_remap_reads( max_seqs=max_seqs, ) - # print("WROTE READS TO BE REMAPPED") - wasp_files.write_data(out_file=out_json) # export json - # print(f"File Data written to JSON...\n{out_json}") # Decorator and Parser for post remap filtering diff --git a/src/mapping/wasp_data_files.py b/src/mapping/wasp_data_files.py index 97aa0a44..69cc3f38 100644 --- a/src/mapping/wasp_data_files.py +++ b/src/mapping/wasp_data_files.py @@ -55,14 +55,11 @@ def __init__( else: self.samples = [s.strip() for s in self.samples.split(",")] - # self.samples = self.samples.split(",") # should i strip spaces? # At this point, self.samples is normalized to Optional[List[str]] # Check if variant file is phased (only works for VCF/BCF, not PGEN) if self.is_phased is None and self.samples is not None: - # TODO GOTTA FIX THIS TO CHECK IF PHASED - # Note: This only works for VCF/BCF files, PGEN doesn't store phase in the same way variant_path = Path(self.variant_file) suffix = variant_path.suffix.lower() if suffix in (".vcf", ".bcf") or str(variant_path).lower().endswith(".vcf.gz"): @@ -79,23 +76,18 @@ def __init__( if n_checked > 0 and n_phased > n_checked // 2: self.is_phased = True else: - # TODO GOTTA WARN UNPHASED BAD - # TODO WARN SOME UNPHASED WHILE OTHERS PHASED self.is_phased = False else: # PGEN format - assume phased (user should specify if not) self.is_phased = True if self.out_dir is None: - self.out_dir = Path(bam_file).parent # change to cwd? + self.out_dir = Path(bam_file).parent - # TODO handle temp loc, maybe make default if temp not made? - # Temporary workaround until figure out temp dir options if self.temp_loc is None: self.temp_loc = self.out_dir # Generate intermediate files - # Maybe use easy defalt names if temp loc in use # Handle different variant file extensions for prefix extraction variant_name = Path(self.variant_file).name