mcvickerlab · Jaureguy760 · Apr 19, 2026 · Apr 19, 2026
@@ -58,7 +58,6 @@ def adata_count_qc(
 
         adata.obs["index"] = adata.obs.index  # Replace index column
 
-    # TODO add options to identify and filter GT errors
     if gt_error is not None:
         pass
 
@@ -175,9 +174,6 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16
 
         df_dict[group_name] = df
 
-    # Should I return something?
-    # Maybe compile all of the dataframes?
-
     return df_dict
 
 
@@ -281,7 +277,6 @@ def get_imbalance_per_group(
         group_results.append((region, snp_count, mu, null_ll, alt_ll, pval))
 
     # Create allelic imbalance df
-    # Polars vs pandas??
     df: pd.DataFrame = pd.DataFrame(
         group_results, columns=["region", "num_snps", "mu", "null_ll", "alt_ll", "pval"]
     )

@@ -19,7 +19,6 @@
 logger = logging.getLogger(__name__)
 
 
-# Use these functions to figure out how to optimize per group
 def get_imbalance_func(
     ref_count: NDArray[np.integer[Any]],
     n_count: NDArray[np.integer[Any]],
@@ -105,7 +104,6 @@ def get_compared_imbalance(
     if sample is None:
         phased = False
 
-    # Should I be comparing all combos by default??? Seems like a lot
     if groups is None:
         groups = list(adata.var["group"].dropna().unique())
         logger.info("Comparing all combinations of available groups")
@@ -154,14 +152,12 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16
 
     # process counts on a per group basis to avoid recalculating
     group_dict: dict[str, Any] = {}
-    # group_data = namedtuple("group_data", ["ref_counts", "n_counts", "phase_data", "region_snp_dict"]) # Maybe include the gt_array instead of min_idx
     group_data = namedtuple("group_data", ["ref_counts", "n_counts", "region_snp_df"])
 
     for group_name in groups:
         # Subset by group
         adata_sub = adata[:, adata.var["group"] == group_name]
 
-        # Create count data per group, should i do pseudocount now or later?
         ref_counts_group = adata_sub.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
         alt_counts_group = adata_sub.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
         n_counts_group = ref_counts_group + alt_counts_group
@@ -239,7 +235,6 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16
 
             continue
 
-        # This sub function name kinda long...find better name maybe?
         df = compare_imbalance_between_groups(
             disp, ref_counts1, n_counts1, ref_counts2, n_counts2, region_snp_dict, gt_array
         )
@@ -368,7 +363,6 @@ def compare_imbalance_between_groups(
 
         # Add data to output list
 
-        # How should i format this, lots of possible outputs
         group_results.append(
             (region, len(snp_list), combined_mu, alt_mu1, alt_mu2, null_ll, alt_ll, pval)
         )
@@ -412,7 +406,6 @@ def get_compared_imbalance_diff_snps(
     if sample is None:
         phased = False
 
-    # Should I be comparing all combos by default??? Seems like a lot
     if groups is None:
         groups = list(adata.var["group"].dropna().unique())
         logger.info("Comparing all combinations of available groups")
@@ -469,13 +462,12 @@ def opt_disp_filt(
     group_dict: dict[str, Any] = {}
     group_data = namedtuple(
         "group_data", ["ref_counts", "n_counts", "phase_data", "region_snp_dict"]
-    )  # Maybe include the gt_array instead of min_idx
+    )
 
     for group_name in groups:
         # Subset by group
         adata_sub = adata[:, adata.var["group"] == group_name]
 
-        # Create count data per group, should i do pseudocount now or later?
         ref_counts_group = adata_sub.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
         alt_counts_group = adata_sub.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
         n_counts_group = ref_counts_group + alt_counts_group
@@ -519,8 +511,6 @@ def opt_disp_filt(
 
     df_dict: dict[tuple[str, str], pd.DataFrame] = {}
     for group1, group2 in group_combos:
-        # Might be smart to create a cache to prevent repeating calculations
-        # This sub function name kinda long...find better name maybe?
         df = compare_imbalance_between_groups_diff_snps(
             disp, *group_dict[group1], *group_dict[group2]
         )
@@ -632,7 +622,6 @@ def compare_imbalance_between_groups_diff_snps(
 
         # Add data to output list
 
-        # How should i format this, lots of possible outputs
         group_results.append(
             (
                 region,

@@ -59,10 +59,9 @@ def __init__(
         # User input data
         self.count_file = count_file
         self.region_col = region_col
-        self.groupby = groupby  # group by region or parent?
+        self.groupby = groupby
         self.out_file = out_file
 
-        # TODO parse vcf for phased instead of default unphased
         self.phased: bool = bool(phased)
 
         # Default to single dispersion model
@@ -108,15 +107,14 @@ def __init__(
                 "Parent",
                 "parent",
             }:
-                self.groupby = count_cols[region_idx + 1]  # Set group
+                self.groupby = count_cols[region_idx + 1]
             else:
-                # Maybe throw error instead
                 logger.warning("%s not found in columns %s", self.groupby, count_cols)
                 self.groupby = None
 
         # Create default outfile
         if self.out_file is None:
-            self.out_file = str(Path.cwd() / "ai_results.tsv")  # do this after
+            self.out_file = str(Path.cwd() / "ai_results.tsv")
 
 
 def run_ai_analysis(
@@ -192,7 +190,6 @@ def run_ai_analysis(
     )
     ai_df = pd.DataFrame(results)
 
-    # Maybe give option to sort or not sort by pval
     if "fdr_pval" in ai_df.columns:
         ai_df = ai_df.sort_values(by="fdr_pval", ascending=True)
 

@@ -36,10 +36,9 @@ def __init__(
         self.model = model
         self.out_file = out_file
         self.phased = phased
-        self.z_cutoff = z_cutoff  # Should i default to something like 4 or 5?
+        self.z_cutoff = z_cutoff
 
         # Default to single dispersion model
-        # TODO ADD GROUP DISP and other model types
         if (self.model is None) or (self.model not in {"single"}):
             self.model = "single"
 
@@ -48,7 +47,6 @@ def __init__(
             self.min_count = 10
 
         if self.pseudocount is None:
-            # self.pseudocount = 0 # either 0 or 1 for default
             self.pseudocount = 1
 
         # Make sure min and pseudocounts are valid
@@ -132,13 +130,10 @@ def process_adata_inputs(
                 f"Please input a sample from list: {sample_list}"
             )
         else:
-            # We gotta subset to include het genotypes now
             if not any(i in ["1|0", "0|1", "1/0", "0/1"] for i in adata.obs[sample].unique()):
                 raise ValueError(f"Heterozygous genotypes not found for sample: {sample}.")
 
-            # adata = adata[adata.obs[sample].isin(['1|0', '0|1', '1/0', '0/1'])]
-
-            # Using copy instead of view stops implicit mod warning, need to check memory usage
+            # Copy (not view) avoids implicit-modification warning downstream.
             adata = adata[adata.obs[sample].isin(["1|0", "0|1", "1/0", "0/1"])].copy()
             adata.obs = adata.obs.reset_index(
                 drop=True
@@ -234,14 +229,9 @@ def run_ai_analysis_sc(
 
     adata_inputs = process_adata_inputs(ad.read_h5ad(ai_files.adata_file), ai_files=ai_files)
 
-    # print(*vars(ai_files).items(), sep="\n") # For debugging
-    # print(adata_inputs) # For debugging
-
     # Update class attributes
     ai_files.update_data(adata_inputs)
 
-    # adata = adata_inputs.adata # Hold parsed adata file obj in memory
-
     # Prefilter and hold adata data in memory
     adata = adata_count_qc(adata_inputs.adata, z_cutoff=ai_files.z_cutoff, gt_error=None)
 

@@ -23,7 +23,6 @@ def run_ai_comparison(
     out_file: str | Path | None = None,
     z_cutoff: float | None = None,
 ) -> None:
-    # Might be smart to change some of the defaults in the class
     # Create data class that holds input data
     ai_files: WaspAnalysisSC = WaspAnalysisSC(
         adata_file=count_file,

diff --git a/src/counting/count_alleles.py b/src/counting/count_alleles.py
@@ -155,13 +155,10 @@ def make_count_df(bam_file: str, df: pl.DataFrame, use_rust: bool = True) -> pl.
         orient="row",
     )
 
-    # possibly find better solution
     df = df.with_columns([pl.col("chrom").cast(chrom_enum)]).join(
         count_df, on=["chrom", "pos"], how="left"
     )
 
-    # df = df.join(count_df, on=["chrom", "pos"], how="left")
-
     return df
 
 

diff --git a/src/counting/count_alleles_sc.py b/src/counting/count_alleles_sc.py
@@ -108,10 +108,8 @@ def make_count_matrix(
         AnnData object with count matrices in layers (ref, alt, other).
     """
     chrom_list = df.get_column("chrom").unique(maintain_order=True)
-    # chrom_list = chrom_list[:3] # Testing purposes
 
     # Add genotypes annotations
-    # Maybe do this automatically and parse feature col instead?
     snp_df_cols = ["chrom", "pos", "ref", "alt"]
     if include_samples is not None:
         sample_cols = list(include_samples)
@@ -121,7 +119,6 @@ def make_count_matrix(
             df = df.with_columns(pl.col("GT").alias(sample_name))
         snp_df_cols.extend(sample_cols)
 
-    # Might be more memory efficient to use pandas index instead...
     snp_df = df.select(snp_df_cols).unique(maintain_order=True).with_row_index()
 
     sc_counts = CountStatsSC()  # Class that holds total count data
@@ -152,7 +149,6 @@ def make_count_matrix(
                 )
 
     # Create sparse matrices
-    # sparse array is recommended...but doesnt work with adata
     matrix_shape = (snp_df.shape[0], len(bc_dict))
     sparse_ref = _sparse_from_counts(sc_counts.ref_count, matrix_shape)
     sparse_alt = _sparse_from_counts(sc_counts.alt_count, matrix_shape)
@@ -164,8 +160,7 @@ def make_count_matrix(
         layers={"ref": sparse_ref, "alt": sparse_alt, "other": sparse_other},
     )
 
-    # Annotate adata: Figure out what to add to adata here vs later
-    adata.obs = snp_df.to_pandas()  # Maybe just switch to pandas? Should i set no copy?
+    adata.obs = snp_df.to_pandas()
     adata.obs["ref_count"] = adata.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1
     adata.obs["alt_count"] = adata.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1
 

diff --git a/src/counting/filter_variant_data.py b/src/counting/filter_variant_data.py
@@ -105,8 +105,6 @@ def gtf_to_bed(
         .select(["seqname", "start", "end", attribute])
     )
 
-    # TODO Extra validation and may want to return some data?
-
     # Write to BED
     df.write_csv(out_bed, separator="\t", include_header=False)
 
@@ -129,7 +127,6 @@ def intersect_vcf_region(
     out_file : str | Path
         Output intersection file path.
     """
-    # Parse region file before or after???
     intersect_cmd = ["bedtools", "intersect", "-a", str(vcf_file), "-b", str(region_file), "-wb"]
 
     # write intersect out

diff --git a/src/counting/parse_gene_data.py b/src/counting/parse_gene_data.py
@@ -129,17 +129,10 @@ def parse_gene_file(
         elif df.get_column("attribute").str.contains("Name").any():
             attribute = "Name"
         else:
-            # TODO return an error
-            # TODO maybe just use region or coords as a feature
             logger.warning("No 'ID', '%s_id' or 'Name' attribute found. Please include ID", feature)
 
-    # TODO: Figure out best way to handle parent attribute
-
     # Parse parent attributes
-    # Figure out best way to parse and handle this
     if parent_attribute is None:
-        # Defaults to gene(possibly transcript???)
-
         if df.get_column("attribute").str.contains("Parent").any():
             parent_attribute = "Parent"
         elif df.get_column("attribute").str.contains("transcript_id").any():
@@ -149,7 +142,6 @@ def parse_gene_file(
         else:
             parent_attribute = attribute
 
-    # TODO: Allow for count output without parent column
     assert attribute is not None, "Could not determine attribute from gene file"
     assert parent_attribute is not None, "Could not determine parent_attribute from gene file"
     if parent_attribute == attribute:
@@ -258,12 +250,8 @@ def parse_intersect_genes(
             }
         )
 
-    # AFTER performing gtf_to_bed and intersecting!
     df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)
 
-    # Should i poossibly consider diff number of cols?
-
-    # Might want to do checks for wrong number of columns
     subset_cols = [df.columns[i] for i in [0, 2, 3, 4, -2, -1]]
     new_cols = ["chrom", "pos", "ref", "alt", attribute, parent_attribute]
 

diff --git a/src/counting/run_counting.py b/src/counting/run_counting.py
@@ -115,7 +115,7 @@ def __init__(
         self.skip_vcf_to_bed = precomputed_vcf_bed is not None
 
         # Parse region file
-        self.region_type = None  # maybe use a boolean flag instead
+        self.region_type = None
 
         if self.region_file is not None:
             f_ext = "".join(Path(self.region_file).suffixes)
@@ -162,7 +162,6 @@ def __init__(
             )
         self.skip_intersect = precomputed_intersect is not None
 
-        # TODO UPDATE THIS WHEN I ADD AUTOPARSERS
         if self.is_gene_file:
             # Possible edge case of vcf and gtf prefix conflict
             if self.vcf_bed == self.gtf_bed:
@@ -249,15 +248,10 @@ def run_count_variants(
         precomputed_intersect=precomputed_intersect,
     )
 
-    # print(*vars(count_files).items(), sep="\n") # For debugging
     with_gt = False
     if (count_files.samples is not None) and (len(count_files.samples) == 1):
         with_gt = True
 
-        # temporarily disable for ASE
-        # if not count_files.is_gene_file:
-        #     with_gt = True
-
     # Create Intermediary Files
     if not count_files.skip_vcf_to_bed:
         vcf_to_bed(
@@ -268,15 +262,13 @@ def run_count_variants(
             include_indels=include_indels,
         )
 
-    # TODO PARSE GENE FEATURES AND ATTRIBUTES
     region_col_name = None  # Defaults to 'region' as region name
     intersect_genes = False
 
     # region_files is valid to perform intersects
     if count_files.region_file is not None:
         # Check if we need to prepare genes for intersection
         if count_files.gtf_bed is not None:
-            # TODO UPDATE THIS WHEN I ADD AUTOPARSERS AND VALIDATORS
             gene_data = make_gene_data(
                 gene_file=count_files.region_file,
                 out_bed=count_files.gtf_bed,
@@ -300,7 +292,6 @@ def run_count_variants(
             )
 
     # Create Variant Dataframe
-    # TODO validate
     if intersect_genes:
         df = parse_intersect_genes_new(
             intersect_file=count_files.intersect_file,
@@ -321,18 +312,9 @@ def run_count_variants(
             use_region_names=count_files.use_region_names,
             region_col=region_col_name,
         )
-        # df = parse_intersect_region(
-        #     intersect_file=count_files.intersect_file,
-        #     use_region_names=count_files.use_region_names,
-        #     region_col=region_col_name)
-
-    # Should I include a filt bam step???
 
     # Count
     count_df = make_count_df(bam_file=count_files.bam_file, df=df, use_rust=use_rust)
 
     # Write counts
     count_df.write_csv(count_files.out_file, include_header=True, separator="\t")
-
-    # Should i return for use in analysis pipeline?
-    # return count_df