Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions src/analysis/as_analysis_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def adata_count_qc(

adata.obs["index"] = adata.obs.index # Replace index column

# TODO add options to identify and filter GT errors
if gt_error is not None:
pass

Expand Down Expand Up @@ -175,9 +174,6 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16

df_dict[group_name] = df

# Should I return something?
# Maybe compile all of the dataframes?

return df_dict


Expand Down Expand Up @@ -281,7 +277,6 @@ def get_imbalance_per_group(
group_results.append((region, snp_count, mu, null_ll, alt_ll, pval))

# Create allelic imbalance df
# Polars vs pandas??
df: pd.DataFrame = pd.DataFrame(
group_results, columns=["region", "num_snps", "mu", "null_ll", "alt_ll", "pval"]
)
Expand Down
13 changes: 1 addition & 12 deletions src/analysis/compare_ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
logger = logging.getLogger(__name__)


# Use these functions to figure out how to optimize per group
def get_imbalance_func(
ref_count: NDArray[np.integer[Any]],
n_count: NDArray[np.integer[Any]],
Expand Down Expand Up @@ -105,7 +104,6 @@ def get_compared_imbalance(
if sample is None:
phased = False

# Should I be comparing all combos by default??? Seems like a lot
if groups is None:
groups = list(adata.var["group"].dropna().unique())
logger.info("Comparing all combinations of available groups")
Expand Down Expand Up @@ -154,14 +152,12 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16

# process counts on a per group basis to avoid recalculating
group_dict: dict[str, Any] = {}
# group_data = namedtuple("group_data", ["ref_counts", "n_counts", "phase_data", "region_snp_dict"]) # Maybe include the gt_array instead of min_idx
group_data = namedtuple("group_data", ["ref_counts", "n_counts", "region_snp_df"])

for group_name in groups:
# Subset by group
adata_sub = adata[:, adata.var["group"] == group_name]

# Create count data per group, should i do pseudocount now or later?
ref_counts_group = adata_sub.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
alt_counts_group = adata_sub.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
n_counts_group = ref_counts_group + alt_counts_group
Expand Down Expand Up @@ -239,7 +235,6 @@ def opt_disp(rho: float, ref_data: NDArray[np.uint16], n_data: NDArray[np.uint16

continue

# This sub function name kinda long...find better name maybe?
df = compare_imbalance_between_groups(
disp, ref_counts1, n_counts1, ref_counts2, n_counts2, region_snp_dict, gt_array
)
Expand Down Expand Up @@ -368,7 +363,6 @@ def compare_imbalance_between_groups(

# Add data to output list

# How should i format this, lots of possible outputs
group_results.append(
(region, len(snp_list), combined_mu, alt_mu1, alt_mu2, null_ll, alt_ll, pval)
)
Expand Down Expand Up @@ -412,7 +406,6 @@ def get_compared_imbalance_diff_snps(
if sample is None:
phased = False

# Should I be comparing all combos by default??? Seems like a lot
if groups is None:
groups = list(adata.var["group"].dropna().unique())
logger.info("Comparing all combinations of available groups")
Expand Down Expand Up @@ -469,13 +462,12 @@ def opt_disp_filt(
group_dict: dict[str, Any] = {}
group_data = namedtuple(
"group_data", ["ref_counts", "n_counts", "phase_data", "region_snp_dict"]
) # Maybe include the gt_array instead of min_idx
)

for group_name in groups:
# Subset by group
adata_sub = adata[:, adata.var["group"] == group_name]

# Create count data per group, should i do pseudocount now or later?
ref_counts_group = adata_sub.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
alt_counts_group = adata_sub.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1 + pseudocount
n_counts_group = ref_counts_group + alt_counts_group
Expand Down Expand Up @@ -519,8 +511,6 @@ def opt_disp_filt(

df_dict: dict[tuple[str, str], pd.DataFrame] = {}
for group1, group2 in group_combos:
# Might be smart to create a cache to prevent repeating calculations
# This sub function name kinda long...find better name maybe?
df = compare_imbalance_between_groups_diff_snps(
disp, *group_dict[group1], *group_dict[group2]
)
Expand Down Expand Up @@ -632,7 +622,6 @@ def compare_imbalance_between_groups_diff_snps(

# Add data to output list

# How should i format this, lots of possible outputs
group_results.append(
(
region,
Expand Down
9 changes: 3 additions & 6 deletions src/analysis/run_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,9 @@ def __init__(
# User input data
self.count_file = count_file
self.region_col = region_col
self.groupby = groupby # group by region or parent?
self.groupby = groupby
self.out_file = out_file

# TODO parse vcf for phased instead of default unphased
self.phased: bool = bool(phased)

# Default to single dispersion model
Expand Down Expand Up @@ -108,15 +107,14 @@ def __init__(
"Parent",
"parent",
}:
self.groupby = count_cols[region_idx + 1] # Set group
self.groupby = count_cols[region_idx + 1]
else:
# Maybe throw error instead
logger.warning("%s not found in columns %s", self.groupby, count_cols)
self.groupby = None

# Create default outfile
if self.out_file is None:
self.out_file = str(Path.cwd() / "ai_results.tsv") # do this after
self.out_file = str(Path.cwd() / "ai_results.tsv")


def run_ai_analysis(
Expand Down Expand Up @@ -192,7 +190,6 @@ def run_ai_analysis(
)
ai_df = pd.DataFrame(results)

# Maybe give option to sort or not sort by pval
if "fdr_pval" in ai_df.columns:
ai_df = ai_df.sort_values(by="fdr_pval", ascending=True)

Expand Down
14 changes: 2 additions & 12 deletions src/analysis/run_analysis_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,9 @@ def __init__(
self.model = model
self.out_file = out_file
self.phased = phased
self.z_cutoff = z_cutoff # Should i default to something like 4 or 5?
self.z_cutoff = z_cutoff

# Default to single dispersion model
# TODO ADD GROUP DISP and other model types
if (self.model is None) or (self.model not in {"single"}):
self.model = "single"

Expand All @@ -48,7 +47,6 @@ def __init__(
self.min_count = 10

if self.pseudocount is None:
# self.pseudocount = 0 # either 0 or 1 for default
self.pseudocount = 1

# Make sure min and pseudocounts are valid
Expand Down Expand Up @@ -132,13 +130,10 @@ def process_adata_inputs(
f"Please input a sample from list: {sample_list}"
)
else:
# We gotta subset to include het genotypes now
if not any(i in ["1|0", "0|1", "1/0", "0/1"] for i in adata.obs[sample].unique()):
raise ValueError(f"Heterozygous genotypes not found for sample: {sample}.")

# adata = adata[adata.obs[sample].isin(['1|0', '0|1', '1/0', '0/1'])]

# Using copy instead of view stops implicit mod warning, need to check memory usage
# Copy (not view) avoids implicit-modification warning downstream.
adata = adata[adata.obs[sample].isin(["1|0", "0|1", "1/0", "0/1"])].copy()
adata.obs = adata.obs.reset_index(
drop=True
Expand Down Expand Up @@ -234,14 +229,9 @@ def run_ai_analysis_sc(

adata_inputs = process_adata_inputs(ad.read_h5ad(ai_files.adata_file), ai_files=ai_files)

# print(*vars(ai_files).items(), sep="\n") # For debugging
# print(adata_inputs) # For debugging

# Update class attributes
ai_files.update_data(adata_inputs)

# adata = adata_inputs.adata # Hold parsed adata file obj in memory

# Prefilter and hold adata data in memory
adata = adata_count_qc(adata_inputs.adata, z_cutoff=ai_files.z_cutoff, gt_error=None)

Expand Down
1 change: 0 additions & 1 deletion src/analysis/run_compare_ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def run_ai_comparison(
out_file: str | Path | None = None,
z_cutoff: float | None = None,
) -> None:
# Might be smart to change some of the defaults in the class
# Create data class that holds input data
ai_files: WaspAnalysisSC = WaspAnalysisSC(
adata_file=count_file,
Expand Down
3 changes: 0 additions & 3 deletions src/counting/count_alleles.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,10 @@ def make_count_df(bam_file: str, df: pl.DataFrame, use_rust: bool = True) -> pl.
orient="row",
)

# possibly find better solution
df = df.with_columns([pl.col("chrom").cast(chrom_enum)]).join(
count_df, on=["chrom", "pos"], how="left"
)

# df = df.join(count_df, on=["chrom", "pos"], how="left")

return df


Expand Down
7 changes: 1 addition & 6 deletions src/counting/count_alleles_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,8 @@ def make_count_matrix(
AnnData object with count matrices in layers (ref, alt, other).
"""
chrom_list = df.get_column("chrom").unique(maintain_order=True)
# chrom_list = chrom_list[:3] # Testing purposes

# Add genotypes annotations
# Maybe do this automatically and parse feature col instead?
snp_df_cols = ["chrom", "pos", "ref", "alt"]
if include_samples is not None:
sample_cols = list(include_samples)
Expand All @@ -121,7 +119,6 @@ def make_count_matrix(
df = df.with_columns(pl.col("GT").alias(sample_name))
snp_df_cols.extend(sample_cols)

# Might be more memory efficient to use pandas index instead...
snp_df = df.select(snp_df_cols).unique(maintain_order=True).with_row_index()

sc_counts = CountStatsSC() # Class that holds total count data
Expand Down Expand Up @@ -152,7 +149,6 @@ def make_count_matrix(
)

# Create sparse matrices
# sparse array is recommended...but doesnt work with adata
matrix_shape = (snp_df.shape[0], len(bc_dict))
sparse_ref = _sparse_from_counts(sc_counts.ref_count, matrix_shape)
sparse_alt = _sparse_from_counts(sc_counts.alt_count, matrix_shape)
Expand All @@ -164,8 +160,7 @@ def make_count_matrix(
layers={"ref": sparse_ref, "alt": sparse_alt, "other": sparse_other},
)

# Annotate adata: Figure out what to add to adata here vs later
adata.obs = snp_df.to_pandas() # Maybe just switch to pandas? Should i set no copy?
adata.obs = snp_df.to_pandas()
adata.obs["ref_count"] = adata.layers["ref"].sum(axis=1, dtype=np.uint16).T.A1
adata.obs["alt_count"] = adata.layers["alt"].sum(axis=1, dtype=np.uint16).T.A1

Expand Down
3 changes: 0 additions & 3 deletions src/counting/filter_variant_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,6 @@ def gtf_to_bed(
.select(["seqname", "start", "end", attribute])
)

# TODO Extra validation and may want to return some data?

# Write to BED
df.write_csv(out_bed, separator="\t", include_header=False)

Expand All @@ -129,7 +127,6 @@ def intersect_vcf_region(
out_file : str | Path
Output intersection file path.
"""
# Parse region file before or after???
intersect_cmd = ["bedtools", "intersect", "-a", str(vcf_file), "-b", str(region_file), "-wb"]

# write intersect out
Expand Down
12 changes: 0 additions & 12 deletions src/counting/parse_gene_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,17 +129,10 @@ def parse_gene_file(
elif df.get_column("attribute").str.contains("Name").any():
attribute = "Name"
else:
# TODO return an error
# TODO maybe just use region or coords as a feature
logger.warning("No 'ID', '%s_id' or 'Name' attribute found. Please include ID", feature)

# TODO: Figure out best way to handle parent attribute

# Parse parent attributes
# Figure out best way to parse and handle this
if parent_attribute is None:
# Defaults to gene(possibly transcript???)

if df.get_column("attribute").str.contains("Parent").any():
parent_attribute = "Parent"
elif df.get_column("attribute").str.contains("transcript_id").any():
Expand All @@ -149,7 +142,6 @@ def parse_gene_file(
else:
parent_attribute = attribute

# TODO: Allow for count output without parent column
assert attribute is not None, "Could not determine attribute from gene file"
assert parent_attribute is not None, "Could not determine parent_attribute from gene file"
if parent_attribute == attribute:
Expand Down Expand Up @@ -258,12 +250,8 @@ def parse_intersect_genes(
}
)

# AFTER performing gtf_to_bed and intersecting!
df = pl.scan_csv(intersect_file, separator="\t", has_header=False, infer_schema_length=0)

# Should i poossibly consider diff number of cols?

# Might want to do checks for wrong number of columns
subset_cols = [df.columns[i] for i in [0, 2, 3, 4, -2, -1]]
new_cols = ["chrom", "pos", "ref", "alt", attribute, parent_attribute]

Expand Down
20 changes: 1 addition & 19 deletions src/counting/run_counting.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def __init__(
self.skip_vcf_to_bed = precomputed_vcf_bed is not None

# Parse region file
self.region_type = None # maybe use a boolean flag instead
self.region_type = None

if self.region_file is not None:
f_ext = "".join(Path(self.region_file).suffixes)
Expand Down Expand Up @@ -162,7 +162,6 @@ def __init__(
)
self.skip_intersect = precomputed_intersect is not None

# TODO UPDATE THIS WHEN I ADD AUTOPARSERS
if self.is_gene_file:
# Possible edge case of vcf and gtf prefix conflict
if self.vcf_bed == self.gtf_bed:
Expand Down Expand Up @@ -249,15 +248,10 @@ def run_count_variants(
precomputed_intersect=precomputed_intersect,
)

# print(*vars(count_files).items(), sep="\n") # For debugging
with_gt = False
if (count_files.samples is not None) and (len(count_files.samples) == 1):
with_gt = True

# temporarily disable for ASE
# if not count_files.is_gene_file:
# with_gt = True

# Create Intermediary Files
if not count_files.skip_vcf_to_bed:
vcf_to_bed(
Expand All @@ -268,15 +262,13 @@ def run_count_variants(
include_indels=include_indels,
)

# TODO PARSE GENE FEATURES AND ATTRIBUTES
region_col_name = None # Defaults to 'region' as region name
intersect_genes = False

# region_files is valid to perform intersects
if count_files.region_file is not None:
# Check if we need to prepare genes for intersection
if count_files.gtf_bed is not None:
# TODO UPDATE THIS WHEN I ADD AUTOPARSERS AND VALIDATORS
gene_data = make_gene_data(
gene_file=count_files.region_file,
out_bed=count_files.gtf_bed,
Expand All @@ -300,7 +292,6 @@ def run_count_variants(
)

# Create Variant Dataframe
# TODO validate
if intersect_genes:
df = parse_intersect_genes_new(
intersect_file=count_files.intersect_file,
Expand All @@ -321,18 +312,9 @@ def run_count_variants(
use_region_names=count_files.use_region_names,
region_col=region_col_name,
)
# df = parse_intersect_region(
# intersect_file=count_files.intersect_file,
# use_region_names=count_files.use_region_names,
# region_col=region_col_name)

# Should I include a filt bam step???

# Count
count_df = make_count_df(bam_file=count_files.bam_file, df=df, use_rust=use_rust)

# Write counts
count_df.write_csv(count_files.out_file, include_header=True, separator="\t")

# Should i return for use in analysis pipeline?
# return count_df
Loading
Loading