diff --git a/nextflow/modules/jabs_classifiers.nf b/nextflow/modules/jabs_classifiers.nf index d53520b..ae24b85 100644 --- a/nextflow/modules/jabs_classifiers.nf +++ b/nextflow/modules/jabs_classifiers.nf @@ -176,14 +176,14 @@ process BEHAVIOR_TABLE_TO_FEATURES { label "r_jabs_table_convert" input: - tuple path(in_summary_table), val(bin_size) + tuple path(in_summary_table), val(bin_size), val(prev_bin_size) output: path("${in_summary_table.baseName}_features_${bin_size}.csv"), emit: features script: """ - python3 ${params.support_code_dir}/behavior_summaries.py -f ${in_summary_table} -b ${bin_size} -o "${in_summary_table.baseName}_features_${bin_size}.csv" + python3 ${params.support_code_dir}/behavior_summaries.py -f ${in_summary_table} -b ${bin_size} -p ${prev_bin_size} -o "${in_summary_table.baseName}_features_${bin_size}.csv" """ } diff --git a/nextflow/workflows/feature_generation.nf b/nextflow/workflows/feature_generation.nf index c8eb1c6..eb0a2b1 100644 --- a/nextflow/workflows/feature_generation.nf +++ b/nextflow/workflows/feature_generation.nf @@ -131,12 +131,20 @@ workflow SINGLE_MOUSE_V6_FEATURES { .collect() merged_bout_tables = AGGREGATE_BOUT_TABLES(all_bout_tables).merged_bout_tables + // Compute incremental bin pairs: [bin_size, prev_bin_size] + // Each bin_size is paired with the previous feature_bin so that latency + // features describe only the incremental time window. + sorted_bins = params.feature_bins.sort() + bin_pairs = sorted_bins.withIndex().collect { bin_size, idx -> + [bin_size, idx == 0 ? 
0 : sorted_bins[idx - 1]] + } + // Combine table data into feature file all_summary_tables = heuristic_tables .concat(classifier_tables) .map { bout_table, summary_table -> summary_table } .flatten() - .combine(params.feature_bins) + .combine(Channel.fromList(bin_pairs)) individual_behavior_features = BEHAVIOR_TABLE_TO_FEATURES(all_summary_tables) // Features are named columns (wide) split across multiple files // Transform them into long format so that we can row-concat without sorting diff --git a/support_code/behavior_summaries.py b/support_code/behavior_summaries.py index a24a26f..0d9c06a 100644 --- a/support_code/behavior_summaries.py +++ b/support_code/behavior_summaries.py @@ -34,6 +34,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "-o", "--output", type=str, required=True, help="output file name" ) + parser.add_argument( + "-p", + "--prev_bin_size", + type=int, + default=0, + help="previous bin size (rows to skip for incremental latency features)", + ) return parser.parse_args() @@ -115,7 +122,7 @@ def get_columns_to_exclude(behavior: str) -> list: def aggregate_data_by_bin_size( - data: pd.DataFrame, bin_size: int, behavior: str + data: pd.DataFrame, bin_size: int, behavior: str, prev_bin_size: int = 0 ) -> pd.DataFrame: """Aggregate data by bin size. @@ -123,6 +130,9 @@ def aggregate_data_by_bin_size( data: Preprocessed dataframe. bin_size: Number of bins to aggregate. behavior: Behavior name. + prev_bin_size: Previous bin size; rows before this index are excluded + from incremental features (latency). Sum features and avg_bout_length + remain cumulative from bin 0. Returns: pd.DataFrame: Aggregated dataframe. @@ -131,6 +141,40 @@ def aggregate_data_by_bin_size( grouped = data.groupby("MouseID") filtered_data = pd.concat([group.iloc[:bin_size] for _, group in grouped]) + # Incremental slice: only the "new" bins for latency features. 
+ # E.g., with feature_bins=[1,4], bin_size=4, prev_bin_size=1: + # filtered_data has bins 0-3 (0-20min), incremental has bins 1-3 (5-20min) + incremental_data = pd.concat( + [ + group.iloc[prev_bin_size:bin_size] + for _, group in filtered_data.groupby("MouseID") + ] + ) + + # Latency: first()/last() skip NaN within the incremental window. + # For a single-bin window, returns that bin's value or NaN. + # For a multi-bin window, returns first/last non-NaN, or NaN if all are NaN. + latency_first_col = f"{behavior}_latency_to_first_prediction" + latency_last_col = f"{behavior}_latency_to_last_prediction" + latency_first = incremental_data.groupby("MouseID")[latency_first_col].first() + latency_last = incremental_data.groupby("MouseID")[latency_last_col].last() + + # Avg bout length: cumulative weighted average across ALL bins (0 to bin_size), + # matching the semantics of sum features. + avg_bout_dur_col = f"{behavior}_avg_bout_duration" + sample_count_col = f"{behavior}__stats_sample_count" + + def _weighted_avg_bout(group): + mask = group[sample_count_col] > 0 + if not mask.any(): + return np.nan + return np.average( + group.loc[mask, avg_bout_dur_col], + weights=group.loc[mask, sample_count_col], + ) + + avg_bout_length = filtered_data.groupby("MouseID").apply(_weighted_avg_bout) + # Aggregate numeric columns by summing them numeric_cols = filtered_data.select_dtypes(include=["number"]).columns aggregated = filtered_data.groupby("MouseID")[numeric_cols].sum() @@ -170,21 +214,14 @@ def aggregate_data_by_bin_size( behavior_bout_col ] - # Additional stats - if np.sum(aggregated[f"{behavior}__stats_sample_count"]) == 0: - aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = np.nan - else: - aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = np.average( - aggregated[f"{behavior}_avg_bout_duration"], - weights=aggregated[f"{behavior}__stats_sample_count"], - ) + aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = 
avg_bout_length # TODO: var and std need to be aggregated across bins. # This is non-trivial because of the partial bouts and their associated weights. aggregated[f"bin_first_{bin_size * 5}.{behavior}_latency_first_prediction"] = ( - aggregated[f"{behavior}_latency_to_first_prediction"].head(1) + latency_first ) aggregated[f"bin_last_{bin_size * 5}.{behavior}_latency_last_prediction"] = ( - aggregated[f"{behavior}_latency_to_last_prediction"].tail(1) + latency_last ) # Reset index to make MouseID a regular column @@ -207,7 +244,7 @@ def main(): # Aggregate data by bin size aggregated_data = aggregate_data_by_bin_size( - processed_data, args.bin_size, behavior + processed_data, args.bin_size, behavior, args.prev_bin_size ) # Drop excluded columns diff --git a/tests/support_code/__init__.py b/tests/support_code/__init__.py new file mode 100644 index 0000000..fb5b506 --- /dev/null +++ b/tests/support_code/__init__.py @@ -0,0 +1 @@ +"""Tests for support code modules.""" diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py new file mode 100644 index 0000000..6c51542 --- /dev/null +++ b/tests/support_code/test_behavior_summaries.py @@ -0,0 +1,361 @@ +"""Unit tests for support_code/behavior_summaries.py.""" + +import math +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +# behavior_summaries.py lives in support_code/, which is not a package. +# Add it to sys.path so we can import it directly. 
+sys.path.insert(0, str(Path(__file__).parents[2] / "support_code")) + +import behavior_summaries + +BEHAVIOR = "Jumping" + + +def _make_bin_data( + latency_first_values: list, + latency_last_values: list, + avg_bout_durations: list | None = None, + stats_sample_counts: list | None = None, + mouse_id: str = "mouse_A", +) -> pd.DataFrame: + """Build a per-bin DataFrame matching the shape expected by aggregate_data_by_bin_size.""" + n = len(latency_first_values) + if avg_bout_durations is None: + avg_bout_durations = [1.5] * n + if stats_sample_counts is None: + stats_sample_counts = [2] * n + return pd.DataFrame( + { + "MouseID": [mouse_id] * n, + f"{BEHAVIOR}_latency_to_first_prediction": latency_first_values, + f"{BEHAVIOR}_latency_to_last_prediction": latency_last_values, + f"{BEHAVIOR}_time_behavior": [100.0] * n, + f"{BEHAVIOR}_time_not_behavior": [200.0] * n, + f"{BEHAVIOR}_behavior_dist": [50.0] * n, + f"{BEHAVIOR}_behavior_dist_threshold": [10.0] * n, + f"{BEHAVIOR}_behavior_dist_seg": [5.0] * n, + f"{BEHAVIOR}_bout_behavior": [2] * n, + f"{BEHAVIOR}_avg_bout_duration": avg_bout_durations, + f"{BEHAVIOR}__stats_sample_count": stats_sample_counts, + f"{BEHAVIOR}_bout_duration_std": [0.1] * n, + f"{BEHAVIOR}_bout_duration_var": [0.01] * n, + } + ) + + +class TestLatencyFirstPrediction: + """Tests for bin_first_XX.latency_first_prediction (incremental semantics).""" + + def test_single_bin_returns_value(self): + """bin_size=1, prev_bin_size=0: returns first bin's value.""" + data = _make_bin_data( + latency_first_values=[2506.0], + latency_last_values=[4900.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR, prev_bin_size=0 + ) + col = f"bin_first_5.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(2506.0) + + def test_single_bin_nan_returns_nan(self): + """NaN input returns NaN for single bin.""" + data = _make_bin_data( + latency_first_values=[float("nan")], + 
latency_last_values=[float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR, prev_bin_size=0 + ) + col = f"bin_first_5.{BEHAVIOR}_latency_first_prediction" + assert math.isnan(result[col].iloc[0]) + + def test_consecutive_bins_returns_incremental_value(self): + """bin_size=2, prev_bin_size=1: should return bin 1's value (5-10min), not bin 0's.""" + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0], + latency_last_values=[4900.0, 11000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_10.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(9412.0) + + def test_non_consecutive_bins_returns_first_in_range(self): + """bin_size=3, prev_bin_size=1: incremental window is bins 1-2 (5-15min). + + Should return first non-NaN in that range. + """ + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0, 18082.0], + latency_last_values=[4900.0, 11000.0, 19000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(9412.0) + + def test_non_consecutive_skips_nan_in_range(self): + """bin_size=3, prev_bin_size=1: bins 1-2, bin 1 is NaN → returns bin 2's value.""" + data = _make_bin_data( + latency_first_values=[2506.0, float("nan"), 18082.0], + latency_last_values=[4900.0, float("nan"), 19000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(18082.0) + + def test_incremental_all_nan_returns_nan(self): + """bin_size=3, prev_bin_size=1: bins 1-2 both NaN → NaN.""" + data = _make_bin_data( + latency_first_values=[2506.0, 
float("nan"), float("nan")], + latency_last_values=[4900.0, float("nan"), float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" + assert math.isnan(result[col].iloc[0]) + + def test_prev_bin_zero_returns_first_non_nan(self): + """bin_size=3, prev_bin_size=0: full window 0-15min.""" + data = _make_bin_data( + latency_first_values=[float("nan"), 9412.0, 18082.0], + latency_last_values=[float("nan"), 11000.0, 19000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=0 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(9412.0) + + +class TestLatencyLastPrediction: + """Tests for bin_last_XX.latency_last_prediction (incremental semantics).""" + + def test_single_bin_returns_value(self): + """Single bin returns the last prediction value.""" + data = _make_bin_data( + latency_first_values=[2506.0], + latency_last_values=[4900.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR, prev_bin_size=0 + ) + col = f"bin_last_5.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == pytest.approx(4900.0) + + def test_consecutive_bins_returns_incremental_value(self): + """bin_size=2, prev_bin_size=1: returns bin 1's last prediction.""" + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0], + latency_last_values=[4900.0, 14863.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_last_10.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == pytest.approx(14863.0) + + def test_non_consecutive_returns_last_non_nan(self): + """bin_size=3, prev_bin_size=1: bins 1-2, returns last non-NaN.""" + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0, 
float("nan")], + latency_last_values=[4900.0, 11000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_last_15.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == pytest.approx(11000.0) + + def test_incremental_all_nan_returns_nan(self): + """All NaN in incremental range returns NaN.""" + data = _make_bin_data( + latency_first_values=[2506.0, float("nan"), float("nan")], + latency_last_values=[4900.0, float("nan"), float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_last_15.{BEHAVIOR}_latency_last_prediction" + assert math.isnan(result[col].iloc[0]) + + +class TestAvgBoutLength: + """Tests for avg_bout_length aggregation.""" + + def test_single_bin_returns_that_bins_value(self): + """Single bin should return that bin's avg_bout_duration.""" + data = _make_bin_data( + latency_first_values=[100.0], + latency_last_values=[200.0], + avg_bout_durations=[18.8], + stats_sample_counts=[5], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR + ) + col = f"bin_avg_5.{BEHAVIOR}_avg_bout_length" + assert result[col].iloc[0] == pytest.approx(18.8) + + def test_cumulative_weighted_average(self): + """avg_bout_length is cumulative: weighted avg across all bins 0..bin_size.""" + data = _make_bin_data( + latency_first_values=[100.0, 200.0, 300.0], + latency_last_values=[150.0, 250.0, 350.0], + avg_bout_durations=[10.0, 20.0, 30.0], + stats_sample_counts=[5, 3, 4], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Weighted avg across all bins: (10*5 + 20*3 + 30*4) / (5+3+4) = 230/12 + expected = np.average([10.0, 20.0, 30.0], weights=[5, 3, 4]) + assert result[col].iloc[0] == pytest.approx(expected) + + def 
test_returns_nan_when_last_bin_has_no_behavior(self): + """Bins with sample_count=0 have zero weight; only bins with behavior contribute, so the result is bin 0's value (18.0), not NaN as the test name suggests.""" + data = _make_bin_data( + latency_first_values=[100.0, float("nan")], + latency_last_values=[200.0, float("nan")], + avg_bout_durations=[18.0, 0.0], + stats_sample_counts=[4, 0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) + col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" + assert result[col].iloc[0] == pytest.approx(18.0) + + def test_returns_nan_when_all_bins_have_no_behavior(self): + """Should return NaN when all bins have no behavior.""" + data = _make_bin_data( + latency_first_values=[float("nan"), float("nan")], + latency_last_values=[float("nan"), float("nan")], + avg_bout_durations=[0.0, 0.0], + stats_sample_counts=[0, 0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) + col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" + assert math.isnan(result[col].iloc[0]) + + def test_first_bin_no_behavior_uses_later_bins(self): + """No behavior in first bin (NaN duration, 0 count) — later bins drive the average.""" + data = _make_bin_data( + latency_first_values=[float("nan"), 100.0, 200.0], + latency_last_values=[float("nan"), 150.0, 250.0], + avg_bout_durations=[float("nan"), 10.0, 20.0], + stats_sample_counts=[0, 5, 3], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Bins 1 and 2 contribute: (10*5 + 20*3) / (5+3) = 110/8 = 13.75 + expected = np.average([10.0, 20.0], weights=[5, 3]) + assert result[col].iloc[0] == pytest.approx(expected) + + def test_nan_duration_with_zero_weight_is_ignored(self): + """NaN avg_bout_duration with sample_count=0 should not poison the weighted average.""" + data = _make_bin_data( + latency_first_values=[100.0, 200.0, float("nan")], + latency_last_values=[150.0, 250.0, 
float("nan")], + avg_bout_durations=[10.0, 20.0, float("nan")], + stats_sample_counts=[5, 3, 0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Only bins 0 and 1 contribute: (10*5 + 20*3) / (5+3) = 110/8 = 13.75 + expected = np.average([10.0, 20.0], weights=[5, 3]) + assert result[col].iloc[0] == pytest.approx(expected) + + def test_skips_bins_with_no_behavior_in_weighted_avg(self): + """Bins with sample_count=0 have zero weight and don't affect the average.""" + data = _make_bin_data( + latency_first_values=[100.0, float("nan"), 300.0], + latency_last_values=[150.0, float("nan"), 350.0], + avg_bout_durations=[10.0, 0.0, 30.0], + stats_sample_counts=[5, 0, 4], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Weighted avg = (10*5 + 30*4) / (5+4) = 170/9 ≈ 18.889 + expected = np.average([10.0, 30.0], weights=[5, 4]) + assert result[col].iloc[0] == pytest.approx(expected) + + +class TestMultiMouseAlignment: + """Tests for multi-mouse alignment in aggregation.""" + + def test_each_mouse_gets_its_own_first_latency(self): + """With multiple mice, each should receive its own incremental-window (bin 1) first/last latency values, not another mouse's.""" + mouse_a = _make_bin_data( + latency_first_values=[2506.0, 9412.0], + latency_last_values=[4900.0, 11000.0], + mouse_id="mouse_A", + ) + mouse_b = _make_bin_data( + latency_first_values=[3000.0, float("nan")], + latency_last_values=[5000.0, float("nan")], + mouse_id="mouse_B", + ) + data = pd.concat([mouse_a, mouse_b], ignore_index=True) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR, prev_bin_size=1 + ) + result = result.set_index("MouseID") + + first_col = f"bin_first_10.{BEHAVIOR}_latency_first_prediction" + last_col = f"bin_last_10.{BEHAVIOR}_latency_last_prediction" + + assert result.loc["mouse_A", 
first_col] == pytest.approx(9412.0) + assert math.isnan(result.loc["mouse_B", first_col]) + + assert result.loc["mouse_A", last_col] == pytest.approx(11000.0) + assert math.isnan(result.loc["mouse_B", last_col]) + + def test_each_mouse_gets_own_avg_bout_length(self): + """Each mouse gets its own cumulative weighted avg_bout_length.""" + mouse_a = _make_bin_data( + latency_first_values=[100.0, 200.0], + latency_last_values=[150.0, 250.0], + avg_bout_durations=[10.0, 20.0], + stats_sample_counts=[3, 5], + mouse_id="mouse_A", + ) + mouse_b = _make_bin_data( + latency_first_values=[300.0, 400.0], + latency_last_values=[350.0, 450.0], + avg_bout_durations=[7.0, 0.0], + stats_sample_counts=[2, 0], + mouse_id="mouse_B", + ) + data = pd.concat([mouse_a, mouse_b], ignore_index=True) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) + result = result.set_index("MouseID") + + col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" + # Mouse A: (10*3 + 20*5) / (3+5) = 130/8 = 16.25 + assert result.loc["mouse_A", col] == pytest.approx(16.25) + # Mouse B: only bin 0 has behavior → 7.0 + assert result.loc["mouse_B", col] == pytest.approx(7.0)