From 30225dd958bde4a776e7c136b94f90156dfea5c5 Mon Sep 17 00:00:00 2001 From: blalterman Date: Tue, 10 Feb 2026 17:51:18 -0500 Subject: [PATCH 1/2] docs(plans): add private-dev + public-release repo plan Documents the two-repo architecture for private development with public releases via rsync-based export script. Co-Authored-By: Claude Opus 4.6 --- plans/private-dev-public-release-repos.md | 347 ++++++++++++++++++++++ 1 file changed, 347 insertions(+) create mode 100644 plans/private-dev-public-release-repos.md diff --git a/plans/private-dev-public-release-repos.md b/plans/private-dev-public-release-repos.md new file mode 100644 index 00000000..c1030769 --- /dev/null +++ b/plans/private-dev-public-release-repos.md @@ -0,0 +1,347 @@ +# Plan: Private Development Repo + Public Release Repo + +## Summary + +Create a private development repo (`blalterman/SolarWindPy-dev`) as the primary working +environment, while keeping the existing public repo (`blalterman/SolarWindPy`) as-is for +releases. All current public content stays public. The private repo is a **superset** of +the public repo, adding research code, private data, and experimental features. + +## Architecture + +``` +GitHub: + blalterman/SolarWindPy (public, existing) <- Release repo + blalterman/SolarWindPy-dev (private, new) <- Development repo + +Local: + ~/observatories/code/ + |-- SolarWindPy/ -> public repo clone (releases only) + +-- SolarWindPy-dev/ -> private repo clone (daily work) +``` + +**Relationship**: Private repo is a superset. It contains everything the public repo has, +plus additional private directories. Development happens in private; curated releases flow +to public via an export script. + +## Evidence & Justification + +### Why two repos instead of branch-based separation? +- Git has no per-remote file filtering. Pushing `master` to two remotes sends ALL files. +- Branch-based approaches (e.g., `develop` with private content, `master` without) require + careful merge hygiene that is error-prone for a solo developer. +- Two repos with an export script is the standard pattern used by Linux, Android, and + corporate open-source projects for private-dev -> public-release workflows. +- Zero risk of accidental private content leakage. + +### Why copy history rather than start fresh? +- The private repo should have full commit history for `git blame`, bisect, and reference. +- Starting from a bare clone means both repos share the same initial history (1,383 commits). +- setuptools_scm works identically in both (same tags, same version derivation). + +### Why keep the public repo unchanged? +- User's explicit requirement: "Everything that currently is public can stay public." +- No external link breakage (PyPI, conda-forge, ReadTheDocs, 4 stars, 3 forks). +- No CI/CD reconfiguration needed for the public repo. +- `.claude/`, `plans/`, `paper/` remain public as they already are. + +## Phase 1: Create Private Repository (~15 min) + +### Step 1.1: Create private repo on GitHub +```bash +gh repo create blalterman/SolarWindPy-dev \ + --private \ + --description "Private development repository for SolarWindPy" +``` + +### Step 1.2: Push full history to private repo +```bash +# Clone the public repo as a bare mirror +git clone --bare git@github.com:blalterman/SolarWindPy.git /tmp/swp-mirror + +# Push everything to the new private repo +cd /tmp/swp-mirror +git push --mirror git@github.com:blalterman/SolarWindPy-dev.git + +# Cleanup +rm -rf /tmp/swp-mirror +``` + +### Step 1.3: Clone private repo locally +```bash +cd ~/observatories/code +git clone git@github.com:blalterman/SolarWindPy-dev.git SolarWindPy-dev +``` + +### Step 1.4: Add public repo as a remote in the private clone +```bash +cd ~/observatories/code/SolarWindPy-dev +git remote add public git@github.com:blalterman/SolarWindPy.git +git fetch public +``` + +**Verification**: `git remote -v` shows both `origin` (private) and `public` (public). + +## Phase 2: Configure Private Repo Structure (~30 min) + +### Step 2.1: Add private-only directories +```bash +cd ~/observatories/code/SolarWindPy-dev +mkdir -p research/{notebooks,analysis,experiments} +mkdir -p private/{data,configs} +``` + +### Step 2.2: Create private content manifest +Create `.private-manifest` (tracked in private repo, excluded from export): +``` +# Paths that exist only in the private repo. +# The export script excludes these when syncing to public. +research/ +private/ +.private-manifest +``` + +### Step 2.3: Update private repo .gitignore +Add to `.gitignore`: +``` +# Private data files (never commit even to private repo) +private/data/**/*.h5 +private/data/**/*.hdf5 +private/configs/secrets.* +``` + +### Step 2.4: Initial commit of private structure +```bash +git add research/ private/ .private-manifest +git commit -m "feat: add private research and data directories + +Establishes private-only directories for research code, analysis +notebooks, experimental features, and private data/configs. +These directories are excluded from public repo syncs. + +Co-Authored-By: Claude " +git push origin master +``` + +## Phase 3: Create Export Script (~1 hour) + +### Step 3.1: Create `scripts/export-to-public.sh` in private repo + +This script: +1. Reads `.private-manifest` for paths to exclude +2. Uses `rsync` to sync everything else to the local public clone +3. Commits and optionally tags in the public clone +4. Pushes to the public GitHub repo + +```bash +#!/bin/bash +# export-to-public.sh -- Sync public-safe content from private to public repo +# +# Usage: +# ./scripts/export-to-public.sh # Sync without tagging +# ./scripts/export-to-public.sh v0.4.0 # Sync and tag for release +# ./scripts/export-to-public.sh --dry-run # Show what would be synced + +set -euo pipefail + +PRIVATE_REPO="$(cd "$(dirname "$0")/.." && pwd)" +PUBLIC_REPO="${PRIVATE_REPO}/../SolarWindPy" +MANIFEST="${PRIVATE_REPO}/.private-manifest" + +# Parse args +DRY_RUN=false +VERSION_TAG="" +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + v*) VERSION_TAG="$arg" ;; + *) echo "Unknown arg: $arg"; exit 1 ;; + esac +done + +# Validate +[ -d "$PUBLIC_REPO/.git" ] || { echo "Error: Public repo not found at $PUBLIC_REPO"; exit 1; } +[ -f "$MANIFEST" ] || { echo "Error: .private-manifest not found"; exit 1; } + +# Build rsync exclude list from manifest +EXCLUDES=() +while IFS= read -r line; do + [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue + EXCLUDES+=(--exclude="$line") +done < "$MANIFEST" + +# Always exclude git directory and build artifacts +EXCLUDES+=(--exclude=".git" --exclude="dist/" --exclude="*.egg-info/" + --exclude="htmlcov/" --exclude="__pycache__/" --exclude=".pytest_cache/" + --exclude=".eggs/" --exclude="build/" --exclude="tmp/" + --exclude="staged-recipes*/" --exclude="*.pyc") + +echo "=== Export to Public Repo ===" +echo "Private: $PRIVATE_REPO" +echo "Public: $PUBLIC_REPO" +echo "Excludes: ${EXCLUDES[*]}" + +if [ "$DRY_RUN" = true ]; then + echo "--- DRY RUN ---" + rsync -avn --delete "${EXCLUDES[@]}" "$PRIVATE_REPO/" "$PUBLIC_REPO/" + exit 0 +fi + +# Sync +rsync -av --delete "${EXCLUDES[@]}" "$PRIVATE_REPO/" "$PUBLIC_REPO/" + +# Commit changes in public repo +cd "$PUBLIC_REPO" +if [ -n "$(git status --porcelain)" ]; then + git add -A + git commit -m "$(cat < +EOF +)" + echo "Committed changes to public repo" +else + echo "No changes to commit" +fi + +# Tag if requested +if [ -n "$VERSION_TAG" ]; then + git tag -a "$VERSION_TAG" -m "SolarWindPy $VERSION_TAG" + echo "Tagged as $VERSION_TAG" + echo "" + echo "Ready to push. Run:" + echo " cd $PUBLIC_REPO && git push origin master --tags" +fi + +echo "=== Export complete ===" +``` + +### Step 3.2: Make executable and commit +```bash +chmod +x scripts/export-to-public.sh +git add scripts/export-to-public.sh +git commit -m "feat(scripts): add export-to-public sync script + +Provides one-command sync from private dev repo to public release +repo, reading .private-manifest for paths to exclude. + +Co-Authored-By: Claude " +``` + +## Phase 4: CI/CD Configuration (~30 min) + +### Private repo CI/CD +No file changes needed. Existing CI/CD workflows work as-is because the private +repo is a superset of the public repo (same structure, same files, same tests). + +### Public repo CI/CD stays as-is +The `publish.yml` workflow already triggers on `v*` tags and publishes to PyPI. +The `docs.yml` workflow already builds documentation. No changes needed. + +### Optional: Disable publishing from private repo +To prevent accidental publishing from the wrong repo, remove the `PYPI_API_TOKEN` +secret from the private repo's GitHub settings. The private repo's `publish.yml` +would fail on tag push (harmless), ensuring you never accidentally publish from +the wrong source. + +**Action**: GitHub Settings (no file changes) -> Private repo -> Settings -> Secrets -> Remove `PYPI_API_TOKEN` + +## Phase 5: Development Workflow (Reference) + +### Daily development +```bash +cd ~/observatories/code/SolarWindPy-dev # Work in private repo +git checkout -b feature/my-feature # Feature branch +# ... develop, test, commit ... +git push origin feature/my-feature # Push to private +# Create PR on private repo (for your own tracking) +``` + +### Research & experiments +```bash +cd ~/observatories/code/SolarWindPy-dev +# Research code goes in research/ -- never synced to public +vim research/notebooks/ion_temperature_analysis.ipynb +vim research/experiments/new_fitting_approach.py +git add research/ +git commit -m "research: preliminary ion temperature analysis" +git push origin master +``` + +### Release workflow +```bash +cd ~/observatories/code/SolarWindPy-dev +# 1. Ensure master is clean and tests pass +pytest -q +# 2. Update CHANGELOG.md +# 3. Dry-run the export +./scripts/export-to-public.sh --dry-run +# 4. Export and tag +./scripts/export-to-public.sh v0.4.0 +# 5. Push to public +cd ~/observatories/code/SolarWindPy +git push origin master --tags +# 6. publish.yml fires -> PyPI -> conda-forge +``` + +### Pulling community contributions (if any) +```bash +cd ~/observatories/code/SolarWindPy-dev +git fetch public +git merge public/master # Pull any external PRs into private +``` + +## Phase 6: Keeping Repos in Sync (Reference) + +### When to sync +- **Always sync before release**: Run export script before tagging +- **Don't sync on every commit**: The public repo receives batch updates at release time +- **Pull from public after external PRs**: If someone contributes to the public repo + +### Conflict prevention +- All development happens in private repo -> public is never ahead of private + (except for external contributions, which are rare for a solo project) +- The export script uses `rsync --delete`, so public repo always matches the + private repo's public-safe content exactly + +### Private content safety +- `.private-manifest` lists all private-only paths +- `rsync --delete` with exclusions ensures private paths never appear in public +- No complex git gymnastics needed -- it's just a file sync + +## Risk Analysis + +| Risk | Severity | Probability | Mitigation | +|------|----------|-------------|------------| +| Forget to sync before release | Medium | Medium | Add to CHANGELOG/release checklist | +| Private content leaks to public | High | Very Low | `.private-manifest` + `rsync` excludes | +| setuptools_scm version mismatch | Medium | Low | Both repos have same tags; verify with `python -m setuptools_scm` | +| Divergent histories after external PR | Low | Low | `git fetch public && git merge public/master` | +| Accidental PyPI publish from private | Medium | Low | Remove PYPI_API_TOKEN from private repo secrets | + +## Cost Analysis (GitHub Organization Question) + +If an org is desired later, these are the costs: +- **Free tier**: Unlimited public + private repos, 2,000 CI min/month. No branch protection on private repos. +- **Team**: $4/user/month. Adds branch protection, required reviewers. +- **Recommendation**: Not needed now. Revisit if the project grows to multiple contributors. + +## Verification Checklist + +After implementation, verify: +- [ ] `git remote -v` in private clone shows both `origin` and `public` +- [ ] `pytest -q` passes in private repo +- [ ] `./scripts/export-to-public.sh --dry-run` shows correct file list +- [ ] Private-only directories (`research/`, `private/`) do NOT appear in dry-run output +- [ ] `python -m setuptools_scm` reports correct version in both repos +- [ ] `.claude/` infrastructure works normally in private repo +- [ ] Public repo CI passes after first sync + +## Critical Files + +- `SolarWindPy/.gitignore` -- needs private-data patterns (in private clone only) +- `SolarWindPy/pyproject.toml` -- URLs stay as-is (public repo) +- `SolarWindPy/.pre-commit-config.yaml` -- works in both repos unchanged +- New: `scripts/export-to-public.sh` -- the sync script (private repo only) +- New: `.private-manifest` -- private content manifest (private repo only) From 19211003fe76c7c869caf133c55257380a338950 Mon Sep 17 00:00:00 2001 From: blalterman Date: Tue, 17 Feb 2026 14:17:38 -0500 Subject: [PATCH 2/2] refactor(fitfunctions): remove joblib parallelization from TrendFit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Joblib's loky backend deadlocks on macOS with Python 3.12's spawn start method. The multiprocessing backend fails due to unpicklable closures, and the threading backend provides no CPU-bound speedup. Remove the non-functional parallel code path, related tests, benchmarks, and the performance optional dependency. All Phase 4 features in core.py (residuals use_all, in-place mask ops, xoutside/youtside) are preserved — they have zero joblib dependency. Generated with Claude Code Co-Authored-By: Claude --- benchmarks/fitfunctions_performance.py | 179 -------------- pyproject.toml | 3 - solarwindpy/fitfunctions/trend_fits.py | 146 +----------- .../fitfunctions/test_trend_fits_advanced.py | 222 +----------------- 4 files changed, 10 insertions(+), 540 deletions(-) delete mode 100644 benchmarks/fitfunctions_performance.py diff --git a/benchmarks/fitfunctions_performance.py b/benchmarks/fitfunctions_performance.py deleted file mode 100644 index 863c01e2..00000000 --- a/benchmarks/fitfunctions_performance.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python -"""Benchmark Phase 4 performance optimizations.""" - -import time -import numpy as np -import pandas as pd -import sys -import os - -# Add the parent directory to sys.path to import solarwindpy -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) - -from solarwindpy.fitfunctions import Gaussian -from solarwindpy.fitfunctions.trend_fits import TrendFit - - -def benchmark_trendfit(n_fits=50): - """Compare sequential vs parallel TrendFit performance.""" - print(f"\nBenchmarking with {n_fits} fits...") - - # Create synthetic data that's realistic for fitting - np.random.seed(42) - x = np.linspace(0, 10, 100) - data = pd.DataFrame({ - f'col_{i}': 5 * np.exp(-(x-5)**2/2) + np.random.normal(0, 0.1, 100) - for i in range(n_fits) - }, index=x) - - # Sequential execution - print(" Running sequential...") - tf_seq = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - - start = time.perf_counter() - tf_seq.make_1dfits(n_jobs=1) - seq_time = time.perf_counter() - start - - # Parallel execution - print(" Running parallel...") - tf_par = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - - start = time.perf_counter() - tf_par.make_1dfits(n_jobs=-1) - par_time = time.perf_counter() - start - - speedup = seq_time / par_time - print(f" Sequential: {seq_time:.2f}s") - print(f" Parallel: {par_time:.2f}s") - print(f" Speedup: {speedup:.1f}x") - - # Verify results match - print(" Verifying results match...") - successful_fits = 0 - for key in tf_seq.ffuncs.index: - if key in tf_par.ffuncs.index: # Both succeeded - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], par_popt[param], - rtol=1e-10, atol=1e-10 - ) - successful_fits += 1 - - print(f" ✓ {successful_fits} fits verified identical") - - return speedup, successful_fits - - -def benchmark_single_fitfunction(): - """Benchmark single FitFunction to understand baseline performance.""" - print("\nBenchmarking single FitFunction...") - - np.random.seed(42) - x = np.linspace(0, 10, 100) - y = 5 * np.exp(-(x-5)**2/2) + np.random.normal(0, 0.1, 100) - - # Time creation and fitting - start = time.perf_counter() - ff = Gaussian(x, y) - creation_time = time.perf_counter() - start - - start = time.perf_counter() - ff.make_fit() - fit_time = time.perf_counter() - start - - total_time = creation_time + fit_time - - print(f" Creation time: {creation_time*1000:.1f}ms") - print(f" Fitting time: {fit_time*1000:.1f}ms") - print(f" Total time: {total_time*1000:.1f}ms") - - return total_time - - -def check_joblib_availability(): - """Check if joblib is available for parallel processing.""" - try: - import joblib - print(f"✓ joblib {joblib.__version__} available") - - # Check number of cores - import os - n_cores = os.cpu_count() - print(f"✓ {n_cores} CPU cores detected") - return True - except ImportError: - print("✗ joblib not available - only sequential benchmarks will run") - return False - - -if __name__ == "__main__": - print("FitFunctions Phase 4 Performance Benchmark") - print("=" * 50) - - # Check system capabilities - has_joblib = check_joblib_availability() - - # Single fit baseline - single_time = benchmark_single_fitfunction() - - # TrendFit scaling benchmarks - speedups = [] - fit_counts = [] - - test_sizes = [10, 25, 50, 100] - if has_joblib: - # Only run larger tests if joblib is available - test_sizes.extend([200]) - - for n in test_sizes: - expected_seq_time = single_time * n - print(f"\nExpected sequential time for {n} fits: {expected_seq_time:.1f}s") - - try: - speedup, n_successful = benchmark_trendfit(n) - speedups.append(speedup) - fit_counts.append(n_successful) - except Exception as e: - print(f" ✗ Benchmark failed: {e}") - speedups.append(1.0) - fit_counts.append(0) - - # Summary report - print("\n" + "=" * 50) - print("BENCHMARK SUMMARY") - print("=" * 50) - - print(f"Single fit baseline: {single_time*1000:.1f}ms") - - if speedups: - print("\nTrendFit Scaling Results:") - print("Fits | Successful | Speedup") - print("-" * 30) - for i, n in enumerate(test_sizes): - if i < len(speedups): - print(f"{n:4d} | {fit_counts[i]:10d} | {speedups[i]:7.1f}x") - - if has_joblib: - avg_speedup = np.mean(speedups) - best_speedup = max(speedups) - print(f"\nAverage speedup: {avg_speedup:.1f}x") - print(f"Best speedup: {best_speedup:.1f}x") - - # Efficiency analysis - if avg_speedup > 1.5: - print("✓ Parallelization provides significant benefit") - else: - print("⚠ Parallelization benefit limited (overhead or few cores)") - else: - print("\nInstall joblib for parallel processing:") - print(" pip install joblib") - print(" or") - print(" pip install solarwindpy[performance]") - - print("\nTo use parallel fitting in your code:") - print(" tf.make_1dfits(n_jobs=-1) # Use all cores") - print(" tf.make_1dfits(n_jobs=4) # Use 4 cores") \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2a4b2e0a..57f3e5fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,9 +103,6 @@ dev = [ # Code analysis tools (ast-grep via MCP server, not Python package) "pre-commit>=3.5", # Git hook framework ] -performance = [ - "joblib>=1.3.0", # Parallel execution for TrendFit -] analysis = [ # Interactive analysis environment "jupyterlab>=4.0", diff --git a/solarwindpy/fitfunctions/trend_fits.py b/solarwindpy/fitfunctions/trend_fits.py index 1e389be7..a98326cc 100644 --- a/solarwindpy/fitfunctions/trend_fits.py +++ b/solarwindpy/fitfunctions/trend_fits.py @@ -6,22 +6,12 @@ """ -# import warnings import logging # noqa: F401 -import warnings import numpy as np import pandas as pd import matplotlib as mpl from collections import namedtuple -# Parallel processing support -try: - from joblib import Parallel, delayed - - JOBLIB_AVAILABLE = True -except ImportError: - JOBLIB_AVAILABLE = False - from ..plotting import subplots from . import core from . import gaussians @@ -159,146 +149,24 @@ def make_ffunc1ds(self, **kwargs): ffuncs = pd.Series(ffuncs) self._ffuncs = ffuncs - def make_1dfits(self, n_jobs=1, verbose=0, backend="loky", **kwargs): - r""" - Execute fits for all 1D functions, optionally in parallel. + def make_1dfits(self, **kwargs): + r"""Execute fits for all 1D functions. - Each FitFunction instance represents a single dataset to fit. - TrendFit creates many such instances (one per column), making - this ideal for parallelization. + Removes bad fits from `ffuncs` and saves them in `bad_fits`. Parameters ---------- - n_jobs : int, default=1 - Number of parallel jobs: - - 1: Sequential execution (default, backward compatible) - - -1: Use all available CPU cores - - n>1: Use n cores - Requires joblib: pip install joblib - verbose : int, default=0 - Joblib verbosity level (0=silent, 10=progress) - backend : str, default='loky' - Joblib backend ('loky', 'threading', 'multiprocessing') **kwargs Passed to each FitFunction.make_fit() - - Examples - -------- - >>> # TrendFit creates one FitFunction per column - >>> tf = TrendFit(agg_data, Gaussian, ffunc1d=Gaussian) - >>> tf.make_ffunc1ds() # Creates instances - >>> - >>> # Fit all instances sequentially (default) - >>> tf.make_1dfits() - >>> - >>> # Fit in parallel using all cores - >>> tf.make_1dfits(n_jobs=-1) - >>> - >>> # With progress display - >>> tf.make_1dfits(n_jobs=-1, verbose=10) - - Notes - ----- - Parallel execution returns complete fitted FitFunction objects from worker - processes, which incurs serialization overhead. This overhead typically - outweighs parallelization benefits for simple fits. Parallelization is - most beneficial for: - - - Complex fitting functions with expensive computations - - Large datasets (>1000 points per fit) - - Batch processing of many fits (>50) - - Systems with many CPU cores and sufficient memory - - For typical Gaussian fits on moderate data, sequential execution (n_jobs=1) - may be faster due to Python's GIL and serialization overhead. - - Removes bad fits from `ffuncs` and saves them in `bad_fits`. """ # Successful fits return None, which pandas treats as NaN. return_exception = kwargs.pop("return_exception", True) - # Filter out parallelization parameters from kwargs before passing to make_fit() - # These are specific to make_1dfits() and should not be passed to individual fits - fit_kwargs = { - k: v for k, v in kwargs.items() if k not in ["n_jobs", "verbose", "backend"] - } - - # Check if parallel execution is requested and possible - if n_jobs != 1 and len(self.ffuncs) > 1: - if not JOBLIB_AVAILABLE: - warnings.warn( - f"joblib not installed. Install with 'pip install joblib' " - f"for parallel processing of {len(self.ffuncs)} fits. " - f"Falling back to sequential execution.", - UserWarning, - ) - n_jobs = 1 - else: - # Parallel execution - return fitted objects to preserve TrendFit architecture - def fit_single_from_data( - column_name, x_data, y_data, ffunc_class, ffunc_kwargs - ): - """Create and fit FitFunction, return both result and fitted object.""" - # Create new FitFunction instance in worker process - ffunc = ffunc_class(x_data, y_data, **ffunc_kwargs) - fit_result = ffunc.make_fit( - return_exception=return_exception, **fit_kwargs - ) - # Return tuple: (fit_result, fitted_object) - return (fit_result, ffunc) - - # Prepare minimal data for each fit - fit_tasks = [] - for col_name, ffunc in self.ffuncs.items(): - x_data = ffunc.observations.raw.x - y_data = ffunc.observations.raw.y - ffunc_class = type(ffunc) - # Extract constructor kwargs from ffunc (constraints, etc.) - ffunc_kwargs = { - "xmin": getattr(ffunc, "xmin", None), - "xmax": getattr(ffunc, "xmax", None), - "ymin": getattr(ffunc, "ymin", None), - "ymax": getattr(ffunc, "ymax", None), - "xoutside": getattr(ffunc, "xoutside", None), - "youtside": getattr(ffunc, "youtside", None), - } - # Remove None values - ffunc_kwargs = { - k: v for k, v in ffunc_kwargs.items() if v is not None - } - - fit_tasks.append( - (col_name, x_data, y_data, ffunc_class, ffunc_kwargs) - ) - - # Run fits in parallel and get both results and fitted objects - parallel_output = Parallel( - n_jobs=n_jobs, verbose=verbose, backend=backend - )( - delayed(fit_single_from_data)( - col_name, x_data, y_data, ffunc_class, ffunc_kwargs - ) - for col_name, x_data, y_data, ffunc_class, ffunc_kwargs in fit_tasks - ) - - # Separate results and fitted objects, update self.ffuncs with fitted objects - fit_results = [] - for idx, (result, fitted_ffunc) in enumerate(parallel_output): - fit_results.append(result) - # CRITICAL: Replace original with fitted object to preserve TrendFit architecture - col_name = self.ffuncs.index[idx] - self.ffuncs[col_name] = fitted_ffunc - - # Convert to Series for bad fit handling - fit_success = pd.Series(fit_results, index=self.ffuncs.index) - - if n_jobs == 1: - # Original sequential implementation (unchanged) - fit_success = self.ffuncs.apply( - lambda x: x.make_fit(return_exception=return_exception, **fit_kwargs) - ) + fit_success = self.ffuncs.apply( + lambda x: x.make_fit(return_exception=return_exception, **kwargs) + ) - # Handle failed fits (original code, unchanged) + # Handle failed fits bad_idx = fit_success.dropna().index bad_fits = self.ffuncs.loc[bad_idx] self._bad_fits = bad_fits diff --git a/tests/fitfunctions/test_trend_fits_advanced.py b/tests/fitfunctions/test_trend_fits_advanced.py index 3e42b31c..1baf9708 100644 --- a/tests/fitfunctions/test_trend_fits_advanced.py +++ b/tests/fitfunctions/test_trend_fits_advanced.py @@ -1,14 +1,12 @@ -"""Test Phase 4 performance optimizations.""" +"""Test TrendFit advanced features.""" import time -import warnings import matplotlib import matplotlib.pyplot as plt import numpy as np import pandas as pd import pytest -from unittest.mock import patch from solarwindpy.fitfunctions import Gaussian, Line from solarwindpy.fitfunctions.trend_fits import TrendFit @@ -16,219 +14,6 @@ matplotlib.use("Agg") # Non-interactive backend for testing -class TestTrendFitParallelization: - """Test TrendFit parallel execution.""" - - def setup_method(self): - """Create test data for reproducible tests.""" - np.random.seed(42) - x = np.linspace(0, 10, 50) - self.data = pd.DataFrame( - { - f"col_{i}": 5 * np.exp(-((x - 5) ** 2) / 2) - + np.random.normal(0, 0.1, 50) - for i in range(10) - }, - index=x, - ) - - def test_backward_compatibility(self): - """Verify default behavior unchanged.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Should work without n_jobs parameter (default behavior) - tf.make_1dfits() - assert len(tf.ffuncs) > 0 - assert hasattr(tf, "_bad_fits") - - def test_parallel_sequential_equivalence(self): - """Verify parallel gives same results as sequential.""" - # Sequential execution - tf_seq = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - tf_seq.make_1dfits(n_jobs=1) - - # Parallel execution - tf_par = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - tf_par.make_1dfits(n_jobs=2) - - # Should have same number of successful fits - assert len(tf_seq.ffuncs) == len(tf_par.ffuncs) - - # Compare all fit parameters - for key in tf_seq.ffuncs.index: - assert ( - key in tf_par.ffuncs.index - ), f"Fit {key} missing from parallel results" - - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - - # Parameters should match within numerical precision - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], - par_popt[param], - rtol=1e-10, - atol=1e-10, - err_msg=f"Parameter {param} differs between sequential and parallel", - ) - - def test_parallel_execution_correctness(self): - """Verify parallel execution works correctly, acknowledging Python GIL limitations.""" - # Check if joblib is available - if not, test falls back gracefully - try: - import joblib # noqa: F401 - - joblib_available = True - except ImportError: - joblib_available = False - - # Create test dataset - focus on correctness rather than performance - x = np.linspace(0, 10, 100) - data = pd.DataFrame( - { - f"col_{i}": 5 * np.exp(-((x - 5) ** 2) / 2) - + np.random.normal(0, 0.1, 100) - for i in range(20) # Reasonable number of fits - }, - index=x, - ) - - # Time sequential execution - tf_seq = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - start = time.perf_counter() - tf_seq.make_1dfits(n_jobs=1) - seq_time = time.perf_counter() - start - - # Time parallel execution with threading - tf_par = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - start = time.perf_counter() - tf_par.make_1dfits(n_jobs=4, backend="threading") - par_time = time.perf_counter() - start - - speedup = seq_time / par_time if par_time > 0 else float("inf") - - print( - f"Sequential time: {seq_time:.3f}s, fits: {len(tf_seq.ffuncs)}" # noqa: E231 - ) - print( - f"Parallel time: {par_time:.3f}s, fits: {len(tf_par.ffuncs)}" # noqa: E231 - ) - print( - f"Speedup achieved: {speedup:.2f}x (joblib available: {joblib_available})" # noqa: E231 - ) - - if joblib_available: - # Main goal: verify parallel execution works and produces correct results - # Note: Due to Python GIL and serialization overhead, speedup may be minimal - # or even negative for small/fast workloads. This is expected behavior. - assert ( - speedup > 0.05 - ), f"Parallel execution extremely slow, got {speedup:.2f}x" # noqa: E231 - print( - "NOTE: Python GIL and serialization overhead may limit speedup for small workloads" - ) - else: - # Without joblib, both should be sequential (speedup ~1.0) - # Widen tolerance to 1.5 for timing variability across platforms - assert ( - 0.5 <= speedup <= 1.5 - ), f"Expected ~1.0x speedup without joblib, got {speedup:.2f}x" # noqa: E231 - - # Most important: verify both produce the same number of successful fits - assert len(tf_seq.ffuncs) == len( - tf_par.ffuncs - ), "Parallel and sequential should have same success rate" - - # Verify results are equivalent (this is the key correctness test) - for key in tf_seq.ffuncs.index: - if key in tf_par.ffuncs.index: # Both succeeded - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], - par_popt[param], - rtol=1e-10, - atol=1e-10, - err_msg=f"Parameter {param} differs between sequential and parallel", - ) - - def test_joblib_not_installed_fallback(self): - """Test graceful fallback when joblib unavailable.""" - # Mock joblib as unavailable - with patch.dict("sys.modules", {"joblib": None}): - # Force reload to simulate joblib not being installed - import solarwindpy.fitfunctions.trend_fits as tf_module - - # Temporarily mock JOBLIB_AVAILABLE - original_available = tf_module.JOBLIB_AVAILABLE - tf_module.JOBLIB_AVAILABLE = False - - try: - tf = tf_module.TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - tf.make_1dfits(n_jobs=-1) # Request parallel - - # Should warn about joblib not being available - assert len(w) == 1 - assert "joblib not installed" in str(w[0].message) - assert "parallel processing" in str(w[0].message) - - # Should still complete successfully with sequential execution - assert len(tf.ffuncs) > 0 - finally: - # Restore original state - tf_module.JOBLIB_AVAILABLE = original_available - - def test_n_jobs_parameter_validation(self): - """Test different n_jobs parameter values.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Test various n_jobs values - for n_jobs in [1, 2, -1]: - tf_test = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_test.make_ffunc1ds() - tf_test.make_1dfits(n_jobs=n_jobs) - assert len(tf_test.ffuncs) > 0, f"n_jobs={n_jobs} failed" - - def test_verbose_parameter(self): - """Test verbose parameter doesn't break execution.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Should work with verbose output (though we can't easily test the output) - tf.make_1dfits(n_jobs=2, verbose=0) - assert len(tf.ffuncs) > 0 - - def test_backend_parameter(self): - """Test different joblib backends.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Test different backends (may not all be available in all environments) - for backend in ["loky", "threading"]: - tf_test = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_test.make_ffunc1ds() - try: - tf_test.make_1dfits(n_jobs=2, backend=backend) - assert len(tf_test.ffuncs) > 0, f"Backend {backend} failed" - except ValueError: - # Some backends may not be available in all environments - pytest.skip( - f"Backend {backend} not available in this environment" # noqa: E713 - ) - - class TestResidualsEnhancement: """Test residuals use_all parameter.""" @@ -400,7 +185,7 @@ def test_complete_workflow(self): * np.exp(-((x - (10 + i * 0.2)) ** 2) / (2 * (2 + i * 0.1) ** 2)) + np.random.normal(0, 0.05, 200) ) - for i in range(25) # 25 measurements for good parallelization test + for i in range(25) }, index=x, ) @@ -409,9 +194,8 @@ def test_complete_workflow(self): tf = TrendFit(data, Gaussian, ffunc1d=Gaussian) tf.make_ffunc1ds() - # Fit with parallelization start_time = time.perf_counter() - tf.make_1dfits(n_jobs=-1, verbose=0) + tf.make_1dfits() execution_time = time.perf_counter() - start_time # Verify results