diff --git a/benchmarks/benchmark_io_performance.py b/benchmarks/benchmark_io_performance.py
new file mode 100644
index 000000000..e73032901
--- /dev/null
+++ b/benchmarks/benchmark_io_performance.py
@@ -0,0 +1,210 @@
+"""Benchmark script for FlowSystem IO performance.
+
+Tests to_dataset() and from_dataset() performance with large FlowSystems.
+Run this to compare performance before/after optimizations.
+
+Usage:
+    python benchmarks/benchmark_io_performance.py
+"""
+
+import time
+from typing import NamedTuple
+
+import numpy as np
+import pandas as pd
+
+import flixopt as fx
+
+
+class BenchmarkResult(NamedTuple):
+    """Results from a benchmark run."""
+
+    name: str
+    mean_ms: float
+    std_ms: float
+    iterations: int
+
+
+def create_large_flow_system(
+    n_timesteps: int = 2190,
+    n_periods: int = 12,
+    n_components: int = 125,
+) -> fx.FlowSystem:
+    """Create a large FlowSystem for benchmarking.
+
+    Args:
+        n_timesteps: Number of timesteps (default 2190 = one year at 4h resolution).
+        n_periods: Number of periods (default 12).
+        n_components: Total number of components; created as n_components // 2 sink/source pairs (default 125).
+
+    Returns:
+        Configured FlowSystem ready for optimization.
+    """
+    timesteps = pd.date_range('2024-01-01', periods=n_timesteps, freq='4h')
+    periods = pd.Index([2028 + i * 2 for i in range(n_periods)], name='period')
+
+    fs = fx.FlowSystem(timesteps=timesteps, periods=periods)
+    fs.add_elements(fx.Effect('Cost', '€', is_objective=True))
+
+    n_buses = 10
+    buses = [fx.Bus(f'Bus_{i}') for i in range(n_buses)]
+    fs.add_elements(*buses)
+
+    # Create demand profile with daily pattern
+    base_demand = 100 + 50 * np.sin(2 * np.pi * np.arange(n_timesteps) / 24)
+    rng = np.random.default_rng(42)  # fixed seed keeps generated systems reproducible across runs
+
+    for i in range(n_components // 2):
+        bus = buses[i % n_buses]
+        # Add noise to create unique profiles
+        profile = base_demand + rng.normal(0, 10, n_timesteps)
+        profile = np.clip(profile / profile.max(), 0.1, 1.0)
+
+        fs.add_elements(
+            fx.Sink(
+                f'D_{i}',
+                inputs=[fx.Flow(f'Q_{i}', bus=bus.label, size=100, fixed_relative_profile=profile)],
+            )
+        )
+        fs.add_elements(
+            fx.Source(
+                f'S_{i}',
+                outputs=[fx.Flow(f'P_{i}', bus=bus.label, size=500, effects_per_flow_hour={'Cost': 20 + i})],
+            )
+        )
+
+    return fs
+
+
+def benchmark_function(func, iterations: int = 5, warmup: int = 1) -> BenchmarkResult:
+    """Benchmark a function with multiple iterations.
+
+    Args:
+        func: Function to benchmark (callable with no arguments).
+        iterations: Number of timed iterations.
+        warmup: Number of warmup iterations (not timed).
+
+    Returns:
+        BenchmarkResult with timing statistics.
+    """
+    # Warmup
+    for _ in range(warmup):
+        func()
+
+    # Timed runs
+    times = []
+    for _ in range(iterations):
+        start = time.perf_counter()
+        func()
+        elapsed = time.perf_counter() - start
+        times.append(elapsed)
+
+    return BenchmarkResult(
+        name=func.__name__ if hasattr(func, '__name__') else str(func),
+        mean_ms=np.mean(times) * 1000,
+        std_ms=np.std(times) * 1000,
+        iterations=iterations,
+    )
+
+
+def run_io_benchmarks(
+    n_timesteps: int = 2190,
+    n_periods: int = 12,
+    n_components: int = 125,
+    n_clusters: int = 8,
+    iterations: int = 5,
+) -> dict[str, BenchmarkResult]:
+    """Run IO performance benchmarks.
+
+    Args:
+        n_timesteps: Number of timesteps for the FlowSystem.
+        n_periods: Number of periods.
+        n_components: Total number of components (created as sink/source pairs).
+        n_clusters: Number of clusters for aggregation.
+        iterations: Number of benchmark iterations.
+
+    Returns:
+        Dictionary mapping benchmark names to results.
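+
+    Example:
+        A reduced configuration (fewer components and iterations) gives a
+        quick smoke test of the pipeline before a full-size run::
+
+            results = run_io_benchmarks(n_components=10, iterations=2)
+            print(results['to_dataset'].mean_ms)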
+ """ + print('=' * 70) + print('FlowSystem IO Performance Benchmark') + print('=' * 70) + print('\nConfiguration:') + print(f' Timesteps: {n_timesteps}') + print(f' Periods: {n_periods}') + print(f' Components: {n_components}') + print(f' Clusters: {n_clusters}') + print(f' Iterations: {iterations}') + + # Create and prepare FlowSystem + print('\n1. Creating FlowSystem...') + fs = create_large_flow_system(n_timesteps, n_periods, n_components) + print(f' Components: {len(fs.components)}') + + print('\n2. Clustering and solving...') + fs_clustered = fs.transform.cluster(n_clusters=n_clusters, cluster_duration='1D') + + # Try Gurobi first, fall back to HiGHS if not available + try: + solver = fx.solvers.GurobiSolver() + fs_clustered.optimize(solver) + except Exception as e: + if 'gurobi' in str(e).lower() or 'license' in str(e).lower(): + print(f' Gurobi not available ({e}), falling back to HiGHS...') + solver = fx.solvers.HighsSolver() + fs_clustered.optimize(solver) + else: + raise + + print('\n3. Expanding...') + fs_expanded = fs_clustered.transform.expand() + print(f' Expanded timesteps: {len(fs_expanded.timesteps)}') + + # Create dataset with solution + print('\n4. Creating dataset...') + ds = fs_expanded.to_dataset(include_solution=True) + print(f' Variables: {len(ds.data_vars)}') + print(f' Size: {ds.nbytes / 1e6:.1f} MB') + + results = {} + + # Benchmark to_dataset + print('\n5. Benchmarking to_dataset()...') + result = benchmark_function(lambda: fs_expanded.to_dataset(include_solution=True), iterations=iterations) + results['to_dataset'] = result + print(f' Mean: {result.mean_ms:.1f}ms (std: {result.std_ms:.1f}ms)') + + # Benchmark from_dataset + print('\n6. Benchmarking from_dataset()...') + result = benchmark_function(lambda: fx.FlowSystem.from_dataset(ds), iterations=iterations) + results['from_dataset'] = result + print(f' Mean: {result.mean_ms:.1f}ms (std: {result.std_ms:.1f}ms)') + + # Verify restoration + print('\n7. 
Verification...') + fs_restored = fx.FlowSystem.from_dataset(ds) + print(f' Components restored: {len(fs_restored.components)}') + print(f' Timesteps restored: {len(fs_restored.timesteps)}') + print(f' Has solution: {fs_restored.solution is not None}') + if fs_restored.solution is not None: + print(f' Solution variables: {len(fs_restored.solution.data_vars)}') + + # Summary + print('\n' + '=' * 70) + print('Summary') + print('=' * 70) + for name, res in results.items(): + print(f' {name}: {res.mean_ms:.1f}ms (+/- {res.std_ms:.1f}ms)') + + return results + + +if __name__ == '__main__': + run_io_benchmarks() diff --git a/docs/notebooks/01-quickstart.ipynb b/docs/notebooks/01-quickstart.ipynb index 1500bce77..b21ffe86c 100644 --- a/docs/notebooks/01-quickstart.ipynb +++ b/docs/notebooks/01-quickstart.ipynb @@ -282,8 +282,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.11" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/02-heat-system.ipynb b/docs/notebooks/02-heat-system.ipynb index 15ef3a9d3..9d0a3b9d8 100644 --- a/docs/notebooks/02-heat-system.ipynb +++ b/docs/notebooks/02-heat-system.ipynb @@ -380,6 +380,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/03-investment-optimization.ipynb b/docs/notebooks/03-investment-optimization.ipynb index 85d4e0677..4c8667c07 100644 --- a/docs/notebooks/03-investment-optimization.ipynb +++ b/docs/notebooks/03-investment-optimization.ipynb @@ -429,6 +429,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/04-operational-constraints.ipynb b/docs/notebooks/04-operational-constraints.ipynb index b99a70649..c0a9f283a 100644 --- a/docs/notebooks/04-operational-constraints.ipynb +++ b/docs/notebooks/04-operational-constraints.ipynb @@ -472,6 +472,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/05-multi-carrier-system.ipynb b/docs/notebooks/05-multi-carrier-system.ipynb index c7ad8af24..076f1d3b5 100644 --- a/docs/notebooks/05-multi-carrier-system.ipynb +++ b/docs/notebooks/05-multi-carrier-system.ipynb @@ -541,8 +541,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.11" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, 
"nbformat": 4, diff --git a/docs/notebooks/06a-time-varying-parameters.ipynb b/docs/notebooks/06a-time-varying-parameters.ipynb index 138eaf50a..11850e3f4 100644 --- a/docs/notebooks/06a-time-varying-parameters.ipynb +++ b/docs/notebooks/06a-time-varying-parameters.ipynb @@ -308,7 +308,20 @@ ] } ], - "metadata": {}, + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, "nbformat": 4, "nbformat_minor": 5 } diff --git a/docs/notebooks/06b-piecewise-conversion.ipynb b/docs/notebooks/06b-piecewise-conversion.ipynb index aa0ab7a89..c02bc1da8 100644 --- a/docs/notebooks/06b-piecewise-conversion.ipynb +++ b/docs/notebooks/06b-piecewise-conversion.ipynb @@ -205,8 +205,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.12.7" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/06c-piecewise-effects.ipynb b/docs/notebooks/06c-piecewise-effects.ipynb index 3d7972b1c..81baa707a 100644 --- a/docs/notebooks/06c-piecewise-effects.ipynb +++ b/docs/notebooks/06c-piecewise-effects.ipynb @@ -312,8 +312,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.12.7" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/08a-aggregation.ipynb b/docs/notebooks/08a-aggregation.ipynb index ae61e3562..f0e512b76 100644 --- a/docs/notebooks/08a-aggregation.ipynb +++ b/docs/notebooks/08a-aggregation.ipynb @@ -388,6 +388,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/08c-clustering.ipynb b/docs/notebooks/08c-clustering.ipynb index 8f03fb335..d8949a028 100644 --- a/docs/notebooks/08c-clustering.ipynb +++ b/docs/notebooks/08c-clustering.ipynb @@ -121,7 +121,7 @@ "4. **Handles storage** with configurable behavior via `storage_mode`\n", "\n", "!!! warning \"Peak Forcing\"\n", - " Always use `time_series_for_high_peaks` to ensure extreme demand days are captured.\n", + " Always use `extremes=ExtremeConfig(max_value=[...])` to ensure extreme demand days are captured.\n", " Without this, clustering may miss peak periods, causing undersized components." 
] }, @@ -132,6 +132,8 @@ "metadata": {}, "outputs": [], "source": [ + "from tsam.config import ExtremeConfig\n", + "\n", "start = timeit.default_timer()\n", "\n", "# IMPORTANT: Force inclusion of peak demand periods!\n", @@ -141,7 +143,7 @@ "fs_clustered = flow_system.transform.cluster(\n", " n_clusters=8, # 8 typical days\n", " cluster_duration='1D', # Daily clustering\n", - " time_series_for_high_peaks=peak_series, # Capture peak demand day\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=peak_series), # Capture peak demand day\n", ")\n", "fs_clustered.name = 'Clustered (8 days)'\n", "\n", @@ -179,7 +181,7 @@ "outputs": [], "source": [ "# Access clustering metadata directly\n", - "clustering = fs_clustered.clustering\n", + "clustering = fs_clustered.clustering.results\n", "clustering" ] }, @@ -203,7 +205,7 @@ "source": [ "# Quality metrics - how well do the clusters represent the original data?\n", "# Lower RMSE/MAE = better representation\n", - "clustering.metrics.to_dataframe().style.format('{:.3f}')" + "fs_clustered.clustering.metrics.to_dataframe().style.format('{:.3f}')" ] }, { @@ -214,13 +216,111 @@ "outputs": [], "source": [ "# Visual comparison: original vs clustered time series\n", - "clustering.plot.compare()" + "fs_clustered.clustering.plot.compare()" ] }, { "cell_type": "markdown", "id": "15", "metadata": {}, + "source": [ + "## Inspect Clustering Input Data\n", + "\n", + "Before clustering, you can inspect which time-varying data will be used.\n", + "The `clustering_data()` method returns only the arrays that vary over time\n", + "(constant arrays are excluded since they don't affect clustering):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "# See what data will be used for clustering\n", + "clustering_data = flow_system.transform.clustering_data()\n", + "print(f'Variables used for clustering ({len(clustering_data.data_vars)} total):')\n", + "for var in clustering_data.data_vars:\n", + " print(f' - {var}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize the time-varying data (select a few key variables)\n", + "key_vars = [v for v in clustering_data.data_vars if 'fixed_relative_profile' in v or 'effects_per_flow_hour' in v]\n", + "clustering_data[key_vars].plotly.line(facet_row='variable', title='Time-Varying Data Used for Clustering')" + ] + }, + { + "cell_type": "markdown", + "id": "18", + "metadata": {}, + "source": [ + "## Selective Clustering with `data_vars`\n", + "\n", + "By default, clustering uses **all** time-varying data to determine typical periods.\n", + "However, you may want to cluster based on only a **subset** of variables while still\n", + "applying the clustering to all data.\n", + "\n", + "Use the `data_vars` parameter to specify which variables determine the clustering:\n", + "\n", + "- **Cluster based on subset**: Only the specified variables affect which days are grouped together\n", + "- **Apply to all data**: The resulting clustering is applied to ALL time-varying data\n", + "\n", + "This is useful when:\n", + "- You want to cluster based on demand patterns only (ignoring price variations)\n", + "- You have dominant time series that should drive the clustering\n", + "- You want to ensure certain patterns are well-represented in typical periods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": {}, + "outputs": [], + 
"source": [ + "# Cluster based ONLY on heat demand pattern (ignore electricity prices)\n", + "demand_var = 'HeatDemand(Q_th)|fixed_relative_profile'\n", + "\n", + "fs_demand_only = flow_system.transform.cluster(\n", + " n_clusters=8,\n", + " cluster_duration='1D',\n", + " data_vars=[demand_var], # Only this variable determines clustering\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=[demand_var]),\n", + ")\n", + "\n", + "# Verify: clustering was determined by demand but applied to all data\n", + "print(f'Clustered using: {demand_var}')\n", + "print(f'But all {len(clustering_data.data_vars)} variables are included in the result')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare metrics: clustering with all data vs. demand-only\n", + "pd.DataFrame(\n", + " {\n", + " 'All Variables': fs_clustered.clustering.metrics.to_dataframe().iloc[0],\n", + " 'Demand Only': fs_demand_only.clustering.metrics.to_dataframe().iloc[0],\n", + " }\n", + ").round(4)" + ] + }, + { + "cell_type": "markdown", + "id": "21", + "metadata": {}, "source": [ "## Advanced Clustering Options\n", "\n", @@ -230,15 +330,17 @@ { "cell_type": "code", "execution_count": null, - "id": "16", + "id": "22", "metadata": {}, "outputs": [], "source": [ + "from tsam.config import ClusterConfig\n", + "\n", "# Try different clustering algorithms\n", "fs_kmeans = flow_system.transform.cluster(\n", " n_clusters=8,\n", " cluster_duration='1D',\n", - " cluster_method='k_means', # Alternative: 'hierarchical' (default), 'k_medoids', 'averaging'\n", + " cluster=ClusterConfig(method='kmeans'), # Alternative: 'hierarchical' (default), 'kmedoids', 'averaging'\n", ")\n", "\n", "fs_kmeans.clustering" @@ -247,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "23", "metadata": {}, "outputs": [], "source": [ @@ -263,58 +365,43 @@ { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "24", "metadata": {}, "outputs": [], "source": [ "# Visualize cluster structure with heatmap\n", - "clustering.plot.heatmap()" + "fs_clustered.clustering.plot.heatmap()" ] }, { "cell_type": "markdown", - "id": "19", + "id": "25", "metadata": {}, "source": [ - "### Manual Cluster Assignment\n", + "### Apply Existing Clustering\n", "\n", "When comparing design variants or performing sensitivity analysis, you often want to\n", "use the **same cluster structure** across different FlowSystem configurations.\n", - "Use `predef_cluster_order` to ensure comparable results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20", - "metadata": {}, - "outputs": [], - "source": [ - "# Save the cluster order from our optimized system\n", - "cluster_order = fs_clustered.clustering.cluster_order.values\n", + "Use `apply_clustering()` to reuse a clustering from another FlowSystem:\n", "\n", - "# Now modify the FlowSystem (e.g., increase storage capacity limits)\n", - "flow_system_modified = flow_system.copy()\n", - "flow_system_modified.components['Storage'].capacity_in_flow_hours.maximum_size = 2000 # Larger storage option\n", + "```python\n", + "# First, create a reference clustering\n", + "fs_reference = flow_system.transform.cluster(n_clusters=8, cluster_duration='1D')\n", "\n", - "# Cluster with the SAME cluster structure for fair comparison\n", - "fs_modified_clustered = flow_system_modified.transform.cluster(\n", - " n_clusters=8,\n", - " cluster_duration='1D',\n", - " predef_cluster_order=cluster_order, # 
Reuse cluster assignments\n", - ")\n", - "fs_modified_clustered.name = 'Modified (larger storage limit)'\n", + "# Modify the FlowSystem (e.g., different storage size)\n", + "flow_system_modified = flow_system.copy()\n", + "flow_system_modified.components['Storage'].capacity_in_flow_hours.maximum_size = 2000\n", "\n", - "# Optimize the modified system\n", - "fs_modified_clustered.optimize(solver)\n", + "# Apply the SAME clustering for fair comparison\n", + "fs_modified = flow_system_modified.transform.apply_clustering(fs_reference.clustering)\n", + "```\n", "\n", - "# Compare results using Comparison class\n", - "fx.Comparison([fs_clustered, fs_modified_clustered])" + "This ensures both systems use identical typical periods for fair comparison." ] }, { "cell_type": "markdown", - "id": "21", + "id": "26", "metadata": {}, "source": [ "## Method 3: Two-Stage Workflow (Recommended)\n", @@ -332,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "22", + "id": "27", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "23", + "id": "28", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +450,7 @@ }, { "cell_type": "markdown", - "id": "24", + "id": "29", "metadata": {}, "source": [ "## Compare Results" @@ -372,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25", + "id": "30", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +508,7 @@ }, { "cell_type": "markdown", - "id": "26", + "id": "31", "metadata": {}, "source": [ "## Expand Solution to Full Resolution\n", @@ -433,7 +520,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27", + "id": "32", "metadata": {}, "outputs": [], "source": [ @@ -444,7 +531,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28", + "id": "33", "metadata": {}, "outputs": [], "source": [ @@ -466,7 +553,7 @@ }, { "cell_type": "markdown", - "id": "29", + "id": "34", "metadata": {}, "source": [ "## Visualize Clustered Heat Balance" @@ -475,7 +562,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30", + "id": "35", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +572,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31", + "id": "36", "metadata": {}, "outputs": [], "source": [ @@ -494,7 +581,7 @@ }, { "cell_type": "markdown", - "id": "32", + "id": "37", "metadata": {}, "source": [ "## API Reference\n", @@ -505,16 +592,25 @@ "|-----------|------|---------|-------------|\n", "| `n_clusters` | `int` | - | Number of typical periods (e.g., 8 typical days) |\n", "| `cluster_duration` | `str \\| float` | - | Duration per cluster ('1D', '24h') or hours |\n", + "| `data_vars` | `list[str]` | None | Variables to cluster on (applies result to all) |\n", "| `weights` | `dict[str, float]` | None | Optional weights for time series in clustering |\n", - "| `time_series_for_high_peaks` | `list[str]` | None | **Essential**: Force inclusion of peak periods |\n", - "| `time_series_for_low_peaks` | `list[str]` | None | Force inclusion of minimum periods |\n", - "| `cluster_method` | `str` | 'hierarchical' | Algorithm: 'hierarchical', 'k_means', 'k_medoids', 'k_maxoids', 'averaging' |\n", - "| `representation_method` | `str` | 'medoidRepresentation' | 'medoidRepresentation', 'meanRepresentation', 'distributionAndMinMaxRepresentation' |\n", - "| `extreme_period_method` | `str \\| None` | None | How peaks are integrated: None, 'append', 'new_cluster_center', 'replace_cluster_center' |\n", - "| `rescale_cluster_periods` | `bool` | True | 
Rescale clusters to match original means |\n", - "| `predef_cluster_order` | `array` | None | Manual cluster assignments |\n", + "| `cluster` | `ClusterConfig` | None | Clustering algorithm configuration |\n", + "| `extremes` | `ExtremeConfig` | None | **Essential**: Force inclusion of peak/min periods |\n", "| `**tsam_kwargs` | - | - | Additional tsam parameters |\n", "\n", + "### `transform.clustering_data()` Method\n", + "\n", + "Inspect which time-varying data will be used for clustering:\n", + "\n", + "```python\n", + "# Get all time-varying variables\n", + "clustering_data = flow_system.transform.clustering_data()\n", + "print(list(clustering_data.data_vars))\n", + "\n", + "# Get data for a specific period (multi-period systems)\n", + "clustering_data = flow_system.transform.clustering_data(period=2024)\n", + "```\n", + "\n", "### Clustering Object Properties\n", "\n", "After clustering, access metadata via `fs.clustering`:\n", @@ -524,12 +620,30 @@ "| `n_clusters` | Number of clusters |\n", "| `n_original_clusters` | Number of original time segments (e.g., 365 days) |\n", "| `timesteps_per_cluster` | Timesteps in each cluster (e.g., 24 for daily) |\n", - "| `cluster_order` | xr.DataArray mapping original segment → cluster ID |\n", - "| `occurrences` | How many original segments each cluster represents |\n", + "| `cluster_assignments` | xr.DataArray mapping original segment → cluster ID |\n", + "| `cluster_occurrences` | How many original segments each cluster represents |\n", "| `metrics` | xr.Dataset with RMSE, MAE per time series |\n", + "| `results` | `ClusteringResults` with xarray-like interface |\n", "| `plot.compare()` | Compare original vs clustered time series |\n", "| `plot.heatmap()` | Visualize cluster structure |\n", "\n", + "### ClusteringResults (xarray-like)\n", + "\n", + "Access the underlying tsam results via `clustering.results`:\n", + "\n", + "```python\n", + "# Dimension info (like xarray)\n", + "clustering.results.dims # ('period', 'scenario') or ()\n", + "clustering.results.coords # {'period': [2020, 2030], 'scenario': ['high', 'low']}\n", + "\n", + "# Select specific result (like xarray)\n", + "clustering.results.sel(period=2020, scenario='high') # Label-based\n", + "clustering.results.isel(period=0, scenario=1) # Index-based\n", + "\n", + "# Apply existing clustering to new data\n", + "agg_results = clustering.results.apply(dataset) # Returns AggregationResults\n", + "```\n", + "\n", "### Storage Behavior\n", "\n", "Each `Storage` component has a `cluster_mode` parameter:\n", @@ -543,20 +657,27 @@ "\n", "For a detailed comparison of storage modes, see [08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb).\n", "\n", - "### Peak Forcing Format\n", + "### Peak Forcing with ExtremeConfig\n", "\n", "```python\n", - "time_series_for_high_peaks = ['ComponentName(FlowName)|fixed_relative_profile']\n", + "from tsam.config import ExtremeConfig\n", + "\n", + "extremes = ExtremeConfig(\n", + " method='new_cluster', # Creates new cluster for extremes\n", + " max_value=['ComponentName(FlowName)|fixed_relative_profile'], # Capture peak demand\n", + ")\n", "```\n", "\n", "### Recommended Workflow\n", "\n", "```python\n", + "from tsam.config import ExtremeConfig\n", + "\n", "# Stage 1: Fast sizing\n", "fs_sizing = flow_system.transform.cluster(\n", " n_clusters=8,\n", " cluster_duration='1D',\n", - " time_series_for_high_peaks=['Demand(Flow)|fixed_relative_profile'],\n", + " extremes=ExtremeConfig(method='new_cluster', 
max_value=['Demand(Flow)|fixed_relative_profile']),\n", ")\n", "fs_sizing.optimize(solver)\n", "\n", @@ -571,7 +692,7 @@ }, { "cell_type": "markdown", - "id": "33", + "id": "38", "metadata": {}, "source": [ "## Summary\n", @@ -579,21 +700,25 @@ "You learned how to:\n", "\n", "- Use **`cluster()`** to reduce time series into typical periods\n", - "- Apply **peak forcing** to capture extreme demand days\n", + "- **Inspect clustering data** with `clustering_data()` before clustering\n", + "- Use **`data_vars`** to cluster based on specific variables only\n", + "- Apply **peak forcing** with `ExtremeConfig` to capture extreme demand days\n", "- Use **two-stage optimization** for fast yet accurate investment decisions\n", "- **Expand solutions** back to full resolution with `expand()`\n", - "- Access **clustering metadata** via `fs.clustering` (metrics, cluster_order, occurrences)\n", - "- Use **advanced options** like different algorithms\n", - "- **Manually assign clusters** using `predef_cluster_order`\n", + "- Access **clustering metadata** via `fs.clustering` (metrics, cluster_assignments, cluster_occurrences)\n", + "- Use **advanced options** like different algorithms with `ClusterConfig`\n", + "- **Apply existing clustering** to other FlowSystems using `apply_clustering()`\n", "\n", "### Key Takeaways\n", "\n", - "1. **Always use peak forcing** (`time_series_for_high_peaks`) for demand time series\n", - "2. **Add safety margin** (5-10%) when fixing sizes from clustering\n", - "3. **Two-stage is recommended**: clustering for sizing, full resolution for dispatch\n", - "4. **Storage handling** is configurable via `cluster_mode`\n", - "5. **Check metrics** to evaluate clustering quality\n", - "6. **Use `predef_cluster_order`** to reproduce or define custom cluster assignments\n", + "1. **Always use peak forcing** (`extremes=ExtremeConfig(max_value=[...])`) for demand time series\n", + "2. **Inspect data first** with `clustering_data()` to see available variables\n", + "3. **Use `data_vars`** to cluster on specific variables (e.g., demand only, ignoring prices)\n", + "4. **Add safety margin** (5-10%) when fixing sizes from clustering\n", + "5. **Two-stage is recommended**: clustering for sizing, full resolution for dispatch\n", + "6. **Storage handling** is configurable via `cluster_mode`\n", + "7. **Check metrics** to evaluate clustering quality\n", + "8. 
**Use `apply_clustering()`** to apply the same clustering to different FlowSystem variants\n", "\n", "### Next Steps\n", "\n", @@ -602,7 +727,25 @@ ] } ], - "metadata": {}, + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, "nbformat": 4, "nbformat_minor": 5 } diff --git a/docs/notebooks/08c2-clustering-storage-modes.ipynb b/docs/notebooks/08c2-clustering-storage-modes.ipynb index 66d84fb5c..ab223410b 100644 --- a/docs/notebooks/08c2-clustering-storage-modes.ipynb +++ b/docs/notebooks/08c2-clustering-storage-modes.ipynb @@ -171,6 +171,8 @@ "metadata": {}, "outputs": [], "source": [ + "from tsam.config import ExtremeConfig\n", + "\n", "# Clustering parameters\n", "N_CLUSTERS = 24 # 24 typical days for a full year\n", "CLUSTER_DURATION = '1D'\n", @@ -193,7 +195,7 @@ " fs_clustered = fs_copy.transform.cluster(\n", " n_clusters=N_CLUSTERS,\n", " cluster_duration=CLUSTER_DURATION,\n", - " time_series_for_high_peaks=PEAK_SERIES,\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=PEAK_SERIES),\n", " )\n", " time_cluster = timeit.default_timer() - start\n", "\n", diff --git a/docs/notebooks/08d-clustering-multiperiod.ipynb b/docs/notebooks/08d-clustering-multiperiod.ipynb index 3f229d5f2..b1e89ffac 100644 --- a/docs/notebooks/08d-clustering-multiperiod.ipynb +++ b/docs/notebooks/08d-clustering-multiperiod.ipynb @@ -173,6 +173,8 @@ "metadata": {}, "outputs": [], "source": [ + "from tsam.config import ExtremeConfig\n", + "\n", "start = timeit.default_timer()\n", "\n", "# Force inclusion of peak demand periods\n", @@ -182,7 +184,7 @@ "fs_clustered = flow_system.transform.cluster(\n", " n_clusters=3,\n", " cluster_duration='1D',\n", - " time_series_for_high_peaks=peak_series,\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=peak_series),\n", ")\n", "\n", "time_clustering = timeit.default_timer() - start\n", @@ -228,17 +230,6 @@ "id": "13", "metadata": {}, "outputs": [], - "source": [ - "# Compare original vs aggregated data - automatically faceted by period and scenario\n", - "fs_clustered.clustering.plot.compare(variables='Building(Heat)|fixed_relative_profile')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": {}, - "outputs": [], "source": [ "# Duration curves show how well the distribution is preserved per period/scenario\n", "fs_clustered.clustering.plot.compare(\n", @@ -249,7 +240,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "14", "metadata": {}, "outputs": [], "source": [ @@ -259,7 +250,7 @@ }, { "cell_type": "markdown", - "id": "16", + "id": "15", "metadata": {}, "source": [ "## Understand the Cluster Structure\n", @@ -270,27 +261,31 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "16", "metadata": {}, "outputs": [], "source": [ - "info = fs_clustered.clustering\n", - "cs = info.result.cluster_structure\n", + "clustering = fs_clustered.clustering\n", "\n", "print('Clustering Configuration:')\n", - "print(f' Typical periods (clusters): {cs.n_clusters}')\n", - "print(f' Timesteps per cluster: {cs.timesteps_per_cluster}')\n", + "print(f' Typical periods (clusters): {clustering.n_clusters}')\n", + "print(f' Timesteps per cluster: 
{clustering.timesteps_per_cluster}')\n", + "\n", + "# Access underlying results via xarray-like interface\n", + "print(f'\\nClusteringResults dimensions: {clustering.results.dims}')\n", + "print(f'ClusteringResults coords: {clustering.results.coords}')\n", "\n", - "# The cluster_order shows which cluster each original day belongs to\n", - "cluster_order = cs.cluster_order.values\n", + "# The cluster_assignments shows which cluster each original day belongs to\n", + "# For multi-period systems, select a specific period/scenario combination\n", + "cluster_assignments = clustering.cluster_assignments.isel(period=0, scenario=0).values\n", "day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n", "\n", - "print('\\nCluster assignments per day:')\n", - "for i, cluster_id in enumerate(cluster_order):\n", + "print('\\nCluster assignments per day (period=2024, scenario=High):')\n", + "for i, cluster_id in enumerate(cluster_assignments):\n", " print(f' {day_names[i]}: Cluster {cluster_id}')\n", "\n", "# Cluster occurrences (how many original days each cluster represents)\n", - "unique, counts = np.unique(cluster_order, return_counts=True)\n", + "unique, counts = np.unique(cluster_assignments, return_counts=True)\n", "print('\\nCluster weights (days represented):')\n", "for cluster_id, count in zip(unique, counts, strict=True):\n", " print(f' Cluster {cluster_id}: {count} days')" @@ -298,7 +293,7 @@ }, { "cell_type": "markdown", - "id": "18", + "id": "17", "metadata": {}, "source": [ "## Two-Stage Workflow for Multi-Period\n", @@ -324,7 +319,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "18", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +340,7 @@ { "cell_type": "code", "execution_count": null, - "id": "20", + "id": "19", "metadata": {}, "outputs": [], "source": [ @@ -370,7 +365,7 @@ }, { "cell_type": "markdown", - "id": "21", + "id": "20", "metadata": {}, "source": [ "## Compare Results Across Methods" @@ -379,7 +374,7 @@ { "cell_type": "code", "execution_count": null, - "id": "22", + "id": "21", "metadata": {}, "outputs": [], "source": [ @@ -424,7 +419,7 @@ }, { "cell_type": "markdown", - "id": "23", + "id": "22", "metadata": {}, "source": [ "## Visualize Optimization Results\n", @@ -435,7 +430,7 @@ { "cell_type": "code", "execution_count": null, - "id": "24", + "id": "23", "metadata": {}, "outputs": [], "source": [ @@ -446,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25", + "id": "24", "metadata": {}, "outputs": [], "source": [ @@ -457,7 +452,7 @@ }, { "cell_type": "markdown", - "id": "26", + "id": "25", "metadata": {}, "source": [ "## Expand Clustered Solution to Full Resolution\n", @@ -468,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27", + "id": "26", "metadata": {}, "outputs": [], "source": [ @@ -482,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28", + "id": "27", "metadata": {}, "outputs": [], "source": [ @@ -492,7 +487,7 @@ }, { "cell_type": "markdown", - "id": "29", + "id": "28", "metadata": {}, "source": [ "## Key Considerations for Multi-Period Clustering\n", @@ -526,7 +521,7 @@ }, { "cell_type": "markdown", - "id": "30", + "id": "29", "metadata": {}, "source": [ "## Summary\n", @@ -551,6 +546,8 @@ "### API Reference\n", "\n", "```python\n", + "from tsam.config import ExtremeConfig\n", + "\n", "# Load multi-period system\n", "fs = fx.FlowSystem.from_netcdf('multiperiod_system.nc4')\n", "\n", @@ -561,13 +558,19 @@ "fs_clustered = fs.transform.cluster(\n", " n_clusters=10,\n", 
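+ "    # cluster_duration: offset string ('1D', '24h') or a number of hours\n",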
" cluster_duration='1D',\n", - " time_series_for_high_peaks=['Demand(Flow)|fixed_relative_profile'],\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=['Demand(Flow)|fixed_relative_profile']),\n", ")\n", "\n", "# Visualize clustering quality\n", "fs_clustered.clustering.plot.compare(variable='Demand(Flow)|profile')\n", "fs_clustered.clustering.plot.heatmap()\n", "\n", + "# Access underlying results (xarray-like interface)\n", + "fs_clustered.clustering.results.dims # ('period', 'scenario')\n", + "fs_clustered.clustering.results.coords # {'period': [...], 'scenario': [...]}\n", + "fs_clustered.clustering.results.sel(period=2024, scenario='High') # Label-based\n", + "fs_clustered.clustering.results.isel(period=0, scenario=0) # Index-based\n", + "\n", "# Two-stage workflow\n", "fs_clustered.optimize(solver)\n", "sizes = {k: v.max().item() * 1.10 for k, v in fs_clustered.statistics.sizes.items()}\n", @@ -587,8 +590,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.11" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/08e-clustering-internals.ipynb b/docs/notebooks/08e-clustering-internals.ipynb index b685a02d1..6f6ad528d 100644 --- a/docs/notebooks/08e-clustering-internals.ipynb +++ b/docs/notebooks/08e-clustering-internals.ipynb @@ -11,9 +11,10 @@ "\n", "This notebook demonstrates:\n", "\n", - "- **Data structures**: `Clustering`, `ClusterResult`, and `ClusterStructure`\n", + "- **Data structure**: The `Clustering` class that stores all clustering information\n", "- **Plot accessor**: Built-in visualizations via `.plot`\n", "- **Data expansion**: Using `expand_data()` to map aggregated data back to original timesteps\n", + "- **IO workflow**: What's preserved and lost when saving/loading clustered systems\n", "\n", "!!! note \"Prerequisites\"\n", " This notebook assumes familiarity with [08c-clustering](08c-clustering.ipynb)." 
@@ -53,10 +54,12 @@ "metadata": {}, "outputs": [], "source": [ + "from tsam.config import ExtremeConfig\n", + "\n", "fs_clustered = flow_system.transform.cluster(\n", " n_clusters=8,\n", " cluster_duration='1D',\n", - " time_series_for_high_peaks=['HeatDemand(Q_th)|fixed_relative_profile'],\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q_th)|fixed_relative_profile']),\n", ")\n", "\n", "fs_clustered.clustering" @@ -67,9 +70,12 @@ "id": "4", "metadata": {}, "source": [ - "The `Clustering` contains:\n", - "- **`result`**: A `ClusterResult` with timestep mapping and weights\n", - "- **`result.cluster_structure`**: A `ClusterStructure` with cluster assignments" + "The `Clustering` object contains:\n", + "- **`cluster_assignments`**: Which cluster each original period maps to\n", + "- **`cluster_occurrences`**: How many original periods each cluster represents\n", + "- **`timestep_mapping`**: Maps each original timestep to its representative\n", + "- **`original_data`** / **`aggregated_data`**: The data before and after clustering\n", + "- **`results`**: `ClusteringResults` object with xarray-like interface (`.dims`, `.coords`, `.sel()`)" ] }, { @@ -79,7 +85,8 @@ "metadata": {}, "outputs": [], "source": [ - "fs_clustered.clustering.result" + "# Cluster order shows which cluster each original period maps to\n", + "fs_clustered.clustering.cluster_assignments" ] }, { @@ -89,7 +96,8 @@ "metadata": {}, "outputs": [], "source": [ - "fs_clustered.clustering.result.cluster_structure" + "# Cluster occurrences shows how many original periods each cluster represents\n", + "fs_clustered.clustering.cluster_occurrences" ] }, { @@ -187,7 +195,7 @@ "source": [ "## Expanding Aggregated Data\n", "\n", - "The `ClusterResult.expand_data()` method maps aggregated data back to original timesteps.\n", + "The `Clustering.expand_data()` method maps aggregated data back to original timesteps.\n", "This is useful for comparing clustering results before optimization:" ] }, @@ -199,12 +207,12 @@ "outputs": [], "source": [ "# Get original and aggregated data\n", - "result = fs_clustered.clustering.result\n", - "original = result.original_data['HeatDemand(Q_th)|fixed_relative_profile']\n", - "aggregated = result.aggregated_data['HeatDemand(Q_th)|fixed_relative_profile']\n", + "clustering = fs_clustered.clustering\n", + "original = clustering.original_data['HeatDemand(Q_th)|fixed_relative_profile']\n", + "aggregated = clustering.aggregated_data['HeatDemand(Q_th)|fixed_relative_profile']\n", "\n", "# Expand aggregated data back to original timesteps\n", - "expanded = result.expand_data(aggregated)\n", + "expanded = clustering.expand_data(aggregated)\n", "\n", "print(f'Original: {len(original.time)} timesteps')\n", "print(f'Aggregated: {len(aggregated.time)} timesteps')\n", @@ -218,11 +226,30 @@ "source": [ "## Summary\n", "\n", - "| Class | Purpose |\n", - "|-------|--------|\n", - "| `Clustering` | Stored on `fs.clustering` after `cluster()` |\n", - "| `ClusterResult` | Contains timestep mapping, weights, and `expand_data()` method |\n", - "| `ClusterStructure` | Maps original periods to clusters |\n", + "| Property | Description |\n", + "|----------|-------------|\n", + "| `clustering.n_clusters` | Number of representative clusters |\n", + "| `clustering.timesteps_per_cluster` | Timesteps in each cluster period |\n", + "| `clustering.cluster_assignments` | Maps original periods to clusters |\n", + "| `clustering.cluster_occurrences` | Count of original periods per cluster |\n", + "| 
`clustering.timestep_mapping` | Maps original timesteps to representative indices |\n", + "| `clustering.original_data` | Dataset before clustering |\n", + "| `clustering.aggregated_data` | Dataset after clustering |\n", + "| `clustering.results` | `ClusteringResults` with xarray-like interface |\n", + "\n", + "### ClusteringResults (xarray-like)\n", + "\n", + "Access the underlying tsam results via `clustering.results`:\n", + "\n", + "```python\n", + "# Dimension info (like xarray)\n", + "clustering.results.dims # ('period', 'scenario') or ()\n", + "clustering.results.coords # {'period': [2020, 2030], 'scenario': ['high', 'low']}\n", + "\n", + "# Select specific result (like xarray)\n", + "clustering.results.sel(period=2020, scenario='high') # Label-based\n", + "clustering.results.isel(period=0, scenario=1) # Index-based\n", + "```\n", "\n", "### Plot Accessor Methods\n", "\n", @@ -250,8 +277,7 @@ "clustering.plot.heatmap()\n", "\n", "# Expand aggregated data to original timesteps\n", - "result = clustering.result\n", - "expanded = result.expand_data(aggregated_data)\n", + "expanded = clustering.expand_data(aggregated_data)\n", "```" ] }, @@ -306,6 +332,181 @@ "print(f'Clustered: {len(fs_clustered.timesteps)} timesteps')\n", "print(f'Expanded: {len(fs_expanded.timesteps)} timesteps')" ] + }, + { + "cell_type": "markdown", + "id": "21", + "metadata": {}, + "source": [ + "## IO Workflow\n", + "\n", + "When saving and loading a clustered FlowSystem, most clustering information is preserved.\n", + "However, some methods that access tsam's internal `AggregationResult` objects are not available after IO.\n", + "\n", + "### What's Preserved After IO\n", + "\n", + "- **Structure**: `n_clusters`, `timesteps_per_cluster`, `dims`, `coords`\n", + "- **Mappings**: `cluster_assignments`, `cluster_occurrences`, `timestep_mapping`\n", + "- **Data**: `original_data`, `aggregated_data`\n", + "- **Original timesteps**: `original_timesteps`\n", + "- **Results structure**: `results.sel()`, `results.isel()` for `ClusteringResult` access\n", + "\n", + "### What's Lost After IO\n", + "\n", + "- **`clustering.sel()`**: Accessing full `AggregationResult` objects\n", + "- **`clustering.items()`**: Iterating over `AggregationResult` objects\n", + "- **tsam internals**: `AggregationResult.accuracy`, `AggregationResult.plot`, etc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22", + "metadata": {}, + "outputs": [], + "source": [ + "# Before IO: Full tsam access is available\n", + "result = fs_clustered.clustering.sel() # Get the AggregationResult\n", + "print(f'Before IO - AggregationResult available: {type(result).__name__}')\n", + "print(f' - n_clusters: {result.n_clusters}')\n", + "print(f' - accuracy.rmse (mean): {result.accuracy.rmse.mean():.4f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23", + "metadata": {}, + "outputs": [], + "source": [ + "# Save and load the clustered system\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "try:\n", + " with tempfile.TemporaryDirectory() as tmpdir:\n", + " path = Path(tmpdir) / 'clustered_system.nc'\n", + " fs_clustered.to_netcdf(path)\n", + " fs_loaded = fx.FlowSystem.from_netcdf(path)\n", + "\n", + " # Structure is preserved\n", + " print('After IO - Structure preserved:')\n", + " print(f' - n_clusters: {fs_loaded.clustering.n_clusters}')\n", + " print(f' - dims: {fs_loaded.clustering.dims}')\n", + " print(f' - original_data variables: {list(fs_loaded.clustering.original_data.data_vars)[:3]}...')\n", + "except OSError as e:\n", + " print(f'Note: NetCDF save/load skipped due to environment issue: {type(e).__name__}')\n", + " print('This can happen in some CI environments. The functionality works locally.')\n", + " fs_loaded = fs_clustered # Use original for subsequent cells" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24", + "metadata": {}, + "outputs": [], + "source": [ + "# After IO: sel() raises ValueError because AggregationResult is not preserved\n", + "try:\n", + " fs_loaded.clustering.sel()\n", + "except ValueError as e:\n", + " print('After IO - sel() raises ValueError:')\n", + " print(f' \"{e}\"')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25", + "metadata": {}, + "outputs": [], + "source": [ + "# Key operations still work after IO:\n", + "# - Optimization\n", + "# - Expansion back to full resolution\n", + "# - Accessing original_data and aggregated_data\n", + "\n", + "fs_loaded.optimize(solver)\n", + "fs_loaded_expanded = fs_loaded.transform.expand()\n", + "\n", + "print('Loaded system can still be:')\n", + "print(f' - Optimized: {fs_loaded.solution is not None}')\n", + "print(f' - Expanded: {len(fs_loaded_expanded.timesteps)} timesteps')" + ] + }, + { + "cell_type": "markdown", + "id": "26", + "metadata": {}, + "source": [ + "### IO Workflow Summary\n", + "\n", + "```\n", + "┌─────────────────┐ to_netcdf() ┌─────────────────┐\n", + "│ fs_clustered │ ─────────────────► │ NetCDF file │\n", + "│ │ │ │\n", + "│ ✓ clustering │ │ ✓ structure │\n", + "│ ✓ sel() │ │ ✓ mappings │\n", + "│ ✓ items() │ │ ✓ data │\n", + "│ ✓ AggregationResult │ ✗ AggregationResult\n", + "└─────────────────┘ └─────────────────┘\n", + " │\n", + " │ from_netcdf()\n", + " ▼\n", + " ┌─────────────────┐\n", + " │ fs_loaded │\n", + " │ │\n", + " │ ✓ optimize() │\n", + " │ ✓ expand() │\n", + " │ ✓ original_data │\n", + " │ ✗ sel() │\n", + " │ ✗ items() │\n", + " └─────────────────┘\n", + "```\n", + "\n", + "!!! tip \"Best Practice\"\n", + " If you need tsam's `AggregationResult` for analysis (accuracy metrics, built-in plots),\n", + " do this **before** saving the FlowSystem. After loading, the core workflow\n", + " (optimize → expand) works normally." 
+ ] + }, + { + "cell_type": "markdown", + "id": "27", + "metadata": {}, + "source": [ + "### Reducing File Size\n", + "\n", + "For smaller files (~38% reduction), use `include_original_data=False` when saving.\n", + "This disables `plot.compare()` after loading, but the core workflow still works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare file sizes with and without original_data\n", + "try:\n", + " with tempfile.TemporaryDirectory() as tmpdir:\n", + " path_full = Path(tmpdir) / 'full.nc'\n", + " path_small = Path(tmpdir) / 'small.nc'\n", + "\n", + " fs_clustered.to_netcdf(path_full, include_original_data=True)\n", + " fs_clustered.to_netcdf(path_small, include_original_data=False)\n", + "\n", + " size_full = path_full.stat().st_size / 1024\n", + " size_small = path_small.stat().st_size / 1024\n", + "\n", + " print(f'With original_data: {size_full:.1f} KB')\n", + " print(f'Without original_data: {size_small:.1f} KB')\n", + " print(f'Size reduction: {(1 - size_small / size_full) * 100:.0f}%')\n", + "except OSError as e:\n", + " print(f'Note: File size comparison skipped due to environment issue: {type(e).__name__}')" + ] } ], "metadata": { @@ -313,6 +514,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/08f-clustering-segmentation.ipynb b/docs/notebooks/08f-clustering-segmentation.ipynb new file mode 100644 index 000000000..ed21c4b13 --- /dev/null +++ b/docs/notebooks/08f-clustering-segmentation.ipynb @@ -0,0 +1,646 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# Intra-Period Segmentation with `cluster()`\n", + "\n", + "Reduce timesteps within each typical period using segmentation.\n", + "\n", + "This notebook demonstrates:\n", + "\n", + "- **Segmentation**: Aggregate timesteps within each cluster into fewer segments\n", + "- **Variable durations**: Each segment can have different duration (hours)\n", + "- **Combined reduction**: Use clustering AND segmentation for maximum speedup\n", + "- **Expansion**: Map segmented results back to original timesteps\n", + "\n", + "!!! note \"Requirements\"\n", + " This notebook requires the `tsam` package: `pip install tsam`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "import timeit\n", + "\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "\n", + "import flixopt as fx\n", + "\n", + "fx.CONFIG.notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "2", + "metadata": {}, + "source": [ + "## What is Segmentation?\n", + "\n", + "**Clustering** groups similar time periods (e.g., days) into representative clusters.\n", + "\n", + "**Segmentation** goes further by aggregating timesteps *within* each cluster into fewer segments with variable durations.\n", + "\n", + "```\n", + "Original: | Day 1 (24h) | Day 2 (24h) | Day 3 (24h) | ... 
| Day 365 (24h) |\n", + " ↓ ↓ ↓ ↓\n", + "Clustered: | Typical Day A (24h) | Typical Day B (24h) | Typical Day C (24h) |\n", + " ↓ ↓ ↓\n", + "Segmented: | Seg1 (4h) | Seg2 (8h) | Seg3 (8h) | Seg4 (4h) | (per typical day)\n", + "```\n", + "\n", + "This can dramatically reduce problem size:\n", + "- **Original**: 365 days × 24 hours = 8,760 timesteps\n", + "- **Clustered (8 days)**: 8 × 24 = 192 timesteps\n", + "- **Segmented (6 segments)**: 8 × 6 = 48 timesteps" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "## Create the FlowSystem\n", + "\n", + "We use a district heating system with one month of data at 15-min resolution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "from data.generate_example_systems import create_district_heating_system\n", + "\n", + "flow_system = create_district_heating_system()\n", + "flow_system.connect_and_transform()\n", + "\n", + "print(f'Timesteps: {len(flow_system.timesteps)}')\n", + "print(f'Duration: {(flow_system.timesteps[-1] - flow_system.timesteps[0]).days + 1} days')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize input data\n", + "heat_demand = flow_system.components['HeatDemand'].inputs[0].fixed_relative_profile\n", + "heat_demand.plotly.line(title='Heat Demand Profile')" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": {}, + "source": [ + "## Full Optimization (Baseline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, + "outputs": [], + "source": [ + "solver = fx.solvers.HighsSolver(mip_gap=0.01)\n", + "\n", + "start = timeit.default_timer()\n", + "fs_full = flow_system.copy()\n", + "fs_full.name = 'Full Optimization'\n", + "fs_full.optimize(solver)\n", + "time_full = timeit.default_timer() - start\n", + "\n", + "print(f'Full optimization: {time_full:.2f} seconds')\n", + "print(f'Total cost: {fs_full.solution[\"costs\"].item():,.0f} €')" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": {}, + "source": [ + "## Clustering with Segmentation\n", + "\n", + "Use `SegmentConfig` to enable intra-period segmentation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "from tsam.config import ExtremeConfig, SegmentConfig\n", + "\n", + "start = timeit.default_timer()\n", + "\n", + "# Cluster into 8 typical days with 6 segments each\n", + "fs_segmented = flow_system.transform.cluster(\n", + " n_clusters=8,\n", + " cluster_duration='1D',\n", + " segments=SegmentConfig(n_segments=6), # 6 segments per day instead of 96 quarter-hours\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q_th)|fixed_relative_profile']),\n", + ")\n", + "\n", + "time_clustering = timeit.default_timer() - start\n", + "\n", + "print(f'Clustering time: {time_clustering:.2f} seconds')\n", + "print(f'Original timesteps: {len(flow_system.timesteps)}')\n", + "print(\n", + " f'Segmented timesteps: {len(fs_segmented.timesteps)} × {len(fs_segmented.clusters)} clusters = {len(fs_segmented.timesteps) * len(fs_segmented.clusters)}'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": {}, + "source": [ + "## Understanding Segmentation Properties\n", + "\n", + "After segmentation, the clustering object has additional properties:" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "clustering = fs_segmented.clustering\n", + "\n", + "print('Segmentation Properties:')\n", + "print(f' is_segmented: {clustering.is_segmented}')\n", + "print(f' n_segments: {clustering.n_segments}')\n", + "print(f' n_clusters: {clustering.n_clusters}')\n", + "print(f' timesteps_per_cluster (original): {clustering.timesteps_per_cluster}')\n", + "print(f'\\nTime dimension uses RangeIndex: {type(fs_segmented.timesteps)}')" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": {}, + "source": [ + "## Variable Timestep Durations\n", + "\n", + "Each segment has a different duration, determined by how many original timesteps it represents:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": {}, + "outputs": [], + "source": [ + "# Timestep duration is now a DataArray with (cluster, time) dimensions\n", + "timestep_duration = fs_segmented.timestep_duration\n", + "\n", + "print(f'Timestep duration shape: {dict(timestep_duration.sizes)}')\n", + "print('\\nSegment durations for cluster 0:')\n", + "cluster_0_durations = timestep_duration.sel(cluster=0).values\n", + "for i, dur in enumerate(cluster_0_durations):\n", + " print(f' Segment {i}: {dur:.2f} hours')\n", + "print(f' Total: {cluster_0_durations.sum():.2f} hours (should be 24h)')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize segment durations across clusters\n", + "duration_df = timestep_duration.to_dataframe('duration').reset_index()\n", + "fig = px.bar(\n", + " duration_df,\n", + " x='time',\n", + " y='duration',\n", + " facet_col='cluster',\n", + " facet_col_wrap=4,\n", + " title='Segment Durations by Cluster',\n", + " labels={'time': 'Segment', 'duration': 'Duration [hours]'},\n", + ")\n", + "fig.update_layout(height=400)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, + "source": [ + "## Optimize the Segmented System" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "start = timeit.default_timer()\n", + "fs_segmented.optimize(solver)\n", + "time_segmented = timeit.default_timer() - start\n", + "\n", + "print(f'Segmented optimization: {time_segmented:.2f} seconds')\n", + "print(f'Total cost: {fs_segmented.solution[\"costs\"].item():,.0f} €')\n", + "print(f'\\nSpeedup vs full: {time_full / (time_clustering + time_segmented):.1f}x')" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": {}, + "source": [ + "## Compare Clustering Quality\n", + "\n", + "View how well the segmented data represents the original:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": {}, + "outputs": [], + "source": [ + "# Duration curves show how well the distribution is preserved\n", + "fs_segmented.clustering.plot.compare(kind='duration_curve')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": {}, + "outputs": [], + "source": [ + "# Clustering quality metrics\n", + "fs_segmented.clustering.metrics.to_dataframe().style.format('{:.3f}')" + ] + }, + { + "cell_type": "markdown", + "id": "20", + "metadata": {}, + "source": [ + "## Expand to Original Timesteps\n", + "\n", + "Use `expand()` to map the segmented solution back to all original timesteps:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21", + "metadata": 
{}, + "outputs": [], + "source": [ + "start = timeit.default_timer()\n", + "fs_expanded = fs_segmented.transform.expand()\n", + "time_expand = timeit.default_timer() - start\n", + "\n", + "print(f'Expansion time: {time_expand:.3f} seconds')\n", + "print(f'Expanded timesteps: {len(fs_expanded.timesteps)}')\n", + "print(f'Objective preserved: {fs_expanded.solution[\"costs\"].item():,.0f} €')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare flow rates: Full vs Expanded\n", + "import xarray as xr\n", + "\n", + "flow_var = 'CHP(Q_th)|flow_rate'\n", + "comparison_ds = xr.concat(\n", + " [fs_full.solution[flow_var], fs_expanded.solution[flow_var]],\n", + " dim=pd.Index(['Full', 'Expanded'], name='method'),\n", + ")\n", + "comparison_ds.plotly.line(color='method', title='CHP Heat Output Comparison')" + ] + }, + { + "cell_type": "markdown", + "id": "23", + "metadata": {}, + "source": [ + "## Two-Stage Workflow with Segmentation\n", + "\n", + "For investment optimization, use segmentation for fast sizing, then dispatch at full resolution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24", + "metadata": {}, + "outputs": [], + "source": [ + "# Stage 1: Sizing with segmentation (already done)\n", + "SAFETY_MARGIN = 1.05\n", + "sizes_with_margin = {name: float(size.item()) * SAFETY_MARGIN for name, size in fs_segmented.statistics.sizes.items()}\n", + "\n", + "print('Optimized sizes with safety margin:')\n", + "for name, size in sizes_with_margin.items():\n", + " print(f' {name}: {size:.1f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25", + "metadata": {}, + "outputs": [], + "source": [ + "# Stage 2: Full resolution dispatch with fixed sizes\n", + "start = timeit.default_timer()\n", + "fs_dispatch = flow_system.transform.fix_sizes(sizes_with_margin)\n", + "fs_dispatch.name = 'Two-Stage'\n", + "fs_dispatch.optimize(solver)\n", + "time_dispatch = timeit.default_timer() - start\n", + "\n", + "print(f'Dispatch time: {time_dispatch:.2f} seconds')\n", + "print(f'Final cost: {fs_dispatch.solution[\"costs\"].item():,.0f} €')" + ] + }, + { + "cell_type": "markdown", + "id": "26", + "metadata": {}, + "source": [ + "## Compare Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27", + "metadata": {}, + "outputs": [], + "source": [ + "total_segmented = time_clustering + time_segmented\n", + "total_two_stage = total_segmented + time_dispatch\n", + "\n", + "results = {\n", + " 'Full (baseline)': {\n", + " 'Time [s]': time_full,\n", + " 'Cost [€]': fs_full.solution['costs'].item(),\n", + " 'CHP': fs_full.statistics.sizes['CHP(Q_th)'].item(),\n", + " 'Boiler': fs_full.statistics.sizes['Boiler(Q_th)'].item(),\n", + " 'Storage': fs_full.statistics.sizes['Storage'].item(),\n", + " },\n", + " 'Segmented (8×6)': {\n", + " 'Time [s]': total_segmented,\n", + " 'Cost [€]': fs_segmented.solution['costs'].item(),\n", + " 'CHP': fs_segmented.statistics.sizes['CHP(Q_th)'].item(),\n", + " 'Boiler': fs_segmented.statistics.sizes['Boiler(Q_th)'].item(),\n", + " 'Storage': fs_segmented.statistics.sizes['Storage'].item(),\n", + " },\n", + " 'Two-Stage': {\n", + " 'Time [s]': total_two_stage,\n", + " 'Cost [€]': fs_dispatch.solution['costs'].item(),\n", + " 'CHP': sizes_with_margin['CHP(Q_th)'],\n", + " 'Boiler': sizes_with_margin['Boiler(Q_th)'],\n", + " 'Storage': sizes_with_margin['Storage'],\n", + " },\n", + "}\n", + "\n", + "comparison = 
pd.DataFrame(results).T\n", + "baseline_cost = comparison.loc['Full (baseline)', 'Cost [€]']\n", + "baseline_time = comparison.loc['Full (baseline)', 'Time [s]']\n", + "comparison['Cost Gap [%]'] = ((comparison['Cost [€]'] - baseline_cost) / abs(baseline_cost) * 100).round(2)\n", + "comparison['Speedup'] = (baseline_time / comparison['Time [s]']).round(1)\n", + "\n", + "comparison.style.format(\n", + " {\n", + " 'Time [s]': '{:.2f}',\n", + " 'Cost [€]': '{:,.0f}',\n", + " 'CHP': '{:.1f}',\n", + " 'Boiler': '{:.1f}',\n", + " 'Storage': '{:.0f}',\n", + " 'Cost Gap [%]': '{:.2f}',\n", + " 'Speedup': '{:.1f}x',\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "28", + "metadata": {}, + "source": [ + "## Segmentation with Multi-Period Systems\n", + "\n", + "Segmentation works with multi-period systems (multiple years, scenarios).\n", + "Each period/scenario combination is segmented independently:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29", + "metadata": {}, + "outputs": [], + "source": [ + "from data.generate_example_systems import create_multiperiod_system\n", + "\n", + "fs_multi = create_multiperiod_system()\n", + "# Use first week only for faster demo\n", + "fs_multi = fs_multi.transform.isel(time=slice(0, 168))\n", + "\n", + "print(f'Periods: {list(fs_multi.periods.values)}')\n", + "print(f'Scenarios: {list(fs_multi.scenarios.values)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30", + "metadata": {}, + "outputs": [], + "source": [ + "# Cluster with segmentation\n", + "fs_multi_seg = fs_multi.transform.cluster(\n", + " n_clusters=3,\n", + " cluster_duration='1D',\n", + " segments=SegmentConfig(n_segments=6),\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=['Building(Heat)|fixed_relative_profile']),\n", + ")\n", + "\n", + "print(f'Original: {len(fs_multi.timesteps)} timesteps')\n", + "print(f'Segmented: {len(fs_multi_seg.timesteps)} × {len(fs_multi_seg.clusters)} clusters')\n", + "print(f'is_segmented: {fs_multi_seg.clustering.is_segmented}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31", + "metadata": {}, + "outputs": [], + "source": [ + "# Cluster assignments have period/scenario dimensions\n", + "fs_multi_seg.clustering.cluster_assignments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32", + "metadata": {}, + "outputs": [], + "source": [ + "# Optimize and expand\n", + "fs_multi_seg.optimize(solver)\n", + "fs_multi_expanded = fs_multi_seg.transform.expand()\n", + "\n", + "print(f'Expanded timesteps: {len(fs_multi_expanded.timesteps)}')\n", + "print(f'Objective: {fs_multi_expanded.solution[\"objective\"].item():,.0f} €')" + ] + }, + { + "cell_type": "markdown", + "id": "33", + "metadata": {}, + "source": [ + "## API Reference\n", + "\n", + "### SegmentConfig Parameters\n", + "\n", + "```python\n", + "from tsam.config import SegmentConfig\n", + "\n", + "segments = SegmentConfig(\n", + " n_segments=6, # Number of segments per cluster period\n", + " representation_method='mean', # How to represent segment values ('mean', 'medoid', etc.)\n", + ")\n", + "```\n", + "\n", + "### Segmentation Properties\n", + "\n", + "After segmentation, `fs.clustering` has additional properties:\n", + "\n", + "| Property | Description |\n", + "|----------|-------------|\n", + "| `is_segmented` | `True` if segmentation was used |\n", + "| `n_segments` | Number of segments per cluster |\n", + "| `timesteps_per_cluster` | Original timesteps per cluster (before 
segmentation) |\n", + "\n", + "### Timestep Duration\n", + "\n", + "For segmented systems, `fs.timestep_duration` is a DataArray with `(cluster, time)` dimensions:\n", + "\n", + "```python\n", + "# Each segment has different duration\n", + "fs_segmented.timestep_duration # Shape: (n_clusters, n_segments)\n", + "\n", + "# Sum should equal original period duration\n", + "fs_segmented.timestep_duration.sum('time') # Should be 24h for daily clusters\n", + "```\n", + "\n", + "### Example Workflow\n", + "\n", + "```python\n", + "from tsam.config import ExtremeConfig, SegmentConfig\n", + "\n", + "# Cluster with segmentation\n", + "fs_segmented = flow_system.transform.cluster(\n", + " n_clusters=8,\n", + " cluster_duration='1D',\n", + " segments=SegmentConfig(n_segments=6),\n", + " extremes=ExtremeConfig(method='new_cluster', max_value=['Demand|profile']),\n", + ")\n", + "\n", + "# Optimize\n", + "fs_segmented.optimize(solver)\n", + "\n", + "# Expand back to original timesteps\n", + "fs_expanded = fs_segmented.transform.expand()\n", + "\n", + "# Two-stage workflow\n", + "sizes = {k: v.item() * 1.05 for k, v in fs_segmented.statistics.sizes.items()}\n", + "fs_dispatch = flow_system.transform.fix_sizes(sizes)\n", + "fs_dispatch.optimize(solver)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "34", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "You learned how to:\n", + "\n", + "- Use **`SegmentConfig`** to enable intra-period segmentation\n", + "- Work with **variable timestep durations** for each segment\n", + "- **Combine clustering and segmentation** for maximum problem size reduction\n", + "- **Expand segmented solutions** back to original timesteps\n", + "- Use segmentation with **multi-period systems**\n", + "\n", + "### Key Takeaways\n", + "\n", + "1. **Segmentation reduces problem size further**: From 8×24=192 to 8×6=48 timesteps\n", + "2. **Variable durations preserve accuracy**: Important periods get more timesteps\n", + "3. **Works with multi-period**: Each period/scenario is segmented independently\n", + "4. **expand() works correctly**: Maps segment values to all original timesteps\n", + "5. **Two-stage is still recommended**: Use segmentation for sizing, full resolution for dispatch\n", + "\n", + "### Trade-offs\n", + "\n", + "| More Segments | Fewer Segments |\n", + "|---------------|----------------|\n", + "| Higher accuracy | Lower accuracy |\n", + "| Slower solve | Faster solve |\n", + "| More memory | Less memory |\n", + "\n", + "Start with 6-12 segments and adjust based on your accuracy needs." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/notebooks/09-plotting-and-data-access.ipynb b/docs/notebooks/09-plotting-and-data-access.ipynb index 39fa788da..7f92a9e96 100644 --- a/docs/notebooks/09-plotting-and-data-access.ipynb +++ b/docs/notebooks/09-plotting-and-data-access.ipynb @@ -831,8 +831,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.11" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/notebooks/10-transmission.ipynb b/docs/notebooks/10-transmission.ipynb index 85d2c53d8..224183319 100644 --- a/docs/notebooks/10-transmission.ipynb +++ b/docs/notebooks/10-transmission.ipynb @@ -633,8 +633,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.10.0" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" } }, "nbformat": 4, diff --git a/docs/user-guide/optimization/clustering.md b/docs/user-guide/optimization/clustering.md index f975595d6..c314cf5f4 100644 --- a/docs/user-guide/optimization/clustering.md +++ b/docs/user-guide/optimization/clustering.md @@ -23,6 +23,7 @@ The recommended approach: cluster for fast sizing, then validate at full resolut ```python import flixopt as fx +from tsam.config import ExtremeConfig # Load or create your FlowSystem flow_system = fx.FlowSystem(timesteps) @@ -32,7 +33,7 @@ flow_system.add_elements(...) 
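+
+# Cluster to typical days; the ExtremeConfig keeps the peak-demand day from being averaged away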
fs_clustered = flow_system.transform.cluster( n_clusters=12, cluster_duration='1D', - time_series_for_high_peaks=['HeatDemand(Q)|fixed_relative_profile'], + extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q)|fixed_relative_profile']), ) fs_clustered.optimize(fx.solvers.HighsSolver()) @@ -50,62 +51,86 @@ flow_rates = fs_expanded.solution['Boiler(Q_th)|flow_rate'] |-----------|-------------|---------| | `n_clusters` | Number of typical periods | `12` (typical days for a year) | | `cluster_duration` | Duration of each cluster | `'1D'`, `'24h'`, or `24` (hours) | -| `time_series_for_high_peaks` | Time series where peak clusters must be captured | `['HeatDemand(Q)\|fixed_relative_profile']` | -| `time_series_for_low_peaks` | Time series where minimum clusters must be captured | `['SolarGen(P)\|fixed_relative_profile']` | -| `cluster_method` | Clustering algorithm | `'k_means'`, `'hierarchical'`, `'k_medoids'` | -| `representation_method` | How clusters are represented | `'meanRepresentation'`, `'medoidRepresentation'` | -| `random_state` | Random seed for reproducibility | `42` | -| `rescale_cluster_periods` | Rescale clusters to match original means | `True` (default) | +| `weights` | Clustering weights per time series | `{'demand': 2.0, 'solar': 1.0}` | +| `cluster` | tsam `ClusterConfig` for clustering options | `ClusterConfig(method='k_medoids')` | +| `extremes` | tsam `ExtremeConfig` for peak preservation | `ExtremeConfig(method='new_cluster', max_value=[...])` | +| `predef_cluster_assignments` | Manual cluster assignments | Array of cluster indices | -### Peak Selection +### Peak Selection with ExtremeConfig -Use `time_series_for_high_peaks` to ensure extreme conditions are represented: +Use `ExtremeConfig` to ensure extreme conditions are represented: ```python +from tsam.config import ExtremeConfig + # Ensure the peak demand day is included fs_clustered = flow_system.transform.cluster( n_clusters=8, cluster_duration='1D', - time_series_for_high_peaks=['HeatDemand(Q)|fixed_relative_profile'], + extremes=ExtremeConfig( + method='new_cluster', # Create new cluster for extremes + max_value=['HeatDemand(Q)|fixed_relative_profile'], # Capture peak demand + ), ) ``` Without peak selection, the clustering algorithm might average out extreme days, leading to undersized equipment. 
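+
+To also pin a minimum (e.g., the lowest-solar day, which often drives storage sizing), `min_value` can be combined with `max_value` in the same config. A minimal sketch, assuming your system also has a profile named `SolarGen(P)|fixed_relative_profile` (the fields are documented in the table below):
+
+```python
+from tsam.config import ExtremeConfig
+
+extremes = ExtremeConfig(
+    method='new_cluster',
+    max_value=['HeatDemand(Q)|fixed_relative_profile'],  # keep the peak-demand day
+    min_value=['SolarGen(P)|fixed_relative_profile'],    # keep the lowest-solar day
+)
+```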
-### Advanced Clustering Options +**ExtremeConfig options:** + +| Field | Description | +|-------|-------------| +| `method` | How extremes are handled: `'new_cluster'`, `'append'`, `'replace_cluster_center'` | +| `max_value` | Time series where maximum values should be preserved | +| `min_value` | Time series where minimum values should be preserved | +| `max_period` | Time series where period with maximum sum should be preserved | +| `min_period` | Time series where period with minimum sum should be preserved | -Fine-tune the clustering algorithm with advanced parameters: +### Advanced Clustering Options with ClusterConfig + +Fine-tune the clustering algorithm with `ClusterConfig`: ```python +from tsam.config import ClusterConfig, ExtremeConfig + fs_clustered = flow_system.transform.cluster( n_clusters=8, cluster_duration='1D', - cluster_method='hierarchical', # Alternative to k_means - representation_method='medoidRepresentation', # Use actual periods, not averages - rescale_cluster_periods=True, # Match original time series means - random_state=42, # Reproducible results + cluster=ClusterConfig( + method='hierarchical', # Clustering algorithm + representation='medoid', # Use actual periods, not averages + ), + extremes=ExtremeConfig(method='new_cluster', max_value=['demand']), ) ``` -**Available clustering algorithms** (`cluster_method`): +**Available clustering algorithms** (`ClusterConfig.method`): | Method | Description | |--------|-------------| -| `'k_means'` | Fast, good for most cases (default) | -| `'hierarchical'` | Produces consistent hierarchical groupings | +| `'hierarchical'` | Produces consistent hierarchical groupings (default) | +| `'k_means'` | Fast, good for most cases | | `'k_medoids'` | Uses actual periods as representatives | | `'k_maxoids'` | Maximizes representativeness | | `'averaging'` | Simple averaging of similar periods | -For advanced tsam parameters not exposed directly, use `**kwargs`: +**Representation methods** (`ClusterConfig.representation`): + +| Method | Description | +|--------|-------------| +| `'medoid'` | Use actual periods as representatives (default) | +| `'mean'` | Average of all periods in cluster | +| `'distribution'` | Preserve value distribution (duration curves) | + +For additional tsam parameters, pass them as keyword arguments: ```python -# Pass any tsam.TimeSeriesAggregation parameter +# Pass any tsam.aggregate() parameter fs_clustered = flow_system.transform.cluster( n_clusters=8, cluster_duration='1D', - sameMean=True, # Normalize all time series to same mean - sortValues=True, # Cluster by duration curves instead of shape + normalize_column_means=True, # Normalize all time series to same mean + preserve_column_means=True, # Rescale results to match original means ) ``` diff --git a/docs/user-guide/optimization/index.md b/docs/user-guide/optimization/index.md index c17eb63e4..868580656 100644 --- a/docs/user-guide/optimization/index.md +++ b/docs/user-guide/optimization/index.md @@ -56,11 +56,13 @@ flow_system.solve(fx.solvers.HighsSolver()) For large problems, use time series clustering to reduce computational complexity: ```python +from tsam.config import ExtremeConfig + # Cluster to 12 typical days fs_clustered = flow_system.transform.cluster( n_clusters=12, cluster_duration='1D', - time_series_for_high_peaks=['HeatDemand(Q)|fixed_relative_profile'], + extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q)|fixed_relative_profile']), ) # Optimize the clustered system diff --git a/flixopt/clustering/__init__.py 
b/flixopt/clustering/__init__.py index 1e78cfa04..43ace2d44 100644 --- a/flixopt/clustering/__init__.py +++ b/flixopt/clustering/__init__.py @@ -1,42 +1,51 @@ """ Time Series Aggregation Module for flixopt. -This module provides data structures for time series clustering/aggregation. - -Key classes: -- ClusterResult: Universal result container for clustering -- ClusterStructure: Hierarchical structure info for storage inter-cluster linking -- Clustering: Stored on FlowSystem after clustering +This module provides wrapper classes around tsam's clustering functionality: +- Clustering: Top-level class stored on FlowSystem after clustering +- ClusteringResults: Manages collection of tsam ClusteringResult objects (for IO) Example usage: # Cluster a FlowSystem to reduce timesteps + from tsam.config import ExtremeConfig + fs_clustered = flow_system.transform.cluster( n_clusters=8, cluster_duration='1D', - time_series_for_high_peaks=['Demand|fixed_relative_profile'], + extremes=ExtremeConfig(method='new_cluster', max_value=['Demand|fixed_relative_profile']), ) - # Access clustering metadata - info = fs_clustered.clustering - print(f'Number of clusters: {info.result.cluster_structure.n_clusters}') + # Access clustering structure (available before AND after IO) + clustering = fs_clustered.clustering + print(f'Number of clusters: {clustering.n_clusters}') + print(f'Dims: {clustering.dims}') # e.g., ('period', 'scenario') + print(f'Coords: {clustering.coords}') # e.g., {'period': [2024, 2025]} + + # Access tsam AggregationResult for detailed analysis + # NOTE: Only available BEFORE saving/loading. Lost after IO. + result = clustering.sel(period=2024, scenario='high') + result.cluster_representatives # DataFrame with aggregated time series + result.accuracy # AccuracyMetrics (rmse, mae) + result.plot.compare() # tsam's built-in comparison plot + + # Iterate over all results (only before IO) + for key, result in clustering.items(): + print(f'{key}: {result.n_clusters} clusters') + + # Save and load - structure preserved, AggregationResult access lost + fs_clustered.to_netcdf('system.nc') + # Use include_original_data=False for smaller files (~38% reduction) + fs_clustered.to_netcdf('system.nc', include_original_data=False) # Expand back to full resolution fs_expanded = fs_clustered.transform.expand() """ -from .base import ( - Clustering, - ClusterResult, - ClusterStructure, - create_cluster_structure_from_mapping, -) +from .base import AggregationResults, Clustering, ClusteringResults __all__ = [ - # Core classes - 'ClusterResult', + 'ClusteringResults', + 'AggregationResults', 'Clustering', - 'ClusterStructure', - # Utilities - 'create_cluster_structure_from_mapping', ] diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index 10224a1a5..ee0d2bf43 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -1,24 +1,16 @@ """ -Base classes and data structures for time series aggregation (clustering). +Clustering classes for time series aggregation. -This module provides an abstraction layer for time series aggregation that -supports multiple backends (TSAM, manual/external, etc.). - -Terminology: -- "cluster" = a group of similar time chunks (e.g., similar days grouped together) -- "typical period" = a representative time chunk for a cluster (TSAM terminology) -- "cluster duration" = the length of each time chunk (e.g., 24h for daily clustering) - -Note: This is separate from the model's "period" dimension (years/months) and -"scenario" dimension. 
The aggregation operates on the 'time' dimension. - -All data structures use xarray for consistent handling of coordinates. +This module provides wrapper classes around tsam's clustering functionality: +- `ClusteringResults`: Collection of tsam ClusteringResult objects for multi-dim (period, scenario) data +- `Clustering`: Top-level class stored on FlowSystem after clustering """ from __future__ import annotations -import warnings -from dataclasses import dataclass +import functools +import json +from collections import Counter from typing import TYPE_CHECKING, Any import numpy as np @@ -26,6 +18,11 @@ import xarray as xr if TYPE_CHECKING: + from pathlib import Path + + from tsam import AggregationResult + from tsam import ClusteringResult as TsamClusteringResult + from ..color_processing import ColorType from ..plot_result import PlotResult from ..statistics_accessor import SelectType @@ -44,7 +41,7 @@ def _apply_slot_defaults(plotly_kwargs: dict, defaults: dict[str, str | None]) - plotly_kwargs.setdefault(slot, value) -def _select_dims(da: xr.DataArray, period: str | None = None, scenario: str | None = None) -> xr.DataArray: +def _select_dims(da: xr.DataArray, period: Any = None, scenario: Any = None) -> xr.DataArray: """Select from DataArray by period/scenario if those dimensions exist.""" if 'period' in da.dims and period is not None: da = da.sel(period=period) @@ -53,486 +50,1307 @@ def _select_dims(da: xr.DataArray, period: str | None = None, scenario: str | No return da -@dataclass -class ClusterStructure: - """Structure information for inter-cluster storage linking. +def combine_slices( + slices: dict[tuple, np.ndarray], + extra_dims: list[str], + dim_coords: dict[str, list], + output_dim: str, + output_coord: Any, + attrs: dict | None = None, +) -> xr.DataArray: + """Combine {(dim_values): 1D_array} dict into a DataArray. - This class captures the hierarchical structure of time series clustering, - which is needed for proper storage state-of-charge tracking across - typical periods when using cluster(). + This utility simplifies the common pattern of iterating over extra dimensions + (like period, scenario), processing each slice, and combining results. - Note: The "original_cluster" dimension indexes the original cluster-sized - time segments (e.g., 0..364 for 365 days), NOT the model's "period" dimension - (years). Each original segment gets assigned to a representative cluster. + Args: + slices: Dict mapping dimension value tuples to 1D numpy arrays. + Keys are tuples like ('period1', 'scenario1') matching extra_dims order. + extra_dims: Dimension names in order (e.g., ['period', 'scenario']). + dim_coords: Dict mapping dimension names to coordinate values. + output_dim: Name of the output dimension (typically 'time'). + output_coord: Coordinate values for output dimension. + attrs: Optional DataArray attributes. - Attributes: - cluster_order: Maps original cluster index → representative cluster ID. - dims: [original_cluster] for simple case, or - [original_cluster, period, scenario] for multi-period/scenario systems. - Values are cluster IDs (0 to n_clusters-1). - cluster_occurrences: Count of how many original time chunks each cluster represents. - dims: [cluster] for simple case, or [cluster, period, scenario] for multi-dim. - n_clusters: Number of distinct clusters (typical periods). - timesteps_per_cluster: Number of timesteps in each cluster (e.g., 24 for daily). + Returns: + DataArray with dims [output_dim, *extra_dims]. + + Raises: + ValueError: If slices is empty. 
+ KeyError: If a required key is missing from slices. Example: - For 365 days clustered into 8 typical days: - - cluster_order: shape (365,), values 0-7 indicating which cluster each day belongs to - - cluster_occurrences: shape (8,), e.g., [45, 46, 46, 46, 46, 45, 45, 46] - - n_clusters: 8 - - timesteps_per_cluster: 24 (for hourly data) - - For multi-scenario (e.g., 2 scenarios): - - cluster_order: shape (365, 2) with dims [original_cluster, scenario] - - cluster_occurrences: shape (8, 2) with dims [cluster, scenario] + >>> slices = { + ... ('P1', 'base'): np.array([1, 2, 3]), + ... ('P1', 'high'): np.array([4, 5, 6]), + ... ('P2', 'base'): np.array([7, 8, 9]), + ... ('P2', 'high'): np.array([10, 11, 12]), + ... } + >>> result = combine_slices( + ... slices, + ... extra_dims=['period', 'scenario'], + ... dim_coords={'period': ['P1', 'P2'], 'scenario': ['base', 'high']}, + ... output_dim='time', + ... output_coord=[0, 1, 2], + ... ) + >>> result.dims + ('time', 'period', 'scenario') """ + if not slices: + raise ValueError('slices cannot be empty') + + first = next(iter(slices.values())) + n_output = len(first) + shape = [n_output] + [len(dim_coords[d]) for d in extra_dims] + data = np.empty(shape, dtype=first.dtype) + + for combo in np.ndindex(*shape[1:]): + key = tuple(dim_coords[d][i] for d, i in zip(extra_dims, combo, strict=True)) + try: + data[(slice(None),) + combo] = slices[key] + except KeyError: + raise KeyError(f'Missing slice for key {key} (extra_dims={extra_dims})') from None + + return xr.DataArray( + data, + dims=[output_dim] + extra_dims, + coords={output_dim: output_coord, **dim_coords}, + attrs=attrs or {}, + ) - cluster_order: xr.DataArray - cluster_occurrences: xr.DataArray - n_clusters: int | xr.DataArray - timesteps_per_cluster: int - - def __post_init__(self): - """Validate and ensure proper DataArray formatting.""" - # Ensure cluster_order is a DataArray with proper dims - if not isinstance(self.cluster_order, xr.DataArray): - self.cluster_order = xr.DataArray(self.cluster_order, dims=['original_cluster'], name='cluster_order') - elif self.cluster_order.name is None: - self.cluster_order = self.cluster_order.rename('cluster_order') - - # Ensure cluster_occurrences is a DataArray with proper dims - if not isinstance(self.cluster_occurrences, xr.DataArray): - self.cluster_occurrences = xr.DataArray( - self.cluster_occurrences, dims=['cluster'], name='cluster_occurrences' - ) - elif self.cluster_occurrences.name is None: - self.cluster_occurrences = self.cluster_occurrences.rename('cluster_occurrences') - def __repr__(self) -> str: - n_clusters = ( - int(self.n_clusters) if isinstance(self.n_clusters, (int, np.integer)) else int(self.n_clusters.values) - ) - # Handle multi-dimensional cluster_occurrences (with period/scenario dims) - occ_data = self.cluster_occurrences - extra_dims = [d for d in occ_data.dims if d != 'cluster'] - if extra_dims: - # Multi-dimensional: show shape info instead of individual values - occ_info = f'shape={dict(occ_data.sizes)}' - else: - # Simple case: list of occurrences per cluster - occ_info = [int(occ_data.sel(cluster=c).values) for c in range(n_clusters)] - return ( - f'ClusterStructure(\n' - f' {self.n_original_clusters} original periods → {n_clusters} clusters\n' - f' timesteps_per_cluster={self.timesteps_per_cluster}\n' - f' occurrences={occ_info}\n' - f')' - ) +def _cluster_occurrences(cr: TsamClusteringResult) -> np.ndarray: + """Compute cluster occurrences from ClusteringResult.""" + counts = Counter(cr.cluster_assignments) + 
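+    # clusters that never appear in the assignments get a count of 0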
return np.array([counts.get(i, 0) for i in range(cr.n_clusters)]) - def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]: - """Create reference structure for serialization.""" - ref = {'__class__': self.__class__.__name__} - arrays = {} - # Store DataArrays with references - arrays[str(self.cluster_order.name)] = self.cluster_order - ref['cluster_order'] = f':::{self.cluster_order.name}' +def _build_timestep_mapping(cr: TsamClusteringResult, n_timesteps: int) -> np.ndarray: + """Build mapping from original timesteps to representative timestep indices. - arrays[str(self.cluster_occurrences.name)] = self.cluster_occurrences - ref['cluster_occurrences'] = f':::{self.cluster_occurrences.name}' + For segmented systems, the mapping uses segment_assignments from tsam to map + each original timestep position to its corresponding segment index. + """ + timesteps_per_cluster = cr.n_timesteps_per_period + # For segmented systems, representative time dimension has n_segments entries + # For non-segmented, it has timesteps_per_cluster entries + n_segments = cr.n_segments + is_segmented = n_segments is not None + time_dim_size = n_segments if is_segmented else timesteps_per_cluster + + # For segmented systems, tsam provides segment_assignments which maps + # each position within a period to its segment index + segment_assignments = cr.segment_assignments if is_segmented else None + + mapping = np.zeros(n_timesteps, dtype=np.int32) + for period_idx, cluster_id in enumerate(cr.cluster_assignments): + for pos in range(timesteps_per_cluster): + orig_idx = period_idx * timesteps_per_cluster + pos + if orig_idx < n_timesteps: + if is_segmented and segment_assignments is not None: + # For segmented: use tsam's segment_assignments to get segment index + # segment_assignments[cluster_id][pos] gives the segment index + segment_idx = segment_assignments[cluster_id][pos] + mapping[orig_idx] = int(cluster_id) * time_dim_size + segment_idx + else: + # Non-segmented: direct position mapping + mapping[orig_idx] = int(cluster_id) * time_dim_size + pos + return mapping + + +class ClusteringResults: + """Collection of tsam ClusteringResult objects for multi-dimensional data. + + Manages multiple ClusteringResult objects keyed by (period, scenario) tuples + and provides convenient access and multi-dimensional DataArray building. + + Follows xarray-like patterns with `.dims`, `.coords`, `.sel()`, and `.isel()`. - # Store scalar values - if isinstance(self.n_clusters, xr.DataArray): - n_clusters_name = self.n_clusters.name or 'n_clusters' - n_clusters_da = self.n_clusters.rename(n_clusters_name) - arrays[n_clusters_name] = n_clusters_da - ref['n_clusters'] = f':::{n_clusters_name}' - else: - ref['n_clusters'] = int(self.n_clusters) + Attributes: + dims: Tuple of dimension names, e.g., ('period', 'scenario'). + coords: Dict mapping dimension names to their coordinate values. - ref['timesteps_per_cluster'] = self.timesteps_per_cluster + Example: + >>> results = ClusteringResults({(): cr}, dim_names=[]) + >>> results.n_clusters + 2 + >>> results.cluster_assignments # Returns DataArray + + + >>> # Multi-dimensional case + >>> results = ClusteringResults( + ... {(2024, 'high'): cr1, (2024, 'low'): cr2}, + ... dim_names=['period', 'scenario'], + ... 
) + >>> results.dims + ('period', 'scenario') + >>> results.coords + {'period': [2024], 'scenario': ['high', 'low']} + >>> results.sel(period=2024, scenario='high') # Label-based + + >>> results.isel(period=0, scenario=1) # Index-based + + """ + + def __init__( + self, + results: dict[tuple, TsamClusteringResult], + dim_names: list[str], + ): + """Initialize ClusteringResults. + + Args: + results: Dict mapping (period, scenario) tuples to tsam ClusteringResult objects. + For simple cases without periods/scenarios, use {(): result}. + dim_names: Names of extra dimensions, e.g., ['period', 'scenario']. + """ + if not results: + raise ValueError('results cannot be empty') + self._results = results + self._dim_names = dim_names - return ref, arrays + # ========================================================================== + # xarray-like interface + # ========================================================================== @property - def n_original_clusters(self) -> int: - """Number of original periods (before clustering).""" - return len(self.cluster_order.coords['original_cluster']) + def dims(self) -> tuple[str, ...]: + """Dimension names as tuple (xarray-like).""" + return tuple(self._dim_names) + + @property + def dim_names(self) -> list[str]: + """Dimension names as list (backwards compatibility).""" + return list(self._dim_names) @property - def has_multi_dims(self) -> bool: - """Check if cluster_order has period/scenario dimensions.""" - return 'period' in self.cluster_order.dims or 'scenario' in self.cluster_order.dims + def coords(self) -> dict[str, list]: + """Coordinate values for each dimension (xarray-like). + + Returns: + Dict mapping dimension names to lists of coordinate values. + """ + return {dim: self._get_dim_values(dim) for dim in self._dim_names} - def get_cluster_order_for_slice(self, period: str | None = None, scenario: str | None = None) -> np.ndarray: - """Get cluster_order for a specific (period, scenario) combination. + def sel(self, **kwargs: Any) -> TsamClusteringResult: + """Select result by dimension labels (xarray-like). Args: - period: Period label (None if no period dimension). - scenario: Scenario label (None if no scenario dimension). + **kwargs: Dimension name=value pairs, e.g., period=2024, scenario='high'. Returns: - 1D numpy array of cluster indices for the specified slice. + The tsam ClusteringResult for the specified combination. + + Raises: + KeyError: If no result found for the specified combination. + + Example: + >>> results.sel(period=2024, scenario='high') + """ - return _select_dims(self.cluster_order, period, scenario).values.astype(int) + key = self._make_key(**kwargs) + if key not in self._results: + raise KeyError(f'No result found for {kwargs}') + return self._results[key] - def get_cluster_occurrences_for_slice( - self, period: str | None = None, scenario: str | None = None - ) -> dict[int, int]: - """Get cluster occurrence counts for a specific (period, scenario) combination. + def isel(self, **kwargs: int) -> TsamClusteringResult: + """Select result by dimension indices (xarray-like). Args: - period: Period label (None if no period dimension). - scenario: Scenario label (None if no scenario dimension). + **kwargs: Dimension name=index pairs, e.g., period=0, scenario=1. Returns: - Dict mapping cluster ID to occurrence count. + The tsam ClusteringResult for the specified combination. Raises: - ValueError: If period/scenario dimensions exist but no selector was provided. + IndexError: If index is out of range for a dimension. 
+ + Example: + >>> results.isel(period=0, scenario=1) + """ - occ = _select_dims(self.cluster_occurrences, period, scenario) - extra_dims = [d for d in occ.dims if d != 'cluster'] - if extra_dims: - raise ValueError( - f'cluster_occurrences has dimensions {extra_dims} that were not selected. ' - f"Provide 'period' and/or 'scenario' arguments to select a specific slice." - ) - return {int(c): int(occ.sel(cluster=c).values) for c in occ.coords['cluster'].values} + label_kwargs = {} + for dim, idx in kwargs.items(): + coord_values = self._get_dim_values(dim) + if coord_values is None: + raise KeyError(f"Dimension '{dim}' not found in dims {self.dims}") + if idx < 0 or idx >= len(coord_values): + raise IndexError(f"Index {idx} out of range for dimension '{dim}' with {len(coord_values)} values") + label_kwargs[dim] = coord_values[idx] + return self.sel(**label_kwargs) - def plot(self, colors: str | list[str] | None = None, show: bool | None = None) -> PlotResult: - """Plot cluster assignment visualization. + def __getitem__(self, key: tuple) -> TsamClusteringResult: + """Get result by key tuple.""" + return self._results[key] - Shows which cluster each original period belongs to, and the - number of occurrences per cluster. For multi-period/scenario structures, - creates a faceted grid plot. + # === Iteration === - Args: - colors: Colorscale name (str) or list of colors. - Defaults to plotly template's sequential colorscale. - show: Whether to display the figure. Defaults to CONFIG.Plotting.default_show. + def __iter__(self): + """Iterate over ClusteringResult objects.""" + return iter(self._results.values()) + + def __len__(self) -> int: + """Number of ClusteringResult objects.""" + return len(self._results) + + def items(self): + """Iterate over (key, ClusteringResult) pairs.""" + return self._results.items() + + def keys(self): + """Iterate over keys.""" + return self._results.keys() + + def values(self): + """Iterate over ClusteringResult objects.""" + return self._results.values() + + # === Properties from first result === + + @property + def _first_result(self) -> TsamClusteringResult: + """Get the first ClusteringResult (for structure info).""" + return next(iter(self._results.values())) + + @property + def n_clusters(self) -> int: + """Number of clusters (same for all results).""" + return self._first_result.n_clusters + + @property + def timesteps_per_cluster(self) -> int: + """Number of timesteps per cluster (same for all results).""" + return self._first_result.n_timesteps_per_period + + @property + def n_original_periods(self) -> int: + """Number of original periods (same for all results).""" + return self._first_result.n_original_periods + + @property + def n_segments(self) -> int | None: + """Number of segments per cluster, or None if not segmented.""" + return self._first_result.n_segments + + # === Multi-dim DataArrays === + + @property + def cluster_assignments(self) -> xr.DataArray: + """Maps each original cluster to its typical cluster index. Returns: - PlotResult containing the figure and underlying data. + DataArray with dims [original_cluster, period?, scenario?]. 
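+
+        Note:
+            The ``original_cluster`` dimension carries no coordinate labels;
+            labeled coords would interfere with using this array as an
+            ``isel()`` indexer.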
""" - from ..config import CONFIG - from ..plot_result import PlotResult + # Note: No coords on original_cluster - they cause issues when used as isel() indexer + return self._build_property_array( + lambda cr: np.array(cr.cluster_assignments), + base_dims=['original_cluster'], + name='cluster_assignments', + ) + + @property + def cluster_occurrences(self) -> xr.DataArray: + """How many original clusters map to each typical cluster. - n_clusters = ( - int(self.n_clusters) if isinstance(self.n_clusters, (int, np.integer)) else int(self.n_clusters.values) + Returns: + DataArray with dims [cluster, period?, scenario?]. + """ + return self._build_property_array( + _cluster_occurrences, + base_dims=['cluster'], + base_coords={'cluster': range(self.n_clusters)}, + name='cluster_occurrences', ) - # Build DataArray with 1-based original_cluster coords - cluster_da = self.cluster_order.assign_coords( - original_cluster=np.arange(1, self.cluster_order.sizes['original_cluster'] + 1) + @property + def cluster_centers(self) -> xr.DataArray: + """Which original cluster is the representative (center) for each typical cluster. + + Returns: + DataArray with dims [cluster, period?, scenario?]. + """ + return self._build_property_array( + lambda cr: np.array(cr.cluster_centers), + base_dims=['cluster'], + base_coords={'cluster': range(self.n_clusters)}, + name='cluster_centers', ) - has_period = 'period' in cluster_da.dims - has_scenario = 'scenario' in cluster_da.dims + @property + def segment_assignments(self) -> xr.DataArray | None: + """For each timestep within a cluster, which segment it belongs to. - # Transpose for heatmap: first dim = y-axis, second dim = x-axis - if has_period: - cluster_da = cluster_da.transpose('period', 'original_cluster', ...) - elif has_scenario: - cluster_da = cluster_da.transpose('scenario', 'original_cluster', ...) + Returns: + DataArray with dims [cluster, time, period?, scenario?], or None if not segmented. + """ + if self._first_result.segment_assignments is None: + return None + timesteps = self._first_result.n_timesteps_per_period + return self._build_property_array( + lambda cr: np.array(cr.segment_assignments), + base_dims=['cluster', 'time'], + base_coords={'cluster': range(self.n_clusters), 'time': range(timesteps)}, + name='segment_assignments', + ) - # Data to return (without dummy dims) - ds = xr.Dataset({'cluster_order': cluster_da}) + @property + def segment_durations(self) -> xr.DataArray | None: + """Duration of each segment in timesteps. - # For plotting: add dummy y-dim if needed (heatmap requires 2D) - if not has_period and not has_scenario: - plot_da = cluster_da.expand_dims(y=['']).transpose('y', 'original_cluster') - plot_ds = xr.Dataset({'cluster_order': plot_da}) - else: - plot_ds = ds + Returns: + DataArray with dims [cluster, segment, period?, scenario?], or None if not segmented. 
+ """ + if self._first_result.segment_durations is None: + return None + n_segments = self._first_result.n_segments + + def _get_padded_durations(cr: TsamClusteringResult) -> np.ndarray: + """Pad ragged segment durations to uniform shape.""" + return np.array([list(d) + [np.nan] * (n_segments - len(d)) for d in cr.segment_durations]) + + return self._build_property_array( + _get_padded_durations, + base_dims=['cluster', 'segment'], + base_coords={'cluster': range(self.n_clusters), 'segment': range(n_segments)}, + name='segment_durations', + ) - imshow_kwargs = {'title': f'Cluster Assignment ({self.n_original_clusters} → {n_clusters} clusters)'} - if colors is not None: - imshow_kwargs['color_continuous_scale'] = colors - fig = plot_ds.plotly.imshow(**imshow_kwargs) + @property + def segment_centers(self) -> xr.DataArray | None: + """Center of each intra-period segment. - fig.update_coloraxes(colorbar_title='Cluster') - if not has_period and not has_scenario: - fig.update_yaxes(showticklabels=False) + Only available if segmentation was configured during clustering. - plot_result = PlotResult(data=ds, figure=fig) + Returns: + DataArray or None if no segmentation. + """ + first = self._first_result + if first.segment_centers is None: + return None - if show is None: - show = CONFIG.Plotting.default_show - if show: - plot_result.show() + # tsam's segment_centers may be None even with segments configured + return None - return plot_result + @property + def position_within_segment(self) -> xr.DataArray | None: + """Position of each timestep within its segment (0-indexed). + + For each (cluster, time) position, returns how many timesteps into the + segment that position is. Used for interpolation within segments. + + Returns: + DataArray with dims [cluster, time] or [cluster, time, period?, scenario?]. + Returns None if no segmentation. + """ + segment_assignments = self.segment_assignments + if segment_assignments is None: + return None + + def _compute_positions(seg_assigns: np.ndarray) -> np.ndarray: + """Compute position within segment for each (cluster, time).""" + n_clusters, n_times = seg_assigns.shape + positions = np.zeros_like(seg_assigns) + for c in range(n_clusters): + pos = 0 + prev_seg = -1 + for t in range(n_times): + seg = seg_assigns[c, t] + if seg != prev_seg: + pos = 0 + prev_seg = seg + positions[c, t] = pos + pos += 1 + return positions + + # Handle extra dimensions by applying _compute_positions to each slice + extra_dims = [d for d in segment_assignments.dims if d not in ('cluster', 'time')] + + if not extra_dims: + positions = _compute_positions(segment_assignments.values) + return xr.DataArray( + positions, + dims=['cluster', 'time'], + coords=segment_assignments.coords, + name='position_within_segment', + ) + + # Multi-dimensional case: compute for each period/scenario slice + result = xr.apply_ufunc( + _compute_positions, + segment_assignments, + input_core_dims=[['cluster', 'time']], + output_core_dims=[['cluster', 'time']], + vectorize=True, + ) + return result.rename('position_within_segment') + + # === Serialization === + + def to_dict(self) -> dict: + """Serialize to dict. + + The dict can be used to reconstruct via from_dict(). + """ + return { + 'dim_names': list(self._dim_names), + 'results': {self._key_to_str(key): result.to_dict() for key, result in self._results.items()}, + } + + @classmethod + def from_dict(cls, d: dict) -> ClusteringResults: + """Reconstruct from dict. + Args: + d: Dict from to_dict(). + + Returns: + Reconstructed ClusteringResults. 
+ """ + from tsam import ClusteringResult + + dim_names = d['dim_names'] + results = {} + for key_str, result_dict in d['results'].items(): + key = cls._str_to_key(key_str, dim_names) + results[key] = ClusteringResult.from_dict(result_dict) + return cls(results, dim_names) + + # === Private helpers === + + def _make_key(self, **kwargs: Any) -> tuple: + """Create a key tuple from dimension keyword arguments.""" + key_parts = [] + for dim in self._dim_names: + if dim in kwargs: + key_parts.append(kwargs[dim]) + return tuple(key_parts) + + def _get_dim_values(self, dim: str) -> list | None: + """Get unique values for a dimension, or None if dimension not present.""" + if dim not in self._dim_names: + return None + idx = self._dim_names.index(dim) + return sorted(set(k[idx] for k in self._results.keys())) + + def _build_property_array( + self, + get_data: callable, + base_dims: list[str], + base_coords: dict | None = None, + name: str | None = None, + ) -> xr.DataArray: + """Build a DataArray property, handling both single and multi-dimensional cases.""" + base_coords = base_coords or {} + periods = self._get_dim_values('period') + scenarios = self._get_dim_values('scenario') + + # Build list of (dim_name, values) for dimensions that exist + extra_dims = [] + if periods is not None: + extra_dims.append(('period', periods)) + if scenarios is not None: + extra_dims.append(('scenario', scenarios)) + + # Simple case: no extra dimensions + if not extra_dims: + return xr.DataArray(get_data(self._results[()]), dims=base_dims, coords=base_coords, name=name) + + # Multi-dimensional: stack data for each combination + first_data = get_data(next(iter(self._results.values()))) + shape = list(first_data.shape) + [len(vals) for _, vals in extra_dims] + data = np.empty(shape, dtype=first_data.dtype) # Preserve dtype + + for combo in np.ndindex(*[len(vals) for _, vals in extra_dims]): + key = tuple(extra_dims[i][1][idx] for i, idx in enumerate(combo)) + data[(...,) + combo] = get_data(self._results[key]) + + dims = base_dims + [dim_name for dim_name, _ in extra_dims] + coords = {**base_coords, **{dim_name: vals for dim_name, vals in extra_dims}} + return xr.DataArray(data, dims=dims, coords=coords, name=name) + + @staticmethod + def _key_to_str(key: tuple) -> str: + """Convert key tuple to string for serialization.""" + if not key: + return '__single__' + return '|'.join(str(k) for k in key) + + @staticmethod + def _str_to_key(key_str: str, dim_names: list[str]) -> tuple: + """Convert string back to key tuple.""" + if key_str == '__single__': + return () + parts = key_str.split('|') + # Try to convert to int if possible (for period years) + result = [] + for part in parts: + try: + result.append(int(part)) + except ValueError: + result.append(part) + return tuple(result) + + def __repr__(self) -> str: + if not self.dims: + return f'ClusteringResults(n_clusters={self.n_clusters})' + coords_str = ', '.join(f'{k}: {len(v)}' for k, v in self.coords.items()) + return f'ClusteringResults(dims={self.dims}, coords=({coords_str}), n_clusters={self.n_clusters})' -@dataclass -class ClusterResult: - """Universal result from any time series aggregation method. + def apply(self, data: xr.Dataset) -> AggregationResults: + """Apply clustering to dataset for all (period, scenario) combinations. + + Args: + data: Dataset with time-varying data. Must have 'time' dimension. + May have 'period' and/or 'scenario' dimensions matching this object. + + Returns: + AggregationResults with full access to aggregated data. 
+ Use `.clustering` on the result to get ClusteringResults for IO. + + Example: + >>> agg_results = clustering_results.apply(dataset) + >>> agg_results.clustering # Get ClusteringResults for IO + >>> for key, result in agg_results: + ... print(result.cluster_representatives) + """ + from ..core import drop_constant_arrays - This dataclass captures all information needed to: - 1. Transform a FlowSystem to use aggregated (clustered) timesteps - 2. Expand a solution back to original resolution - 3. Properly weight results for statistics + results = {} + for key, cr in self._results.items(): + # Build selector for this key + selector = dict(zip(self._dim_names, key, strict=False)) + + # Select the slice for this (period, scenario) + data_slice = data.sel(**selector, drop=True) if selector else data + + # Drop constant arrays and convert to DataFrame + time_varying = drop_constant_arrays(data_slice, dim='time') + df = time_varying.to_dataframe() + + # Apply clustering + results[key] = cr.apply(df) + + return Clustering._from_aggregation_results(results, self._dim_names) + + +class Clustering: + """Clustering information for a FlowSystem. + + Thin wrapper around tsam 3.0's AggregationResult objects, providing: + 1. Multi-dimensional access for (period, scenario) combinations + 2. Structure properties (n_clusters, dims, coords, cluster_assignments) + 3. JSON persistence via ClusteringResults + + Use ``sel()`` to access individual tsam AggregationResult objects for + detailed analysis (cluster_representatives, accuracy, plotting). Attributes: - timestep_mapping: Maps each original timestep to its representative index. - dims: [original_time] for simple case, or - [original_time, period, scenario] for multi-period/scenario systems. - Values are indices into the representative timesteps (0 to n_representatives-1). - n_representatives: Number of representative timesteps after aggregation. - representative_weights: Weight for each representative timestep. - dims: [time] or [time, period, scenario] - Typically equals the number of original timesteps each representative covers. - Used as cluster_weight in the FlowSystem. - aggregated_data: Time series data aggregated to representative timesteps. - Optional - some backends may not aggregate data. - cluster_structure: Hierarchical clustering structure for storage linking. - Optional - only needed when using cluster() mode. - original_data: Reference to original data before aggregation. - Optional - useful for expand(). + results: ClusteringResults for structure access (works after JSON load). + original_timesteps: Original timesteps before clustering. + dims: Dimension names, e.g., ('period', 'scenario'). + coords: Coordinate values, e.g., {'period': [2024, 2025]}. 
Example: - For 8760 hourly timesteps clustered into 192 representative timesteps (8 clusters x 24h): - - timestep_mapping: shape (8760,), values 0-191 - - n_representatives: 192 - - representative_weights: shape (192,), summing to 8760 + >>> clustering = fs_clustered.clustering + >>> clustering.n_clusters + 8 + >>> clustering.dims + ('period',) + + # Access tsam AggregationResult for detailed analysis + >>> result = clustering.sel(period=2024) + >>> result.cluster_representatives # DataFrame + >>> result.accuracy # AccuracyMetrics + >>> result.plot.compare() # tsam's built-in plotting """ - timestep_mapping: xr.DataArray - n_representatives: int | xr.DataArray - representative_weights: xr.DataArray - aggregated_data: xr.Dataset | None = None - cluster_structure: ClusterStructure | None = None - original_data: xr.Dataset | None = None - - def __post_init__(self): - """Validate and ensure proper DataArray formatting.""" - # Ensure timestep_mapping is a DataArray - if not isinstance(self.timestep_mapping, xr.DataArray): - self.timestep_mapping = xr.DataArray(self.timestep_mapping, dims=['original_time'], name='timestep_mapping') - elif self.timestep_mapping.name is None: - self.timestep_mapping = self.timestep_mapping.rename('timestep_mapping') - - # Ensure representative_weights is a DataArray - # Can be (cluster, time) for 2D structure or (time,) for flat structure - if not isinstance(self.representative_weights, xr.DataArray): - self.representative_weights = xr.DataArray(self.representative_weights, name='representative_weights') - elif self.representative_weights.name is None: - self.representative_weights = self.representative_weights.rename('representative_weights') + # ========================================================================== + # Core properties (delegated to ClusteringResults) + # ========================================================================== - def __repr__(self) -> str: - n_rep = ( - int(self.n_representatives) - if isinstance(self.n_representatives, (int, np.integer)) - else int(self.n_representatives.values) - ) - has_structure = self.cluster_structure is not None - has_data = self.original_data is not None and self.aggregated_data is not None - return ( - f'ClusterResult(\n' - f' {self.n_original_timesteps} original → {n_rep} representative timesteps\n' - f' weights sum={float(self.representative_weights.sum().values):.0f}\n' - f' cluster_structure={has_structure}, data={has_data}\n' - f')' - ) + @property + def n_clusters(self) -> int: + """Number of clusters (typical periods).""" + return self.results.n_clusters - def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]: - """Create reference structure for serialization.""" - ref = {'__class__': self.__class__.__name__} - arrays = {} + @property + def timesteps_per_cluster(self) -> int: + """Number of timesteps in each cluster.""" + return self.results.timesteps_per_cluster - # Store DataArrays with references - arrays[str(self.timestep_mapping.name)] = self.timestep_mapping - ref['timestep_mapping'] = f':::{self.timestep_mapping.name}' + @property + def timesteps_per_period(self) -> int: + """Alias for timesteps_per_cluster.""" + return self.timesteps_per_cluster - arrays[str(self.representative_weights.name)] = self.representative_weights - ref['representative_weights'] = f':::{self.representative_weights.name}' + @property + def n_original_clusters(self) -> int: + """Number of original periods (before clustering).""" + return self.results.n_original_periods - # Store scalar 
values - if isinstance(self.n_representatives, xr.DataArray): - n_rep_name = self.n_representatives.name or 'n_representatives' - n_rep_da = self.n_representatives.rename(n_rep_name) - arrays[n_rep_name] = n_rep_da - ref['n_representatives'] = f':::{n_rep_name}' - else: - ref['n_representatives'] = int(self.n_representatives) + @property + def dim_names(self) -> list[str]: + """Names of extra dimensions, e.g., ['period', 'scenario'].""" + return self.results.dim_names + + @property + def dims(self) -> tuple[str, ...]: + """Dimension names as tuple (xarray-like).""" + return self.results.dims + + @property + def coords(self) -> dict[str, list]: + """Coordinate values for each dimension (xarray-like). + + Returns: + Dict mapping dimension names to lists of coordinate values. + + Example: + >>> clustering.coords + {'period': [2024, 2025], 'scenario': ['low', 'high']} + """ + return self.results.coords + + def sel( + self, + period: int | str | None = None, + scenario: str | None = None, + ) -> AggregationResult: + """Select AggregationResult by period and/or scenario. + + Access individual tsam AggregationResult objects for detailed analysis. + + Note: + This method is only available before saving/loading the FlowSystem. + After IO (to_dataset/from_dataset or to_json), the full AggregationResult + data is not preserved. Use `results.sel()` for structure-only access + after loading. - # Store nested ClusterStructure if present - if self.cluster_structure is not None: - cs_ref, cs_arrays = self.cluster_structure._create_reference_structure() - ref['cluster_structure'] = cs_ref - arrays.update(cs_arrays) + Args: + period: Period value (e.g., 2024). Required if clustering has periods. + scenario: Scenario name (e.g., 'high'). Required if clustering has scenarios. + + Returns: + The tsam AggregationResult for the specified combination. + Access its properties like `cluster_representatives`, `accuracy`, etc. - # Skip aggregated_data and original_data - not needed for serialization + Raises: + KeyError: If no result found for the specified combination. + ValueError: If accessed on a Clustering loaded from JSON/NetCDF. - return ref, arrays + Example: + >>> result = clustering.sel(period=2024, scenario='high') + >>> result.cluster_representatives # DataFrame with aggregated data + >>> result.accuracy # AccuracyMetrics + >>> result.plot.compare() # tsam's built-in comparison plot + """ + self._require_full_data('sel()') + # Build key from provided args in dim order + key_parts = [] + if 'period' in self._dim_names: + if period is None: + raise KeyError(f"'period' is required. Available: {self.coords.get('period', [])}") + key_parts.append(period) + if 'scenario' in self._dim_names: + if scenario is None: + raise KeyError(f"'scenario' is required. Available: {self.coords.get('scenario', [])}") + key_parts.append(scenario) + key = tuple(key_parts) + if key not in self._aggregation_results: + raise KeyError(f'No result found for period={period}, scenario={scenario}') + return self._aggregation_results[key] @property - def n_original_timesteps(self) -> int: - """Number of original timesteps (before aggregation).""" - return len(self.timestep_mapping.coords['original_time']) + def is_segmented(self) -> bool: + """Whether intra-period segmentation was used. - def get_expansion_mapping(self) -> xr.DataArray: - """Get mapping from original timesteps to representative indices. 
+ Segmented systems have variable timestep durations within each cluster, + where each segment represents a different number of original timesteps. + """ + return self.results.n_segments is not None - This is the same as timestep_mapping but ensures proper naming - for use in expand(). + @property + def n_segments(self) -> int | None: + """Number of segments per cluster, or None if not segmented.""" + return self.results.n_segments + + @property + def cluster_assignments(self) -> xr.DataArray: + """Mapping from original periods to cluster IDs. Returns: - DataArray mapping original timesteps to representative indices. + DataArray with dims [original_cluster] or [original_cluster, period?, scenario?]. """ - return self.timestep_mapping.rename('expansion_mapping') + return self.results.cluster_assignments - def get_timestep_mapping_for_slice(self, period: str | None = None, scenario: str | None = None) -> np.ndarray: - """Get timestep_mapping for a specific (period, scenario) combination. + @property + def n_representatives(self) -> int: + """Number of representative timesteps after clustering.""" + return self.n_clusters * self.timesteps_per_cluster - Args: - period: Period label (None if no period dimension). - scenario: Scenario label (None if no scenario dimension). + # ========================================================================== + # Derived properties + # ========================================================================== + + @property + def cluster_occurrences(self) -> xr.DataArray: + """Count of how many original periods each cluster represents. + + Returns: + DataArray with dims [cluster] or [cluster, period?, scenario?]. + """ + return self.results.cluster_occurrences + + @property + def representative_weights(self) -> xr.DataArray: + """Weight for each cluster (number of original periods it represents). + + This is the same as cluster_occurrences but named for API consistency. + Used as cluster_weight in FlowSystem. + """ + return self.cluster_occurrences.rename('representative_weights') + + @functools.cached_property + def timestep_mapping(self) -> xr.DataArray: + """Mapping from original timesteps to representative timestep indices. + + Each value indicates which representative timestep index (0 to n_representatives-1) + corresponds to each original timestep. + + Note: This property is cached for performance since it's accessed frequently + during expand() operations. + """ + return self._build_timestep_mapping() + + @property + def metrics(self) -> xr.Dataset: + """Clustering quality metrics (RMSE, MAE, etc.). + + Returns: + Dataset with dims [time_series, period?, scenario?], or empty Dataset if no metrics. + """ + if self._metrics is None: + return xr.Dataset() + return self._metrics + + @property + def cluster_start_positions(self) -> np.ndarray: + """Integer positions where clusters start in reduced timesteps. + + Returns: + 1D array: [0, T, 2T, ...] where T = timesteps_per_cluster. + """ + n_timesteps = self.n_clusters * self.timesteps_per_cluster + return np.arange(0, n_timesteps, self.timesteps_per_cluster) + + @property + def cluster_centers(self) -> xr.DataArray: + """Which original period is the representative (center) for each cluster. + + Returns: + DataArray with dims [cluster] containing original period indices. + """ + return self.results.cluster_centers + + @property + def segment_assignments(self) -> xr.DataArray | None: + """For each timestep within a cluster, which intra-period segment it belongs to. 
+
+        Only available if segmentation was configured during clustering.
+
+        Returns:
+            DataArray with dims [cluster, time] or None if no segmentation.
+        """
+        return self.results.segment_assignments
+
+    @property
+    def segment_durations(self) -> xr.DataArray | None:
+        """Duration of each intra-period segment, in timesteps.
+
+        Only available if segmentation was configured during clustering.
+
+        Returns:
+            DataArray with dims [cluster, segment] or None if no segmentation.
+        """
+        return self.results.segment_durations
+
+    @property
+    def segment_centers(self) -> xr.DataArray | None:
+        """Center of each intra-period segment.
+
+        Only available if segmentation was configured during clustering.
 
         Returns:
-            1D numpy array of representative timestep indices for the specified slice.
+            DataArray with dims [cluster, segment] or None if no segmentation.
         """
-        return _select_dims(self.timestep_mapping, period, scenario).values.astype(int)
+        return self.results.segment_centers
 
-    def expand_data(self, aggregated: xr.DataArray, original_time: xr.DataArray | None = None) -> xr.DataArray:
+    # ==========================================================================
+    # Methods
+    # ==========================================================================
+
+    def expand_data(
+        self,
+        aggregated: xr.DataArray,
+        original_time: pd.DatetimeIndex | None = None,
+    ) -> xr.DataArray:
         """Expand aggregated data back to original timesteps.
 
-        Uses the stored timestep_mapping to map each original timestep to its
-        representative value from the aggregated data. Handles multi-dimensional
-        data with period/scenario dimensions.
+        Uses the timestep_mapping to map each original timestep to its
+        representative value from the aggregated data. Fully vectorized using
+        xarray's advanced indexing - no loops over period/scenario dimensions.
 
         Args:
-            aggregated: DataArray with aggregated (reduced) time dimension.
-            original_time: Original time coordinates. If None, uses coords from
-                original_data if available.
+            aggregated: DataArray with aggregated (cluster, time) or (time,) dimensions.
+            original_time: Original time coordinates. Defaults to self.original_timesteps.
 
         Returns:
            DataArray expanded to original timesteps.
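+
+        Example:
+            Sketch, assuming a clustered FlowSystem whose aggregated data
+            contains 'Demand|profile':
+
+            >>> clustering = fs_clustered.clustering
+            >>> expanded = clustering.expand_data(clustering.aggregated_data['Demand|profile'])
+            >>> len(expanded.time) == len(clustering.original_timesteps)
+            True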
+ """ + if original_time is None: + original_time = self.original_timesteps - Example: - >>> result = fs_clustered.clustering.result - >>> aggregated_values = result.aggregated_data['Demand|profile'] - >>> expanded = result.expand_data(aggregated_values) - >>> len(expanded.time) == len(original_timesteps) # True + timestep_mapping = self.timestep_mapping # Already multi-dimensional DataArray + + if 'cluster' not in aggregated.dims: + # No cluster dimension: use mapping directly as time index + expanded = aggregated.isel(time=timestep_mapping) + else: + # Has cluster dimension: compute cluster and time indices from mapping + # For segmented systems, time dimension is n_segments, not timesteps_per_cluster + if self.is_segmented and self.n_segments is not None: + time_dim_size = self.n_segments + else: + time_dim_size = self.timesteps_per_cluster + + cluster_indices = timestep_mapping // time_dim_size + time_indices = timestep_mapping % time_dim_size + + # xarray's advanced indexing handles broadcasting across period/scenario dims + expanded = aggregated.isel(cluster=cluster_indices, time=time_indices) + + # Clean up: drop coordinate artifacts from isel, then rename original_time -> time + # The isel operation may leave 'cluster' and 'time' as non-dimension coordinates + expanded = expanded.drop_vars(['cluster', 'time'], errors='ignore') + expanded = expanded.rename({'original_time': 'time'}).assign_coords(time=original_time) + + return expanded.transpose('time', ...).assign_attrs(aggregated.attrs) + + def build_expansion_divisor( + self, + original_time: pd.DatetimeIndex | None = None, + ) -> xr.DataArray: + """Build divisor for correcting segment totals when expanding to hourly. + + For segmented systems, each segment value is a total that gets repeated N times + when expanded to hourly resolution (where N = segment duration in timesteps). + This divisor allows converting those totals back to hourly rates during expansion. + + For each original timestep, returns the number of original timesteps that map + to the same (cluster, segment) - i.e., the segment duration in timesteps. + + Fully vectorized using xarray's advanced indexing - no loops over period/scenario. + + Args: + original_time: Original time coordinates. Defaults to self.original_timesteps. + + Returns: + DataArray with dims ['time'] or ['time', 'period'?, 'scenario'?] containing + the number of timesteps in each segment, aligned to original timesteps. """ + if not self.is_segmented or self.n_segments is None: + raise ValueError('build_expansion_divisor requires a segmented clustering') + if original_time is None: - if self.original_data is None: - raise ValueError('original_time required when original_data is not available') - original_time = self.original_data.coords['time'] - - timestep_mapping = self.timestep_mapping - has_cluster_dim = 'cluster' in aggregated.dims - timesteps_per_cluster = self.cluster_structure.timesteps_per_cluster if has_cluster_dim else None - - def _expand_slice(mapping: np.ndarray, data: xr.DataArray) -> np.ndarray: - """Expand a single slice using the mapping.""" - # Validate that data has only expected dimensions for indexing - expected_dims = {'cluster', 'time'} if has_cluster_dim else {'time'} - actual_dims = set(data.dims) - unexpected_dims = actual_dims - expected_dims - if unexpected_dims: - raise ValueError( - f'Data slice has unexpected dimensions {unexpected_dims}. ' - f'Expected only {expected_dims}. Make sure period/scenario selections are applied.' 
- ) - if has_cluster_dim: - cluster_ids = mapping // timesteps_per_cluster - time_within = mapping % timesteps_per_cluster - # Ensure dimension order is (cluster, time) for correct indexing - if data.dims != ('cluster', 'time'): - data = data.transpose('cluster', 'time') - return data.values[cluster_ids, time_within] - return data.values[mapping] - - # Simple case: no period/scenario dimensions - extra_dims = [d for d in timestep_mapping.dims if d != 'original_time'] - if not extra_dims: - expanded_values = _expand_slice(timestep_mapping.values, aggregated) - return xr.DataArray(expanded_values, coords={'time': original_time}, dims=['time'], attrs=aggregated.attrs) - - # Multi-dimensional: expand each slice and recombine - dim_coords = {d: list(timestep_mapping.coords[d].values) for d in extra_dims} - expanded_slices = {} - for combo in np.ndindex(*[len(v) for v in dim_coords.values()]): - selector = {d: dim_coords[d][i] for d, i in zip(extra_dims, combo, strict=True)} - mapping = _select_dims(timestep_mapping, **selector).values - data_slice = ( - _select_dims(aggregated, **selector) if any(d in aggregated.dims for d in selector) else aggregated - ) - expanded_slices[tuple(selector.values())] = xr.DataArray( - _expand_slice(mapping, data_slice), coords={'time': original_time}, dims=['time'] - ) + original_time = self.original_timesteps - # Concatenate iteratively along each extra dimension - result_arrays = expanded_slices - for dim in reversed(extra_dims): - dim_vals = dim_coords[dim] - grouped = {} - for key, arr in result_arrays.items(): - rest_key = key[:-1] if len(key) > 1 else () - grouped.setdefault(rest_key, []).append(arr) - result_arrays = {k: xr.concat(v, dim=pd.Index(dim_vals, name=dim)) for k, v in grouped.items()} - result = list(result_arrays.values())[0] - return result.transpose('time', ...).assign_attrs(aggregated.attrs) - - def validate(self) -> None: - """Validate that all fields are consistent. + timestep_mapping = self.timestep_mapping # Already multi-dimensional + segment_durations = self.results.segment_durations # [cluster, segment, period?, scenario?] - Raises: - ValueError: If validation fails. + # Decode cluster and segment indices from timestep_mapping + # For segmented systems, encoding is: cluster_id * n_segments + segment_idx + time_dim_size = self.n_segments + cluster_indices = timestep_mapping // time_dim_size + segment_indices = timestep_mapping % time_dim_size # This IS the segment index + + # Get duration for each segment directly + # segment_durations[cluster, segment] -> duration + divisor = segment_durations.isel(cluster=cluster_indices, segment=segment_indices) + + # Clean up coordinates and rename + divisor = divisor.drop_vars(['cluster', 'time', 'segment'], errors='ignore') + divisor = divisor.rename({'original_time': 'time'}).assign_coords(time=original_time) + + return divisor.transpose('time', ...).rename('expansion_divisor') + + def get_result( + self, + period: Any = None, + scenario: Any = None, + ) -> TsamClusteringResult: + """Get the tsam ClusteringResult for a specific (period, scenario). + + Args: + period: Period label (if applicable). + scenario: Scenario label (if applicable). + + Returns: + The tsam ClusteringResult for the specified combination. + """ + return self.results.sel(period=period, scenario=scenario) + + def apply( + self, + data: pd.DataFrame, + period: Any = None, + scenario: Any = None, + ) -> AggregationResult: + """Apply the saved clustering to new data. + + Args: + data: DataFrame with time series data to cluster. 
+ period: Period label (if applicable). + scenario: Scenario label (if applicable). + + Returns: + tsam AggregationResult with the clustering applied. + """ + return self.results.sel(period=period, scenario=scenario).apply(data) + + def to_json(self, path: str | Path) -> None: + """Save the clustering for reuse. + + Uses ClusteringResults.to_dict() which preserves full tsam ClusteringResult. + Can be loaded later with Clustering.from_json() and used with + flow_system.transform.apply_clustering(). + + Args: + path: Path to save the JSON file. + """ + data = { + 'results': self.results.to_dict(), + 'original_timesteps': [ts.isoformat() for ts in self.original_timesteps], + } + + with open(path, 'w') as f: + json.dump(data, f, indent=2) + + @classmethod + def from_json( + cls, + path: str | Path, + original_timesteps: pd.DatetimeIndex | None = None, + ) -> Clustering: + """Load a clustering from JSON. + + The loaded Clustering has full apply() support because ClusteringResult + is fully preserved via tsam's serialization. + + Args: + path: Path to the JSON file. + original_timesteps: Original timesteps for the new FlowSystem. + If None, uses the timesteps stored in the JSON. + + Returns: + A Clustering that can be used with apply_clustering(). + """ + with open(path) as f: + data = json.load(f) + + results = ClusteringResults.from_dict(data['results']) + + if original_timesteps is None: + original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in data['original_timesteps']]) + + return cls( + results=results, + original_timesteps=original_timesteps, + ) + + # ========================================================================== + # Visualization + # ========================================================================== + + @property + def plot(self) -> ClusteringPlotAccessor: + """Access plotting methods for clustering visualization. + + Returns: + ClusteringPlotAccessor with compare(), heatmap(), and clusters() methods. 
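+
+        Example:
+            >>> fs_clustered.clustering.plot.compare()  # timeseries comparison
+            >>> fs_clustered.clustering.plot.compare(kind='duration_curve')  # duration curve
+            >>> fs_clustered.clustering.plot.heatmap()  # structure visualization
+            >>> fs_clustered.clustering.plot.clusters()  # cluster profiles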
""" - n_rep = ( - int(self.n_representatives) - if isinstance(self.n_representatives, (int, np.integer)) - else int(self.n_representatives.max().values) + return ClusteringPlotAccessor(self) + + # ========================================================================== + # Private helpers + # ========================================================================== + + def _build_timestep_mapping(self) -> xr.DataArray: + """Build timestep_mapping DataArray.""" + n_original = len(self.original_timesteps) + original_time_coord = self.original_timesteps.rename('original_time') + return self.results._build_property_array( + lambda cr: _build_timestep_mapping(cr, n_original), + base_dims=['original_time'], + base_coords={'original_time': original_time_coord}, + name='timestep_mapping', ) - # Check mapping values are within range - max_idx = int(self.timestep_mapping.max().values) - if max_idx >= n_rep: - raise ValueError(f'timestep_mapping contains index {max_idx} but n_representatives is {n_rep}') - - # Check weights dimensions - # representative_weights should have (cluster,) dimension with n_clusters elements - # (plus optional period/scenario dimensions) - if self.cluster_structure is not None: - n_clusters = self.cluster_structure.n_clusters - if 'cluster' in self.representative_weights.dims: - weights_n_clusters = self.representative_weights.sizes['cluster'] - if weights_n_clusters != n_clusters: - raise ValueError( - f'representative_weights has {weights_n_clusters} clusters ' - f'but cluster_structure has {n_clusters}' - ) + def _create_reference_structure(self, include_original_data: bool = True) -> tuple[dict, dict[str, xr.DataArray]]: + """Create serialization structure for to_dataset(). - # Check weights sum roughly equals number of original periods - # (each weight is how many original periods that cluster represents) - # Sum should be checked per period/scenario slice, not across all dimensions - if self.cluster_structure is not None: - n_original_clusters = self.cluster_structure.n_original_clusters - # Sum over cluster dimension only (keep period/scenario if present) - weight_sum_per_slice = self.representative_weights.sum(dim='cluster') - # Check each slice - if weight_sum_per_slice.size == 1: - # Simple case: no period/scenario - weight_sum = float(weight_sum_per_slice.values) - if abs(weight_sum - n_original_clusters) > 1e-6: - warnings.warn( - f'representative_weights sum ({weight_sum}) does not match ' - f'n_original_clusters ({n_original_clusters})', - stacklevel=2, - ) + Args: + include_original_data: Whether to include original_data in serialization. + Set to False for smaller files when plot.compare() isn't needed after IO. + Defaults to True. + + Returns: + Tuple of (reference_dict, arrays_dict). 
+ """ + arrays = {} + + # Collect original_data arrays + # Rename 'time' to 'original_time' to avoid conflict with clustered FlowSystem's time coord + original_data_refs = None + if include_original_data and self.original_data is not None: + original_data_refs = [] + # Use variables for faster access (avoids _construct_dataarray overhead) + variables = self.original_data.variables + for name in self.original_data.data_vars: + var = variables[name] + ref_name = f'original_data|{name}' + # Rename time dim to avoid xarray alignment issues + if 'time' in var.dims: + new_dims = tuple('original_time' if d == 'time' else d for d in var.dims) + arrays[ref_name] = xr.Variable(new_dims, var.values, attrs=var.attrs) + else: + arrays[ref_name] = var + original_data_refs.append(f':::{ref_name}') + + # NOTE: aggregated_data is NOT serialized - it's identical to the FlowSystem's + # main data arrays and would be redundant. After loading, aggregated_data is + # reconstructed from the FlowSystem's dataset. + + # Collect metrics arrays + metrics_refs = None + if self._metrics is not None: + metrics_refs = [] + # Use variables for faster access (avoids _construct_dataarray overhead) + metrics_vars = self._metrics.variables + for name in self._metrics.data_vars: + ref_name = f'metrics|{name}' + arrays[ref_name] = metrics_vars[name] + metrics_refs.append(f':::{ref_name}') + + reference = { + '__class__': 'Clustering', + 'results': self.results.to_dict(), # Full ClusteringResults serialization + 'original_timesteps': [ts.isoformat() for ts in self.original_timesteps], + '_original_data_refs': original_data_refs, + '_metrics_refs': metrics_refs, + } + + return reference, arrays + + def __init__( + self, + results: ClusteringResults | dict | None = None, + original_timesteps: pd.DatetimeIndex | list[str] | None = None, + original_data: xr.Dataset | None = None, + aggregated_data: xr.Dataset | None = None, + _metrics: xr.Dataset | None = None, + # These are for reconstruction from serialization + _original_data_refs: list[str] | None = None, + _metrics_refs: list[str] | None = None, + # Internal: AggregationResult dict for full data access + _aggregation_results: dict[tuple, AggregationResult] | None = None, + _dim_names: list[str] | None = None, + ): + """Initialize Clustering object. + + Args: + results: ClusteringResults instance, or dict from to_dict() (for deserialization). + Not needed if _aggregation_results is provided. + original_timesteps: Original timesteps before clustering. + original_data: Original dataset before clustering (for expand/plotting). + aggregated_data: Aggregated dataset after clustering (for plotting). + After loading from file, this is reconstructed from FlowSystem data. + _metrics: Pre-computed metrics dataset. + _original_data_refs: Internal: resolved DataArrays from serialization. + _metrics_refs: Internal: resolved DataArrays from serialization. + _aggregation_results: Internal: dict of AggregationResult for full data access. + _dim_names: Internal: dimension names when using _aggregation_results. 
+ """ + # Handle ISO timestamp strings from serialization + if ( + isinstance(original_timesteps, list) + and len(original_timesteps) > 0 + and isinstance(original_timesteps[0], str) + ): + original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in original_timesteps]) + + # Store AggregationResults if provided (full data access) + self._aggregation_results = _aggregation_results + self._dim_names = _dim_names or [] + + # Handle results - only needed for serialization path + if results is not None: + if isinstance(results, dict): + results = ClusteringResults.from_dict(results) + self._results_cache = results + else: + self._results_cache = None + + # Flag indicating this was loaded from serialization (missing full AggregationResult data) + self._from_serialization = _aggregation_results is None and results is not None + + self.original_timesteps = original_timesteps if original_timesteps is not None else pd.DatetimeIndex([]) + self._metrics = _metrics + + # Handle reconstructed data from refs (list of DataArrays) + if _original_data_refs is not None and isinstance(_original_data_refs, list): + # These are resolved DataArrays from the structure resolver + if all(isinstance(da, xr.DataArray) for da in _original_data_refs): + # Rename 'original_time' back to 'time' and strip 'original_data|' prefix + data_vars = {} + for da in _original_data_refs: + if 'original_time' in da.dims: + da = da.rename({'original_time': 'time'}) + # Strip 'original_data|' prefix from name (added during serialization) + name = da.name + if name.startswith('original_data|'): + name = name[14:] # len('original_data|') = 14 + data_vars[name] = da.rename(name) + self.original_data = xr.Dataset(data_vars) else: - # Multi-dimensional: check each slice - for val in weight_sum_per_slice.values.flat: - if abs(float(val) - n_original_clusters) > 1e-6: - warnings.warn( - f'representative_weights sum per slice ({float(val)}) does not match ' - f'n_original_clusters ({n_original_clusters})', - stacklevel=2, - ) - break # Only warn once + self.original_data = original_data + else: + self.original_data = original_data + + self.aggregated_data = aggregated_data + + if _metrics_refs is not None and isinstance(_metrics_refs, list): + if all(isinstance(da, xr.DataArray) for da in _metrics_refs): + # Strip 'metrics|' prefix from name (added during serialization) + data_vars = {} + for da in _metrics_refs: + name = da.name + if name.startswith('metrics|'): + name = name[8:] # len('metrics|') = 8 + data_vars[name] = da.rename(name) + self._metrics = xr.Dataset(data_vars) + + @property + def results(self) -> ClusteringResults: + """ClusteringResults for structure access (derived from AggregationResults or cached).""" + if self._results_cache is not None: + return self._results_cache + if self._aggregation_results is not None: + # Derive from AggregationResults (cached on first access) + self._results_cache = ClusteringResults( + {k: r.clustering for k, r in self._aggregation_results.items()}, + self._dim_names, + ) + return self._results_cache + raise ValueError('No results available - neither AggregationResults nor ClusteringResults set') + + @classmethod + def _from_aggregation_results( + cls, + aggregation_results: dict[tuple, AggregationResult], + dim_names: list[str], + original_timesteps: pd.DatetimeIndex | None = None, + original_data: xr.Dataset | None = None, + ) -> Clustering: + """Create Clustering from AggregationResult dict. + + This is the primary way to create a Clustering with full data access. 
+ Called by ClusteringResults.apply() and TransformAccessor. + + Args: + aggregation_results: Dict mapping (period, scenario) tuples to AggregationResult. + dim_names: Dimension names, e.g., ['period', 'scenario']. + original_timesteps: Original timesteps (optional, for expand). + original_data: Original dataset (optional, for plotting). + + Returns: + Clustering with full AggregationResult access. + """ + return cls( + original_timesteps=original_timesteps, + original_data=original_data, + _aggregation_results=aggregation_results, + _dim_names=dim_names, + ) + + # ========================================================================== + # Iteration over AggregationResults (for direct access to tsam results) + # ========================================================================== + + def __iter__(self): + """Iterate over (key, AggregationResult) pairs. + + Raises: + ValueError: If accessed on a Clustering loaded from JSON. + """ + self._require_full_data('iteration') + return iter(self._aggregation_results.items()) + + def __len__(self) -> int: + """Number of (period, scenario) combinations.""" + if self._aggregation_results is not None: + return len(self._aggregation_results) + return len(list(self.results.keys())) + + def __getitem__(self, key: tuple) -> AggregationResult: + """Get AggregationResult by (period, scenario) key. + + Raises: + ValueError: If accessed on a Clustering loaded from JSON. + """ + self._require_full_data('item access') + return self._aggregation_results[key] + + def items(self): + """Iterate over (key, AggregationResult) pairs. + + Raises: + ValueError: If accessed on a Clustering loaded from JSON. + """ + self._require_full_data('items()') + return self._aggregation_results.items() + + def keys(self): + """Iterate over (period, scenario) keys.""" + if self._aggregation_results is not None: + return self._aggregation_results.keys() + return self.results.keys() + + def values(self): + """Iterate over AggregationResult objects. + + Raises: + ValueError: If accessed on a Clustering loaded from JSON. + """ + self._require_full_data('values()') + return self._aggregation_results.values() + + def _require_full_data(self, operation: str) -> None: + """Raise error if full AggregationResult data is not available.""" + if self._from_serialization: + raise ValueError( + f'{operation} requires full AggregationResult data, ' + f'but this Clustering was loaded from JSON. ' + f'Use apply_clustering() to get full results.' + ) + + def __repr__(self) -> str: + return ( + f'Clustering(\n' + f' {self.n_original_clusters} periods → {self.n_clusters} clusters\n' + f' timesteps_per_cluster={self.timesteps_per_cluster}\n' + f' dims={self.dim_names}\n' + f')' + ) class ClusteringPlotAccessor: @@ -540,13 +1358,6 @@ class ClusteringPlotAccessor: Provides visualization methods for comparing original vs aggregated data and understanding the clustering structure. - - Example: - >>> fs_clustered = flow_system.transform.cluster(n_clusters=8, cluster_duration='1D') - >>> fs_clustered.clustering.plot.compare() # timeseries comparison - >>> fs_clustered.clustering.plot.compare(kind='duration_curve') # duration curve - >>> fs_clustered.clustering.plot.heatmap() # structure visualization - >>> fs_clustered.clustering.plot.clusters() # cluster profiles """ def __init__(self, clustering: Clustering): @@ -583,7 +1394,6 @@ def compare( Returns: PlotResult containing the comparison figure and underlying data. 
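+
+        Example:
+            Sketch, assuming 'Demand|profile' is a time-varying variable:
+
+            >>> clustering.plot.compare()  # all time-varying variables
+            >>> clustering.plot.compare(kind='duration_curve')
+            >>> clustering.plot.compare(variables='Demand|profile', data_only=True)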
""" - import pandas as pd import plotly.graph_objects as go from ..config import CONFIG @@ -593,8 +1403,8 @@ def compare( if kind not in ('timeseries', 'duration_curve'): raise ValueError(f"Unknown kind '{kind}'. Use 'timeseries' or 'duration_curve'.") - result = self._clustering.result - if result.original_data is None or result.aggregated_data is None: + clustering = self._clustering + if clustering.original_data is None or clustering.aggregated_data is None: raise ValueError('No original/aggregated data available for comparison') resolved_variables = self._resolve_variables(variables) @@ -602,21 +1412,25 @@ def compare( # Build Dataset with variables as data_vars data_vars = {} for var in resolved_variables: - original = result.original_data[var] - clustered = result.expand_data(result.aggregated_data[var]) + original = clustering.original_data[var] + clustered = clustering.expand_data(clustering.aggregated_data[var]) combined = xr.concat([original, clustered], dim=pd.Index(['Original', 'Clustered'], name='representation')) data_vars[var] = combined ds = xr.Dataset(data_vars) - # Apply selection ds = _apply_selection(ds, select) - # For duration curve: flatten and sort values if kind == 'duration_curve': sorted_vars = {} + # Use variables for faster access (avoids _construct_dataarray overhead) + variables = ds.variables + rep_values = ds.coords['representation'].values + rep_idx = {rep: i for i, rep in enumerate(rep_values)} for var in ds.data_vars: - for rep in ds.coords['representation'].values: - values = np.sort(ds[var].sel(representation=rep).values.flatten())[::-1] + data = variables[var].values + for rep in rep_values: + # Direct numpy indexing instead of .sel() + values = np.sort(data[rep_idx[rep]].flatten())[::-1] sorted_vars[(var, rep)] = values # Get length from first sorted array n = len(next(iter(sorted_vars.values()))) @@ -631,15 +1445,15 @@ def compare( } ) - # Set title based on kind - if kind == 'timeseries': - title = ( + title = ( + ( 'Original vs Clustered' if len(resolved_variables) > 1 else f'Original vs Clustered: {resolved_variables[0]}' ) - else: - title = 'Duration Curve' if len(resolved_variables) > 1 else f'Duration Curve: {resolved_variables[0]}' + if kind == 'timeseries' + else ('Duration Curve' if len(resolved_variables) > 1 else f'Duration Curve: {resolved_variables[0]}') + ) # Early return for data_only mode if data_only: @@ -674,15 +1488,24 @@ def compare( return plot_result def _get_time_varying_variables(self) -> list[str]: - """Get list of time-varying variables from original data.""" - result = self._clustering.result - if result.original_data is None: + """Get list of time-varying variables from original data that also exist in aggregated data.""" + if self._clustering.original_data is None: return [] + # Get variables that exist in both original and aggregated data + aggregated_vars = ( + set(self._clustering.aggregated_data.data_vars) + if self._clustering.aggregated_data is not None + else set(self._clustering.original_data.data_vars) + ) return [ name - for name in result.original_data.data_vars - if 'time' in result.original_data[name].dims - and not np.isclose(result.original_data[name].min(), result.original_data[name].max()) + for name in self._clustering.original_data.data_vars + if name in aggregated_vars + and 'time' in self._clustering.original_data[name].dims + and not np.isclose( + self._clustering.original_data[name].min(), + self._clustering.original_data[name].max(), + ) ] def _resolve_variables(self, variables: str | list[str] | 
None) -> list[str]: @@ -741,35 +1564,20 @@ def heatmap( from ..plot_result import PlotResult from ..statistics_accessor import _apply_selection - result = self._clustering.result - cs = result.cluster_structure - if cs is None: - raise ValueError('No cluster structure available') + clustering = self._clustering + cluster_assignments = clustering.cluster_assignments + timesteps_per_cluster = clustering.timesteps_per_cluster + original_time = clustering.original_timesteps - cluster_order_da = cs.cluster_order - timesteps_per_cluster = cs.timesteps_per_cluster - original_time = result.original_data.coords['time'] if result.original_data is not None else None - - # Apply selection if provided if select: - cluster_order_da = _apply_selection(cluster_order_da.to_dataset(name='cluster'), select)['cluster'] - - # Expand cluster_order to per-timestep: repeat each value timesteps_per_cluster times - # Uses np.repeat along axis=0 (original_cluster dim) - extra_dims = [d for d in cluster_order_da.dims if d != 'original_cluster'] - expanded_values = np.repeat(cluster_order_da.values, timesteps_per_cluster, axis=0) + cluster_assignments = _apply_selection(cluster_assignments.to_dataset(name='cluster'), select)['cluster'] - # Validate length consistency when using original time coordinates - if original_time is not None and len(original_time) != expanded_values.shape[0]: - raise ValueError( - f'Length mismatch: original_time has {len(original_time)} elements but expanded ' - f'cluster data has {expanded_values.shape[0]} elements ' - f'(n_clusters={cluster_order_da.sizes.get("original_cluster", len(cluster_order_da))} * ' - f'timesteps_per_cluster={timesteps_per_cluster})' - ) + # Expand cluster_assignments to per-timestep + extra_dims = [d for d in cluster_assignments.dims if d != 'original_cluster'] + expanded_values = np.repeat(cluster_assignments.values, timesteps_per_cluster, axis=0) - coords = {'time': original_time} if original_time is not None else {} - coords.update({d: cluster_order_da.coords[d].values for d in extra_dims}) + coords = {'time': original_time} + coords.update({d: cluster_assignments.coords[d].values for d in extra_dims}) cluster_da = xr.DataArray(expanded_values, dims=['time'] + extra_dims, coords=coords) cluster_da.name = 'cluster' @@ -777,7 +1585,6 @@ def heatmap( if data_only: return PlotResult(data=xr.Dataset({'cluster': cluster_da}), figure=go.Figure()) - # Add dummy y dimension for heatmap visualization (single row) heatmap_da = cluster_da.expand_dims('y', axis=-1).assign_coords(y=['Cluster']) heatmap_da.name = 'cluster_assignment' heatmap_da = heatmap_da.transpose('time', 'y', ...) 
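The per-timestep expansion above is a plain ``np.repeat`` along the
``original_cluster`` axis; a minimal sketch with hypothetical numbers:

    import numpy as np

    # 3 original periods, timesteps_per_cluster = 4
    cluster_assignments = np.array([0, 2, 1])  # representative cluster per original period
    per_timestep = np.repeat(cluster_assignments, 4)
    print(per_timestep)  # [0 0 0 0 2 2 2 2 1 1 1 1]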
@@ -792,7 +1599,6 @@ def heatmap( **plotly_kwargs, ) - # Clean up: hide y-axis since it's just a single row fig.update_yaxes(showticklabels=False) fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1])) @@ -842,64 +1648,32 @@ def clusters( from ..plot_result import PlotResult from ..statistics_accessor import _apply_selection - result = self._clustering.result - cs = result.cluster_structure - if result.aggregated_data is None or cs is None: - raise ValueError('No aggregated data or cluster structure available') - - # Apply selection to aggregated data - aggregated_data = _apply_selection(result.aggregated_data, select) + clustering = self._clustering + if clustering.aggregated_data is None: + raise ValueError('No aggregated data available') - time_vars = self._get_time_varying_variables() - if not time_vars: - raise ValueError('No time-varying variables found') - - # Resolve variables + aggregated_data = _apply_selection(clustering.aggregated_data, select) resolved_variables = self._resolve_variables(variables) - n_clusters = int(cs.n_clusters) if isinstance(cs.n_clusters, (int, np.integer)) else int(cs.n_clusters.values) - timesteps_per_cluster = cs.timesteps_per_cluster + n_clusters = clustering.n_clusters + timesteps_per_cluster = clustering.timesteps_per_cluster + cluster_occurrences = clustering.cluster_occurrences - # Check dimensions of all variables for consistency - has_cluster_dim = None - for var in resolved_variables: - da = aggregated_data[var] - var_has_cluster = 'cluster' in da.dims - extra_dims = [d for d in da.dims if d not in ('time', 'cluster')] - if extra_dims: - raise ValueError( - f'clusters() requires data with only time (or cluster, time) dimensions. ' - f'Variable {var!r} has extra dimensions: {extra_dims}. ' - f'Use select={{{extra_dims[0]!r}: }} to select a specific {extra_dims[0]}.' - ) - if has_cluster_dim is None: - has_cluster_dim = var_has_cluster - elif has_cluster_dim != var_has_cluster: - raise ValueError( - f'All variables must have consistent dimensions. ' - f'Variable {var!r} has {"" if var_has_cluster else "no "}cluster dimension, ' - f'but previous variables {"do" if has_cluster_dim else "do not"}.' 
- ) - - # Build Dataset with cluster dimension, using labels with occurrence counts - # Check if cluster_occurrences has extra dims - occ_extra_dims = [d for d in cs.cluster_occurrences.dims if d not in ('cluster',)] + # Build cluster labels + occ_extra_dims = [d for d in cluster_occurrences.dims if d != 'cluster'] if occ_extra_dims: - # Use simple labels without occurrence counts for multi-dim case cluster_labels = [f'Cluster {c}' for c in range(n_clusters)] else: cluster_labels = [ - f'Cluster {c} (×{int(cs.cluster_occurrences.sel(cluster=c).values)})' for c in range(n_clusters) + f'Cluster {c} (×{int(cluster_occurrences.sel(cluster=c).values)})' for c in range(n_clusters) ] data_vars = {} for var in resolved_variables: da = aggregated_data[var] - if has_cluster_dim: - # Data already has (cluster, time) dims - just update cluster labels + if 'cluster' in da.dims: data_by_cluster = da.values else: - # Data has (time,) dim - reshape to (cluster, time) data_by_cluster = da.values.reshape(n_clusters, timesteps_per_cluster) data_vars[var] = xr.DataArray( data_by_cluster, @@ -911,7 +1685,7 @@ def clusters( # Early return for data_only mode (include occurrences in result) if data_only: - data_vars['occurrences'] = cs.cluster_occurrences + data_vars['occurrences'] = cluster_occurrences return PlotResult(data=xr.Dataset(data_vars), figure=go.Figure()) title = 'Clusters' if len(resolved_variables) > 1 else f'Clusters: {resolved_variables[0]}' @@ -933,8 +1707,7 @@ def clusters( fig.update_yaxes(matches=None) fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1])) - # Include occurrences in result data - data_vars['occurrences'] = cs.cluster_occurrences + data_vars['occurrences'] = cluster_occurrences result_data = xr.Dataset(data_vars) plot_result = PlotResult(data=result_data, figure=fig) @@ -946,222 +1719,12 @@ def clusters( return plot_result -@dataclass -class Clustering: - """Information about an aggregation stored on a FlowSystem. - - This is stored on the FlowSystem after aggregation to enable: - - expand() to map back to original timesteps - - Statistics to properly weight results - - Inter-cluster storage linking - - Serialization/deserialization of aggregated models - - Attributes: - result: The ClusterResult from the aggregation backend. - backend_name: Name of the aggregation backend used (e.g., 'tsam', 'manual'). - metrics: Clustering quality metrics (RMSE, MAE, etc.) as xr.Dataset. - Each metric (e.g., 'RMSE', 'MAE') is a DataArray with dims - ``[time_series, period?, scenario?]``. 
- - Example: - >>> fs_clustered = flow_system.transform.cluster(n_clusters=8, cluster_duration='1D') - >>> fs_clustered.clustering.n_clusters - 8 - >>> fs_clustered.clustering.plot.compare() - >>> fs_clustered.clustering.plot.heatmap() - """ - - result: ClusterResult - backend_name: str = 'unknown' - metrics: xr.Dataset | None = None - - def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]: - """Create reference structure for serialization.""" - ref = {'__class__': self.__class__.__name__} - arrays = {} - - # Store nested ClusterResult - result_ref, result_arrays = self.result._create_reference_structure() - ref['result'] = result_ref - arrays.update(result_arrays) - - # Store scalar values - ref['backend_name'] = self.backend_name - - return ref, arrays - - def __repr__(self) -> str: - cs = self.result.cluster_structure - if cs is not None: - n_clusters = ( - int(cs.n_clusters) if isinstance(cs.n_clusters, (int, np.integer)) else int(cs.n_clusters.values) - ) - structure_info = f'{cs.n_original_clusters} periods → {n_clusters} clusters' - else: - structure_info = 'no structure' - return f'Clustering(\n backend={self.backend_name!r}\n {structure_info}\n)' - - @property - def plot(self) -> ClusteringPlotAccessor: - """Access plotting methods for clustering visualization. - - Returns: - ClusteringPlotAccessor with compare(), heatmap(), and clusters() methods. - - Example: - >>> fs.clustering.plot.compare() # timeseries comparison - >>> fs.clustering.plot.compare(kind='duration_curve') # duration curve - >>> fs.clustering.plot.heatmap() # structure visualization - >>> fs.clustering.plot.clusters() # cluster profiles - """ - return ClusteringPlotAccessor(self) - - # Convenience properties delegating to nested objects - - @property - def cluster_order(self) -> xr.DataArray: - """Which cluster each original period belongs to.""" - if self.result.cluster_structure is None: - raise ValueError('No cluster_structure available') - return self.result.cluster_structure.cluster_order - - @property - def occurrences(self) -> xr.DataArray: - """How many original periods each cluster represents.""" - if self.result.cluster_structure is None: - raise ValueError('No cluster_structure available') - return self.result.cluster_structure.cluster_occurrences - - @property - def n_clusters(self) -> int: - """Number of clusters.""" - if self.result.cluster_structure is None: - raise ValueError('No cluster_structure available') - n = self.result.cluster_structure.n_clusters - return int(n) if isinstance(n, (int, np.integer)) else int(n.values) - - @property - def n_original_clusters(self) -> int: - """Number of original periods (before clustering).""" - if self.result.cluster_structure is None: - raise ValueError('No cluster_structure available') - return self.result.cluster_structure.n_original_clusters - - @property - def timesteps_per_period(self) -> int: - """Number of timesteps in each period/cluster. - - Alias for :attr:`timesteps_per_cluster`. 
- """ - return self.timesteps_per_cluster - - @property - def timesteps_per_cluster(self) -> int: - """Number of timesteps in each cluster.""" - if self.result.cluster_structure is None: - raise ValueError('No cluster_structure available') - return self.result.cluster_structure.timesteps_per_cluster - - @property - def timestep_mapping(self) -> xr.DataArray: - """Mapping from original timesteps to representative timestep indices.""" - return self.result.timestep_mapping - - @property - def cluster_start_positions(self) -> np.ndarray: - """Integer positions where clusters start. - - Returns the indices of the first timestep of each cluster. - Use these positions to build masks for specific use cases. - - Returns: - 1D numpy array of positions: [0, T, 2T, ...] where T = timesteps_per_period. - - Example: - For 2 clusters with 24 timesteps each: - >>> clustering.cluster_start_positions - array([0, 24]) - """ - if self.result.cluster_structure is None: - raise ValueError('No cluster_structure available') - - n_timesteps = self.n_clusters * self.timesteps_per_period - return np.arange(0, n_timesteps, self.timesteps_per_period) - - @property - def original_timesteps(self) -> pd.DatetimeIndex: - """Original timesteps before clustering. - - Derived from the 'original_time' coordinate of timestep_mapping. - - Raises: - KeyError: If 'original_time' coordinate is missing from timestep_mapping. - """ - if 'original_time' not in self.result.timestep_mapping.coords: - raise KeyError( - "timestep_mapping is missing 'original_time' coordinate. " - 'This may indicate corrupted or incompatible clustering results.' - ) - return pd.DatetimeIndex(self.result.timestep_mapping.coords['original_time'].values) - - -def create_cluster_structure_from_mapping( - timestep_mapping: xr.DataArray, - timesteps_per_cluster: int, -) -> ClusterStructure: - """Create ClusterStructure from a timestep mapping. - - This is a convenience function for creating ClusterStructure when you - have the timestep mapping but not the full clustering metadata. - - Args: - timestep_mapping: Mapping from original timesteps to representative indices. - timesteps_per_cluster: Number of timesteps per cluster period. - - Returns: - ClusterStructure derived from the mapping. 
- """ - n_original = len(timestep_mapping) - n_original_clusters = n_original // timesteps_per_cluster - - # Determine cluster order from the mapping - # Each original period maps to the cluster of its first timestep - cluster_order = [] - for p in range(n_original_clusters): - start_idx = p * timesteps_per_cluster - cluster_idx = int(timestep_mapping.isel(original_time=start_idx).values) // timesteps_per_cluster - cluster_order.append(cluster_idx) - - cluster_order_da = xr.DataArray(cluster_order, dims=['original_cluster'], name='cluster_order') - - # Count occurrences of each cluster - unique_clusters = np.unique(cluster_order) - n_clusters = int(unique_clusters.max()) + 1 if len(unique_clusters) > 0 else 0 - occurrences = {} - for c in unique_clusters: - occurrences[int(c)] = sum(1 for x in cluster_order if x == c) - - cluster_occurrences_da = xr.DataArray( - [occurrences.get(c, 0) for c in range(n_clusters)], - dims=['cluster'], - name='cluster_occurrences', - ) - - return ClusterStructure( - cluster_order=cluster_order_da, - cluster_occurrences=cluster_occurrences_da, - n_clusters=n_clusters, - timesteps_per_cluster=timesteps_per_cluster, - ) +# Backwards compatibility alias +AggregationResults = Clustering def _register_clustering_classes(): - """Register clustering classes for IO. - - Called from flow_system.py after all imports are complete to avoid circular imports. - """ + """Register clustering classes for IO.""" from ..structure import CLASS_REGISTRY - CLASS_REGISTRY['ClusterStructure'] = ClusterStructure - CLASS_REGISTRY['ClusterResult'] = ClusterResult CLASS_REGISTRY['Clustering'] = Clustering diff --git a/flixopt/clustering/intercluster_helpers.py b/flixopt/clustering/intercluster_helpers.py index 43758b79e..bce1ab99b 100644 --- a/flixopt/clustering/intercluster_helpers.py +++ b/flixopt/clustering/intercluster_helpers.py @@ -11,7 +11,7 @@ - **SOC_boundary**: Absolute state-of-charge at the boundary between original periods. With N original periods, there are N+1 boundary points. -- **Linking**: SOC_boundary[d+1] = SOC_boundary[d] + delta_SOC[cluster_order[d]] +- **Linking**: SOC_boundary[d+1] = SOC_boundary[d] + delta_SOC[cluster_assignments[d]] Each boundary is connected to the next via the net charge change of the representative cluster for that period. 
diff --git a/flixopt/components.py b/flixopt/components.py index 7cb5b9fc4..4b91fe6ff 100644 --- a/flixopt/components.py +++ b/flixopt/components.py @@ -4,6 +4,7 @@ from __future__ import annotations +import functools import logging import warnings from typing import TYPE_CHECKING, Literal @@ -17,7 +18,7 @@ from .features import InvestmentModel, PiecewiseModel from .interface import InvestParameters, PiecewiseConversion, StatusParameters from .modeling import BoundingPatterns, _scalar_safe_isel, _scalar_safe_isel_drop, _scalar_safe_reduce -from .structure import FlowSystemModel, register_class_for_io +from .structure import FlowSystemModel, VariableCategory, register_class_for_io if TYPE_CHECKING: import linopy @@ -944,8 +945,13 @@ def _create_storage_variables(self): upper=ub, coords=self._model.get_coords(extra_timestep=True), short_name='charge_state', + category=VariableCategory.CHARGE_STATE, + ) + self.add_variables( + coords=self._model.get_coords(), + short_name='netto_discharge', + category=VariableCategory.NETTO_DISCHARGE, ) - self.add_variables(coords=self._model.get_coords(), short_name='netto_discharge') def _add_netto_discharge_constraint(self): """Add constraint: netto_discharge = discharging - charging.""" @@ -976,6 +982,7 @@ def _add_investment_model(self): label_of_element=self.label_of_element, label_of_model=self.label_of_element, parameters=self.element.capacity_in_flow_hours, + size_category=VariableCategory.STORAGE_SIZE, ), short_name='investment', ) @@ -1096,7 +1103,7 @@ def _absolute_charge_state_bounds(self) -> tuple[xr.DataArray, xr.DataArray]: relative_upper_bound * cap, ) - @property + @functools.cached_property def _relative_charge_state_bounds(self) -> tuple[xr.DataArray, xr.DataArray]: """ Get relative charge state bounds with final timestep values. @@ -1146,7 +1153,9 @@ def _relative_charge_state_bounds(self) -> tuple[xr.DataArray, xr.DataArray]: # Original is scalar - broadcast to full time range (constant value) max_bounds = rel_max.expand_dims(time=timesteps_extra) - return min_bounds, max_bounds + # Ensure both bounds have matching dimensions (broadcast once here, + # so downstream code doesn't need to handle dimension mismatches) + return xr.broadcast(min_bounds, max_bounds) @property def _investment(self) -> InvestmentModel | None: @@ -1208,7 +1217,7 @@ class InterclusterStorageModel(StorageModel): 1. **Cluster start constraint**: ``ΔE(cluster_start) = 0`` Each representative cluster starts with zero relative charge. - 2. **Linking constraint**: ``SOC_boundary[d+1] = SOC_boundary[d] + delta_SOC[cluster_order[d]]`` + 2. **Linking constraint**: ``SOC_boundary[d+1] = SOC_boundary[d] + delta_SOC[cluster_assignments[d]]`` The boundary SOC after period d equals the boundary before plus the net charge/discharge of the representative cluster for that period. 
@@ -1313,6 +1322,7 @@ def _add_investment_model(self): label_of_element=self.label_of_element, label_of_model=self.label_of_element, parameters=self.element.capacity_in_flow_hours, + size_category=VariableCategory.STORAGE_SIZE, ), short_name='investment', ) @@ -1347,18 +1357,13 @@ def _add_intercluster_linking(self) -> None: ) clustering = self._model.flow_system.clustering - if clustering is None or clustering.result.cluster_structure is None: + if clustering is None: return - cluster_structure = clustering.result.cluster_structure - n_clusters = ( - int(cluster_structure.n_clusters) - if isinstance(cluster_structure.n_clusters, (int, np.integer)) - else int(cluster_structure.n_clusters.values) - ) - timesteps_per_cluster = cluster_structure.timesteps_per_cluster - n_original_clusters = cluster_structure.n_original_clusters - cluster_order = cluster_structure.cluster_order + n_clusters = clustering.n_clusters + timesteps_per_cluster = clustering.timesteps_per_cluster + n_original_clusters = clustering.n_original_clusters + cluster_assignments = clustering.cluster_assignments # 1. Constrain ΔE = 0 at cluster starts self._add_cluster_start_constraints(n_clusters, timesteps_per_cluster) @@ -1374,6 +1379,7 @@ def _add_intercluster_linking(self) -> None: coords=boundary_coords, dims=boundary_dims, short_name='SOC_boundary', + category=VariableCategory.SOC_BOUNDARY, ) # 3. Link SOC_boundary to investment size @@ -1388,7 +1394,7 @@ def _add_intercluster_linking(self) -> None: # 5. Add linking constraints self._add_linking_constraints( - soc_boundary, delta_soc, cluster_order, n_original_clusters, timesteps_per_cluster + soc_boundary, delta_soc, cluster_assignments, n_original_clusters, timesteps_per_cluster ) # 6. Add cyclic or initial constraint @@ -1417,7 +1423,7 @@ def _add_intercluster_linking(self) -> None: # 7. Add combined bound constraints self._add_combined_bound_constraints( soc_boundary, - cluster_order, + cluster_assignments, capacity_bounds.has_investment, n_original_clusters, timesteps_per_cluster, @@ -1467,14 +1473,14 @@ def _add_linking_constraints( self, soc_boundary: xr.DataArray, delta_soc: xr.DataArray, - cluster_order: xr.DataArray, + cluster_assignments: xr.DataArray, n_original_clusters: int, timesteps_per_cluster: int, ) -> None: """Add constraints linking consecutive SOC_boundary values. Per Blanke et al. (2022) Eq. 5, implements: - SOC_boundary[d+1] = SOC_boundary[d] * (1-loss)^N + delta_SOC[cluster_order[d]] + SOC_boundary[d+1] = SOC_boundary[d] * (1-loss)^N + delta_SOC[cluster_assignments[d]] where N is timesteps_per_cluster and loss is self-discharge rate per timestep. @@ -1484,7 +1490,7 @@ def _add_linking_constraints( Args: soc_boundary: SOC_boundary variable. delta_soc: Net SOC change per cluster. - cluster_order: Mapping from original periods to representative clusters. + cluster_assignments: Mapping from original periods to representative clusters. n_original_clusters: Number of original (non-clustered) periods. timesteps_per_cluster: Number of timesteps in each cluster period. 
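+
+        Note:
+            The decay exponent used here is the total number of hours in a
+            cluster (sum of timestep durations), since ``relative_loss_per_hour``
+            is defined per hour; for one-hour timesteps this equals
+            ``timesteps_per_cluster``.
+
+        Example:
+            With ``relative_loss_per_hour = 0.01`` and 24 one-hour timesteps per
+            cluster, the decay factor is ``(1 - 0.01) ** 24 ≈ 0.786``, so
+            ``SOC_boundary[d+1] ≈ 0.786 * SOC_boundary[d] + delta_SOC[cluster_assignments[d]]``.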
""" @@ -1497,16 +1503,16 @@ def _add_linking_constraints( soc_before = soc_before.rename({'cluster_boundary': 'original_cluster'}) soc_before = soc_before.assign_coords(original_cluster=np.arange(n_original_clusters)) - # Get delta_soc for each original period using cluster_order - delta_soc_ordered = delta_soc.isel(cluster=cluster_order) + # Get delta_soc for each original period using cluster_assignments + delta_soc_ordered = delta_soc.isel(cluster=cluster_assignments) # Apply self-discharge decay factor (1-loss)^hours to soc_before per Eq. 5 - # relative_loss_per_hour is per-hour, so we need hours = timesteps * duration - # Use mean over time (linking operates at period level, not timestep) + # relative_loss_per_hour is per-hour, so we need total hours per cluster + # Use sum over time to get total duration (handles both regular and segmented systems) # Keep as DataArray to respect per-period/scenario values rel_loss = _scalar_safe_reduce(self.element.relative_loss_per_hour, 'time', 'mean') - hours_per_cluster = timesteps_per_cluster * _scalar_safe_reduce(self._model.timestep_duration, 'time', 'mean') - decay_n = (1 - rel_loss) ** hours_per_cluster + total_hours_per_cluster = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'sum') + decay_n = (1 - rel_loss) ** total_hours_per_cluster lhs = soc_after - soc_before * decay_n - delta_soc_ordered self.add_constraints(lhs == 0, short_name='link') @@ -1514,7 +1520,7 @@ def _add_linking_constraints( def _add_combined_bound_constraints( self, soc_boundary: xr.DataArray, - cluster_order: xr.DataArray, + cluster_assignments: xr.DataArray, has_investment: bool, n_original_clusters: int, timesteps_per_cluster: int, @@ -1530,11 +1536,11 @@ def _add_combined_bound_constraints( middle, and end of each cluster. With 2D (cluster, time) structure, we simply select charge_state at a - given time offset, then reorder by cluster_order to get original_cluster order. + given time offset, then reorder by cluster_assignments to get original_cluster order. Args: soc_boundary: SOC_boundary variable. - cluster_order: Mapping from original periods to clusters. + cluster_assignments: Mapping from original periods to clusters. has_investment: Whether the storage has investment sizing. n_original_clusters: Number of original periods. timesteps_per_cluster: Timesteps in each cluster. @@ -1552,13 +1558,15 @@ def _add_combined_bound_constraints( rel_loss = _scalar_safe_reduce(self.element.relative_loss_per_hour, 'time', 'mean') mean_timestep_duration = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'mean') - sample_offsets = [0, timesteps_per_cluster // 2, timesteps_per_cluster - 1] + # Use actual time dimension size (may be smaller than timesteps_per_cluster for segmented systems) + actual_time_size = charge_state.sizes['time'] + sample_offsets = [0, actual_time_size // 2, actual_time_size - 1] for sample_name, offset in zip(['start', 'mid', 'end'], sample_offsets, strict=False): - # With 2D structure: select time offset, then reorder by cluster_order + # With 2D structure: select time offset, then reorder by cluster_assignments cs_at_offset = charge_state.isel(time=offset) # Shape: (cluster, ...) 
- # Reorder to original_cluster order using cluster_order indexer - cs_t = cs_at_offset.isel(cluster=cluster_order) + # Reorder to original_cluster order using cluster_assignments indexer + cs_t = cs_at_offset.isel(cluster=cluster_assignments) # Suppress xarray warning about index loss - we immediately assign new coords anyway with warnings.catch_warnings(): warnings.filterwarnings('ignore', message='.*does not create an index anymore.*') diff --git a/flixopt/core.py b/flixopt/core.py index 46eef9493..ba8618e1a 100644 --- a/flixopt/core.py +++ b/flixopt/core.py @@ -614,28 +614,39 @@ def get_dataarray_stats(arr: xr.DataArray) -> dict: return stats -def drop_constant_arrays(ds: xr.Dataset, dim: str = 'time', drop_arrays_without_dim: bool = True) -> xr.Dataset: +def drop_constant_arrays( + ds: xr.Dataset, dim: str = 'time', drop_arrays_without_dim: bool = True, atol: float = 1e-10 +) -> xr.Dataset: """Drop variables with constant values along a dimension. Args: ds: Input dataset to filter. dim: Dimension along which to check for constant values. drop_arrays_without_dim: If True, also drop variables that don't have the specified dimension. + atol: Absolute tolerance for considering values as constant (based on max - min). Returns: Dataset with constant variables removed. """ drop_vars = [] + # Use ds.variables for faster access (avoids _construct_dataarray overhead) + variables = ds.variables - for name, da in ds.data_vars.items(): + for name in ds.data_vars: + var = variables[name] # Skip variables without the dimension - if dim not in da.dims: + if dim not in var.dims: if drop_arrays_without_dim: drop_vars.append(name) continue - # Check if variable is constant along the dimension - if (da.max(dim, skipna=True) == da.min(dim, skipna=True)).all().item(): + # Check if variable is constant along the dimension using numpy (ptp < atol) + axis = var.dims.index(dim) + data = var.values + # Use numpy operations directly for speed + with np.errstate(invalid='ignore'): # Ignore NaN warnings + ptp = np.nanmax(data, axis=axis) - np.nanmin(data, axis=axis) + if np.all(ptp < atol): drop_vars.append(name) if drop_vars: diff --git a/flixopt/effects.py b/flixopt/effects.py index 3a2322988..b32a4edd8 100644 --- a/flixopt/effects.py +++ b/flixopt/effects.py @@ -17,7 +17,15 @@ from .core import PlausibilityError from .features import ShareAllocationModel -from .structure import Element, ElementContainer, ElementModel, FlowSystemModel, Submodel, register_class_for_io +from .structure import ( + Element, + ElementContainer, + ElementModel, + FlowSystemModel, + Submodel, + VariableCategory, + register_class_for_io, +) if TYPE_CHECKING: from collections.abc import Iterator @@ -377,6 +385,7 @@ def _do_modeling(self): upper=self.element.maximum_total if self.element.maximum_total is not None else np.inf, coords=self._model.get_coords(['period', 'scenario']), name=self.label_full, + category=VariableCategory.TOTAL, ) self.add_constraints( @@ -394,6 +403,7 @@ def _do_modeling(self): upper=self.element.maximum_over_periods if self.element.maximum_over_periods is not None else np.inf, coords=self._model.get_coords(['scenario']), short_name='total_over_periods', + category=VariableCategory.TOTAL_OVER_PERIODS, ) self.add_constraints(self.total_over_periods == weighted_total, short_name='total_over_periods') diff --git a/flixopt/elements.py b/flixopt/elements.py index 0cee53738..791596b28 100644 --- a/flixopt/elements.py +++ b/flixopt/elements.py @@ -4,6 +4,7 @@ from __future__ import annotations +import functools import 
logging from typing import TYPE_CHECKING @@ -20,6 +21,7 @@ Element, ElementModel, FlowSystemModel, + VariableCategory, register_class_for_io, ) @@ -672,6 +674,7 @@ def _do_modeling(self): upper=self.absolute_flow_rate_bounds[1], coords=self._model.get_coords(), short_name='flow_rate', + category=VariableCategory.FLOW_RATE, ) self._constraint_flow_rate() @@ -687,6 +690,7 @@ def _do_modeling(self): ), coords=['period', 'scenario'], short_name='total_flow_hours', + category=VariableCategory.TOTAL, ) # Weighted sum over all periods constraint @@ -717,6 +721,7 @@ def _do_modeling(self): ), coords=['scenario'], short_name='flow_hours_over_periods', + category=VariableCategory.TOTAL_OVER_PERIODS, ) # Load factor constraints @@ -726,7 +731,12 @@ def _do_modeling(self): self._create_shares() def _create_status_model(self): - status = self.add_variables(binary=True, short_name='status', coords=self._model.get_coords()) + status = self.add_variables( + binary=True, + short_name='status', + coords=self._model.get_coords(), + category=VariableCategory.STATUS, + ) self.add_submodels( StatusModel( model=self._model, @@ -746,6 +756,7 @@ def _create_investment_model(self): label_of_element=self.label_of_element, parameters=self.element.size, label_of_model=self.label_of_element, + size_category=VariableCategory.FLOW_SIZE, ), 'investment', ) @@ -856,11 +867,13 @@ def _create_bounds_for_load_factor(self): short_name='load_factor_min', ) - @property + @functools.cached_property def relative_flow_rate_bounds(self) -> tuple[xr.DataArray, xr.DataArray]: if self.element.fixed_relative_profile is not None: return self.element.fixed_relative_profile, self.element.fixed_relative_profile - return self.element.relative_minimum, self.element.relative_maximum + # Ensure both bounds have matching dimensions (broadcast once here, + # so downstream code doesn't need to handle dimension mismatches) + return xr.broadcast(self.element.relative_minimum, self.element.relative_maximum) @property def absolute_flow_rate_bounds(self) -> tuple[xr.DataArray, xr.DataArray]: @@ -957,11 +970,17 @@ def _do_modeling(self): imbalance_penalty = self.element.imbalance_penalty_per_flow_hour * self._model.timestep_duration self.virtual_supply = self.add_variables( - lower=0, coords=self._model.get_coords(), short_name='virtual_supply' + lower=0, + coords=self._model.get_coords(), + short_name='virtual_supply', + category=VariableCategory.VIRTUAL_FLOW, ) self.virtual_demand = self.add_variables( - lower=0, coords=self._model.get_coords(), short_name='virtual_demand' + lower=0, + coords=self._model.get_coords(), + short_name='virtual_demand', + category=VariableCategory.VIRTUAL_FLOW, ) # Σ(inflows) + virtual_supply = Σ(outflows) + virtual_demand @@ -1028,7 +1047,12 @@ def _do_modeling(self): # Create component status variable and StatusModel if needed if self.element.status_parameters: - status = self.add_variables(binary=True, short_name='status', coords=self._model.get_coords()) + status = self.add_variables( + binary=True, + short_name='status', + coords=self._model.get_coords(), + category=VariableCategory.STATUS, + ) if len(all_flows) == 1: self.add_constraints(status == all_flows[0].submodel.status.status, short_name='status') else: diff --git a/flixopt/features.py b/flixopt/features.py index bb9864d64..e85636435 100644 --- a/flixopt/features.py +++ b/flixopt/features.py @@ -11,7 +11,7 @@ import numpy as np from .modeling import BoundingPatterns, ModelingPrimitives, ModelingUtilities -from .structure import FlowSystemModel, Submodel +from 
.structure import FlowSystemModel, Submodel, VariableCategory if TYPE_CHECKING: from collections.abc import Collection @@ -37,6 +37,7 @@ class InvestmentModel(Submodel): label_of_element: The label of the parent (Element). Used to construct the full label of the model. parameters: The parameters of the feature model. label_of_model: The label of the model. This is needed to construct the full label of the model. + size_category: Category for the size variable (FLOW_SIZE, STORAGE_SIZE, or SIZE for generic). """ parameters: InvestParameters @@ -47,9 +48,11 @@ def __init__( label_of_element: str, parameters: InvestParameters, label_of_model: str | None = None, + size_category: VariableCategory = VariableCategory.SIZE, ): self.piecewise_effects: PiecewiseEffectsModel | None = None self.parameters = parameters + self._size_category = size_category super().__init__(model, label_of_element=label_of_element, label_of_model=label_of_model) def _do_modeling(self): @@ -69,6 +72,7 @@ def _create_variables_and_constraints(self): lower=size_min if self.parameters.mandatory else 0, upper=size_max, coords=self._model.get_coords(['period', 'scenario']), + category=self._size_category, ) if not self.parameters.mandatory: @@ -76,6 +80,7 @@ def _create_variables_and_constraints(self): binary=True, coords=self._model.get_coords(['period', 'scenario']), short_name='invested', + category=VariableCategory.INVESTED, ) BoundingPatterns.bounds_with_state( self, @@ -193,7 +198,12 @@ def _do_modeling(self): # Create a separate binary 'inactive' variable when needed for downtime tracking or explicit use # When not needed, the expression (1 - self.status) can be used instead if self.parameters.use_downtime_tracking: - inactive = self.add_variables(binary=True, short_name='inactive', coords=self._model.get_coords()) + inactive = self.add_variables( + binary=True, + short_name='inactive', + coords=self._model.get_coords(), + category=VariableCategory.INACTIVE, + ) self.add_constraints(self.status + inactive == 1, short_name='complementary') # 3. Total duration tracking @@ -207,12 +217,23 @@ def _do_modeling(self): ), short_name='active_hours', coords=['period', 'scenario'], + category=VariableCategory.TOTAL, ) # 4. 
Switch tracking using existing pattern if self.parameters.use_startup_tracking: - self.add_variables(binary=True, short_name='startup', coords=self.get_coords()) - self.add_variables(binary=True, short_name='shutdown', coords=self.get_coords()) + self.add_variables( + binary=True, + short_name='startup', + coords=self.get_coords(), + category=VariableCategory.STARTUP, + ) + self.add_variables( + binary=True, + short_name='shutdown', + coords=self.get_coords(), + category=VariableCategory.SHUTDOWN, + ) # Determine previous_state: None means relaxed (no constraint at t=0) previous_state = self._previous_status.isel(time=-1) if self._previous_status is not None else None @@ -233,6 +254,7 @@ def _do_modeling(self): upper=self.parameters.startup_limit, coords=self._model.get_coords(('period', 'scenario')), short_name='startup_count', + category=VariableCategory.STARTUP_COUNT, ) # Sum over all temporal dimensions (time, and cluster if present) startup_temporal_dims = [d for d in self.startup.dims if d not in ('period', 'scenario')] @@ -387,12 +409,14 @@ def _do_modeling(self): binary=True, short_name='inside_piece', coords=self._model.get_coords(dims=self.dims), + category=VariableCategory.INSIDE_PIECE, ) self.lambda0 = self.add_variables( lower=0, upper=1, short_name='lambda0', coords=self._model.get_coords(dims=self.dims), + category=VariableCategory.LAMBDA0, ) self.lambda1 = self.add_variables( @@ -400,6 +424,7 @@ def _do_modeling(self): upper=1, short_name='lambda1', coords=self._model.get_coords(dims=self.dims), + category=VariableCategory.LAMBDA1, ) # Create constraints @@ -495,6 +520,7 @@ def _do_modeling(self): coords=self._model.get_coords(self.dims), binary=True, short_name='zero_point', + category=VariableCategory.ZERO_POINT, ) rhs = self.zero_point else: @@ -619,6 +645,7 @@ def _do_modeling(self): coords=self._model.get_coords([dim for dim in self._dims if dim != 'time']), name=self.label_full, short_name='total', + category=VariableCategory.TOTAL, ) # eq: sum = sum(share_i) # skalar self._eq_total = self.add_constraints(self.total == 0, name=self.label_full) @@ -629,6 +656,7 @@ def _do_modeling(self): upper=np.inf if (self._max_per_hour is None) else self._max_per_hour * self._model.timestep_duration, coords=self._model.get_coords(self._dims), short_name='per_timestep', + category=VariableCategory.PER_TIMESTEP, ) self._eq_total_per_timestep = self.add_constraints(self.total_per_timestep == 0, short_name='per_timestep') @@ -668,10 +696,13 @@ def add_share( if name in self.shares: self.share_constraints[name].lhs -= expression else: + # Temporal shares (with 'time' dim) are segment totals that need division + category = VariableCategory.SHARE if 'time' in dims else None self.shares[name] = self.add_variables( coords=self._model.get_coords(dims), name=f'{name}->{self.label_full}', short_name=name, + category=category, ) self.share_constraints[name] = self.add_constraints( diff --git a/flixopt/flow_system.py b/flixopt/flow_system.py index 0f6c27e40..a68333e98 100644 --- a/flixopt/flow_system.py +++ b/flixopt/flow_system.py @@ -15,7 +15,6 @@ import pandas as pd import xarray as xr -from . import __version__ from . 
import io as fx_io from .components import Storage from .config import CONFIG, DEPRECATION_REMOVAL_VERSION @@ -29,7 +28,14 @@ from .elements import Bus, Component, Flow from .optimize_accessor import OptimizeAccessor from .statistics_accessor import StatisticsAccessor -from .structure import CompositeContainerMixin, Element, ElementContainer, FlowSystemModel, Interface +from .structure import ( + CompositeContainerMixin, + Element, + ElementContainer, + FlowSystemModel, + Interface, + VariableCategory, +) from .topology_accessor import TopologyAccessor from .transform_accessor import TransformAccessor @@ -173,7 +179,7 @@ class FlowSystem(Interface, CompositeContainerMixin[Element]): def __init__( self, - timesteps: pd.DatetimeIndex, + timesteps: pd.DatetimeIndex | pd.RangeIndex, periods: pd.Index | None = None, scenarios: pd.Index | None = None, clusters: pd.Index | None = None, @@ -185,6 +191,7 @@ def __init__( scenario_independent_sizes: bool | list[str] = True, scenario_independent_flow_rates: bool | list[str] = False, name: str | None = None, + timestep_duration: xr.DataArray | None = None, ): self.timesteps = self._validate_timesteps(timesteps) @@ -193,14 +200,27 @@ def __init__( self.timesteps_extra, self.hours_of_last_timestep, self.hours_of_previous_timesteps, - timestep_duration, + computed_timestep_duration, ) = self._compute_time_metadata(self.timesteps, hours_of_last_timestep, hours_of_previous_timesteps) self.periods = None if periods is None else self._validate_periods(periods) self.scenarios = None if scenarios is None else self._validate_scenarios(scenarios) self.clusters = clusters # Cluster dimension for clustered FlowSystems - self.timestep_duration = self.fit_to_model_coords('timestep_duration', timestep_duration) + # Use provided timestep_duration if given (for segmented systems), otherwise use computed value + # For RangeIndex (segmented systems), computed_timestep_duration is None + if timestep_duration is not None: + self.timestep_duration = timestep_duration + elif computed_timestep_duration is not None: + self.timestep_duration = self.fit_to_model_coords('timestep_duration', computed_timestep_duration) + else: + # RangeIndex (segmented systems) requires explicit timestep_duration + if isinstance(self.timesteps, pd.RangeIndex): + raise ValueError( + 'timestep_duration is required when using RangeIndex timesteps (segmented systems). ' + 'Provide timestep_duration explicitly or use DatetimeIndex timesteps.' + ) + self.timestep_duration = None # Cluster weight for cluster() optimization (default 1.0) # Represents how many original timesteps each cluster represents @@ -241,6 +261,10 @@ def __init__( # Solution dataset - populated after optimization or loaded from file self._solution: xr.Dataset | None = None + # Variable categories for segment expansion handling + # Populated when model is built, used by transform.expand() + self._variable_categories: dict[str, VariableCategory] = {} + # Aggregation info - populated by transform.cluster() self.clustering: Clustering | None = None @@ -264,14 +288,19 @@ def __init__( self.name = name @staticmethod - def _validate_timesteps(timesteps: pd.DatetimeIndex) -> pd.DatetimeIndex: - """Validate timesteps format and rename if needed.""" - if not isinstance(timesteps, pd.DatetimeIndex): - raise TypeError('timesteps must be a pandas DatetimeIndex') + def _validate_timesteps( + timesteps: pd.DatetimeIndex | pd.RangeIndex, + ) -> pd.DatetimeIndex | pd.RangeIndex: + """Validate timesteps format and rename if needed. 
+ + Accepts either DatetimeIndex (standard) or RangeIndex (for segmented systems). + """ + if not isinstance(timesteps, (pd.DatetimeIndex, pd.RangeIndex)): + raise TypeError('timesteps must be a pandas DatetimeIndex or RangeIndex') if len(timesteps) < 2: raise ValueError('timesteps must contain at least 2 timestamps') if timesteps.name != 'time': - timesteps.name = 'time' + timesteps = timesteps.rename('time') if not timesteps.is_monotonic_increasing: raise ValueError('timesteps must be sorted') return timesteps @@ -317,9 +346,17 @@ def _validate_periods(periods: pd.Index) -> pd.Index: @staticmethod def _create_timesteps_with_extra( - timesteps: pd.DatetimeIndex, hours_of_last_timestep: float | None - ) -> pd.DatetimeIndex: - """Create timesteps with an extra step at the end.""" + timesteps: pd.DatetimeIndex | pd.RangeIndex, hours_of_last_timestep: float | None + ) -> pd.DatetimeIndex | pd.RangeIndex: + """Create timesteps with an extra step at the end. + + For DatetimeIndex, adds an extra timestep using hours_of_last_timestep. + For RangeIndex (segmented systems), simply appends the next integer. + """ + if isinstance(timesteps, pd.RangeIndex): + # For RangeIndex, just add one more integer + return pd.RangeIndex(len(timesteps) + 1, name='time') + if hours_of_last_timestep is None: hours_of_last_timestep = (timesteps[-1] - timesteps[-2]) / pd.Timedelta(hours=1) @@ -327,8 +364,18 @@ def _create_timesteps_with_extra( return pd.DatetimeIndex(timesteps.append(last_date), name='time') @staticmethod - def calculate_timestep_duration(timesteps_extra: pd.DatetimeIndex) -> xr.DataArray: - """Calculate duration of each timestep in hours as a 1D DataArray.""" + def calculate_timestep_duration( + timesteps_extra: pd.DatetimeIndex | pd.RangeIndex, + ) -> xr.DataArray | None: + """Calculate duration of each timestep in hours as a 1D DataArray. + + For RangeIndex (segmented systems), returns None since duration cannot be + computed from the index. Use timestep_duration parameter instead. + """ + if isinstance(timesteps_extra, pd.RangeIndex): + # Cannot compute duration from RangeIndex - must be provided externally + return None + hours_per_step = np.diff(timesteps_extra) / pd.Timedelta(hours=1) return xr.DataArray( hours_per_step, coords={'time': timesteps_extra[:-1]}, dims='time', name='timestep_duration' @@ -336,11 +383,17 @@ def calculate_timestep_duration(timesteps_extra: pd.DatetimeIndex) -> xr.DataArr @staticmethod def _calculate_hours_of_previous_timesteps( - timesteps: pd.DatetimeIndex, hours_of_previous_timesteps: float | np.ndarray | None - ) -> float | np.ndarray: - """Calculate duration of regular timesteps.""" + timesteps: pd.DatetimeIndex | pd.RangeIndex, hours_of_previous_timesteps: float | np.ndarray | None + ) -> float | np.ndarray | None: + """Calculate duration of regular timesteps. + + For RangeIndex (segmented systems), returns None if not provided. 
+ """ if hours_of_previous_timesteps is not None: return hours_of_previous_timesteps + if isinstance(timesteps, pd.RangeIndex): + # Cannot compute from RangeIndex + return None # Calculate from the first interval first_interval = timesteps[1] - timesteps[0] return first_interval.total_seconds() / 3600 # Convert to hours @@ -385,33 +438,42 @@ def calculate_weight_per_period(periods_extra: pd.Index) -> xr.DataArray: @classmethod def _compute_time_metadata( cls, - timesteps: pd.DatetimeIndex, + timesteps: pd.DatetimeIndex | pd.RangeIndex, hours_of_last_timestep: int | float | None = None, hours_of_previous_timesteps: int | float | np.ndarray | None = None, - ) -> tuple[pd.DatetimeIndex, float, float | np.ndarray, xr.DataArray]: + ) -> tuple[ + pd.DatetimeIndex | pd.RangeIndex, + float | None, + float | np.ndarray | None, + xr.DataArray | None, + ]: """ Compute all time-related metadata from timesteps. This is the single source of truth for time metadata computation, used by both __init__ and dataset operations (sel/isel/resample) to ensure consistency. + For RangeIndex (segmented systems), timestep_duration cannot be calculated from + the index and must be provided externally after FlowSystem creation. + Args: - timesteps: The time index to compute metadata from + timesteps: The time index to compute metadata from (DatetimeIndex or RangeIndex) hours_of_last_timestep: Duration of the last timestep. If None, computed from the time index. hours_of_previous_timesteps: Duration of previous timesteps. If None, computed from the time index. Can be a scalar or array. Returns: Tuple of (timesteps_extra, hours_of_last_timestep, hours_of_previous_timesteps, timestep_duration) + For RangeIndex, hours_of_last_timestep and timestep_duration may be None. """ # Create timesteps with extra step at the end timesteps_extra = cls._create_timesteps_with_extra(timesteps, hours_of_last_timestep) - # Calculate timestep duration + # Calculate timestep duration (returns None for RangeIndex) timestep_duration = cls.calculate_timestep_duration(timesteps_extra) # Extract hours_of_last_timestep if not provided - if hours_of_last_timestep is None: + if hours_of_last_timestep is None and timestep_duration is not None: hours_of_last_timestep = timestep_duration.isel(time=-1).item() # Compute hours_of_previous_timesteps (handles both None and provided cases) @@ -627,7 +689,7 @@ def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]: return reference_structure, all_extracted_arrays - def to_dataset(self, include_solution: bool = True) -> xr.Dataset: + def to_dataset(self, include_solution: bool = True, include_original_data: bool = True) -> xr.Dataset: """ Convert the FlowSystem to an xarray Dataset. Ensures FlowSystem is connected before serialization. @@ -645,70 +707,32 @@ def to_dataset(self, include_solution: bool = True) -> xr.Dataset: include_solution: Whether to include the optimization solution in the dataset. Defaults to True. Set to False to get only the FlowSystem structure without solution data (useful for copying or saving templates). + include_original_data: Whether to include clustering.original_data in the dataset. + Defaults to True. Set to False for smaller files (~38% reduction) when + clustering.plot.compare() isn't needed after loading. The core workflow + (optimize → expand) works without original_data. 
Returns: xr.Dataset: Dataset containing all DataArrays with structure in attributes + + See Also: + from_dataset: Create FlowSystem from dataset + to_netcdf: Save to NetCDF file """ if not self.connected_and_transformed: logger.info('FlowSystem is not connected_and_transformed. Connecting and transforming data now.') self.connect_and_transform() - ds = super().to_dataset() + # Get base dataset from parent class + base_ds = super().to_dataset() - # Include solution data if present and requested - if include_solution and self.solution is not None: - # Rename 'time' to 'solution_time' in solution variables to preserve full solution - # (linopy solution may have extra timesteps, e.g., for final charge states) - solution_renamed = ( - self.solution.rename({'time': 'solution_time'}) if 'time' in self.solution.dims else self.solution - ) - # Add solution variables with 'solution|' prefix to avoid conflicts - solution_vars = {f'solution|{name}': var for name, var in solution_renamed.data_vars.items()} - ds = ds.assign(solution_vars) - # Also add the solution_time coordinate if it exists - if 'solution_time' in solution_renamed.coords: - ds = ds.assign_coords(solution_time=solution_renamed.coords['solution_time']) - ds.attrs['has_solution'] = True - else: - ds.attrs['has_solution'] = False - - # Include carriers if any are registered - if self._carriers: - carriers_structure = {} - for name, carrier in self._carriers.items(): - carrier_ref, _ = carrier._create_reference_structure() - carriers_structure[name] = carrier_ref - ds.attrs['carriers'] = json.dumps(carriers_structure) - - # Serialize Clustering object for full reconstruction in from_dataset() - if self.clustering is not None: - clustering_ref, clustering_arrays = self.clustering._create_reference_structure() - # Add clustering arrays with prefix - for name, arr in clustering_arrays.items(): - ds[f'clustering|{name}'] = arr - ds.attrs['clustering'] = json.dumps(clustering_ref) - - # Add version info - ds.attrs['flixopt_version'] = __version__ - - # Ensure model coordinates are always present in the Dataset - # (even if no data variable uses them, they define the model structure) - model_coords = {'time': self.timesteps} - if self.periods is not None: - model_coords['period'] = self.periods - if self.scenarios is not None: - model_coords['scenario'] = self.scenarios - if self.clusters is not None: - model_coords['cluster'] = self.clusters - ds = ds.assign_coords(model_coords) - - return ds + # Add FlowSystem-specific data (solution, clustering, metadata) + return fx_io.flow_system_to_dataset(self, base_ds, include_solution, include_original_data) @classmethod def from_dataset(cls, ds: xr.Dataset) -> FlowSystem: """ Create a FlowSystem from an xarray Dataset. - Handles FlowSystem-specific reconstruction logic. If the dataset contains solution data (variables prefixed with 'solution|'), the solution will be restored to the FlowSystem. 
Solution time coordinates @@ -723,124 +747,20 @@ def from_dataset(cls, ds: xr.Dataset) -> FlowSystem: Returns: FlowSystem instance - """ - # Get the reference structure from attrs - reference_structure = dict(ds.attrs) - - # Separate solution variables from config variables - solution_prefix = 'solution|' - solution_vars = {} - config_vars = {} - for name, array in ds.data_vars.items(): - if name.startswith(solution_prefix): - # Remove prefix for solution dataset - original_name = name[len(solution_prefix) :] - solution_vars[original_name] = array - else: - config_vars[name] = array - - # Create arrays dictionary from config variables only - arrays_dict = config_vars - - # Extract cluster index if present (clustered FlowSystem) - clusters = ds.indexes.get('cluster') - - # For clustered datasets, cluster_weight is (cluster,) shaped - set separately - if clusters is not None: - cluster_weight_for_constructor = None - else: - cluster_weight_for_constructor = ( - cls._resolve_dataarray_reference(reference_structure['cluster_weight'], arrays_dict) - if 'cluster_weight' in reference_structure - else None - ) - - # Resolve scenario_weights only if scenario dimension exists - scenario_weights = None - if ds.indexes.get('scenario') is not None and 'scenario_weights' in reference_structure: - scenario_weights = cls._resolve_dataarray_reference(reference_structure['scenario_weights'], arrays_dict) - - # Create FlowSystem instance with constructor parameters - flow_system = cls( - timesteps=ds.indexes['time'], - periods=ds.indexes.get('period'), - scenarios=ds.indexes.get('scenario'), - clusters=clusters, - hours_of_last_timestep=reference_structure.get('hours_of_last_timestep'), - hours_of_previous_timesteps=reference_structure.get('hours_of_previous_timesteps'), - weight_of_last_period=reference_structure.get('weight_of_last_period'), - scenario_weights=scenario_weights, - cluster_weight=cluster_weight_for_constructor, - scenario_independent_sizes=reference_structure.get('scenario_independent_sizes', True), - scenario_independent_flow_rates=reference_structure.get('scenario_independent_flow_rates', False), - name=reference_structure.get('name'), - ) - - # Restore components - components_structure = reference_structure.get('components', {}) - for comp_label, comp_data in components_structure.items(): - component = cls._resolve_reference_structure(comp_data, arrays_dict) - if not isinstance(component, Component): - logger.critical(f'Restoring component {comp_label} failed.') - flow_system._add_components(component) - - # Restore buses - buses_structure = reference_structure.get('buses', {}) - for bus_label, bus_data in buses_structure.items(): - bus = cls._resolve_reference_structure(bus_data, arrays_dict) - if not isinstance(bus, Bus): - logger.critical(f'Restoring bus {bus_label} failed.') - flow_system._add_buses(bus) - - # Restore effects - effects_structure = reference_structure.get('effects', {}) - for effect_label, effect_data in effects_structure.items(): - effect = cls._resolve_reference_structure(effect_data, arrays_dict) - if not isinstance(effect, Effect): - logger.critical(f'Restoring effect {effect_label} failed.') - flow_system._add_effects(effect) - - # Restore solution if present - if reference_structure.get('has_solution', False) and solution_vars: - solution_ds = xr.Dataset(solution_vars) - # Rename 'solution_time' back to 'time' if present - if 'solution_time' in solution_ds.dims: - solution_ds = solution_ds.rename({'solution_time': 'time'}) - flow_system.solution = solution_ds - - 
# Restore carriers if present - if 'carriers' in reference_structure: - carriers_structure = json.loads(reference_structure['carriers']) - for carrier_data in carriers_structure.values(): - carrier = cls._resolve_reference_structure(carrier_data, {}) - flow_system._carriers.add(carrier) - - # Restore Clustering object if present - if 'clustering' in reference_structure: - clustering_structure = json.loads(reference_structure['clustering']) - # Collect clustering arrays (prefixed with 'clustering|') - clustering_arrays = {} - for name, arr in ds.data_vars.items(): - if name.startswith('clustering|'): - # Remove 'clustering|' prefix (11 chars) from both key and DataArray name - # This ensures that if the FlowSystem is serialized again, the arrays - # won't get double-prefixed (clustering|clustering|...) - arr_name = name[11:] - clustering_arrays[arr_name] = arr.rename(arr_name) - clustering = cls._resolve_reference_structure(clustering_structure, clustering_arrays) - flow_system.clustering = clustering - - # Restore cluster_weight from clustering's representative_weights - # This is needed because cluster_weight_for_constructor was set to None for clustered datasets - if hasattr(clustering, 'result') and hasattr(clustering.result, 'representative_weights'): - flow_system.cluster_weight = clustering.result.representative_weights - - # Reconnect network to populate bus inputs/outputs (not stored in NetCDF). - flow_system.connect_and_transform() - return flow_system + See Also: + to_dataset: Convert FlowSystem to dataset + from_netcdf: Load from NetCDF file + """ + return fx_io.restore_flow_system_from_dataset(ds) - def to_netcdf(self, path: str | pathlib.Path, compression: int = 5, overwrite: bool = False): + def to_netcdf( + self, + path: str | pathlib.Path, + compression: int = 5, + overwrite: bool = False, + include_original_data: bool = True, + ): """ Save the FlowSystem to a NetCDF file. Ensures FlowSystem is connected before saving. @@ -852,6 +772,9 @@ def to_netcdf(self, path: str | pathlib.Path, compression: int = 5, overwrite: b path: The path to the netCDF file. Parent directories are created if they don't exist. compression: The compression level to use when saving the file (0-9). overwrite: If True, overwrite existing file. If False, raise error if file exists. + include_original_data: Whether to include clustering.original_data in the file. + Defaults to True. Set to False for smaller files (~38% reduction) when + clustering.plot.compare() isn't needed after loading. Raises: FileExistsError: If overwrite=False and file already exists. @@ -861,11 +784,21 @@ def to_netcdf(self, path: str | pathlib.Path, compression: int = 5, overwrite: b self.connect_and_transform() path = pathlib.Path(path) + + if not overwrite and path.exists(): + raise FileExistsError(f'File already exists: {path}. 
Use overwrite=True to overwrite existing file.') + + path.parent.mkdir(parents=True, exist_ok=True) + # Set name from filename (without extension) self.name = path.stem - super().to_netcdf(path, compression, overwrite) - logger.info(f'Saved FlowSystem to {path}') + try: + ds = self.to_dataset(include_original_data=include_original_data) + fx_io.save_dataset_to_netcdf(ds, path, compression=compression) + logger.info(f'Saved FlowSystem to {path}') + except Exception as e: + raise OSError(f'Failed to save FlowSystem to NetCDF file {path}: {e}') from e @classmethod def from_netcdf(cls, path: str | pathlib.Path) -> FlowSystem: @@ -1523,6 +1456,9 @@ def solve(self, solver: _Solver) -> FlowSystem: # Store solution on FlowSystem for direct Element access self.solution = self.model.solution + # Copy variable categories for segment expansion handling + self._variable_categories = self.model.variable_categories.copy() + logger.info(f'Optimization solved successfully. Objective: {self.model.objective.value:.4f}') return self @@ -1553,6 +1489,69 @@ def solution(self, value: xr.Dataset | None) -> None: self._solution = value self._statistics = None # Invalidate cached statistics + @property + def variable_categories(self) -> dict[str, VariableCategory]: + """Variable categories for filtering and segment expansion. + + Returns: + Dict mapping variable names to their VariableCategory. + """ + return self._variable_categories + + def get_variables_by_category(self, *categories: VariableCategory, from_solution: bool = True) -> list[str]: + """Get variable names matching any of the specified categories. + + Args: + *categories: One or more VariableCategory values to filter by. + from_solution: If True, only return variables present in solution. + If False, return all registered variables matching categories. + + Returns: + List of variable names matching any of the specified categories. + + Example: + >>> fs.get_variables_by_category(VariableCategory.FLOW_RATE) + ['Boiler(Q_th)|flow_rate', 'CHP(Q_th)|flow_rate', ...] + >>> fs.get_variables_by_category(VariableCategory.SIZE, VariableCategory.INVESTED) + ['Boiler(Q_th)|size', 'Boiler(Q_th)|invested', ...] + """ + category_set = set(categories) + + if self._variable_categories: + # Use registered categories + matching = [name for name, cat in self._variable_categories.items() if cat in category_set] + elif self._solution is not None: + # Fallback for old files without categories: match by suffix pattern + # Category values match the variable suffix (e.g., FLOW_RATE.value = 'flow_rate') + matching = [] + for cat in category_set: + # Handle new sub-categories that map to old |size suffix + if cat == VariableCategory.FLOW_SIZE: + flow_labels = set(self.flows.keys()) + matching.extend( + v + for v in self._solution.data_vars + if v.endswith('|size') and v.rsplit('|', 1)[0] in flow_labels + ) + elif cat == VariableCategory.STORAGE_SIZE: + storage_labels = set(self.storages.keys()) + matching.extend( + v + for v in self._solution.data_vars + if v.endswith('|size') and v.rsplit('|', 1)[0] in storage_labels + ) + else: + # Standard suffix matching + suffix = f'|{cat.value}' + matching.extend(v for v in self._solution.data_vars if v.endswith(suffix)) + else: + matching = [] + + if from_solution and self._solution is not None: + solution_vars = set(self._solution.data_vars) + matching = [v for v in matching if v in solution_vars] + return matching + @property def is_locked(self) -> bool: """Check if the FlowSystem is locked (has a solution). 
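For files saved before categories were recorded, get_variables_by_category() above falls back to suffix matching. A minimal standalone sketch of that fallback logic — the variable labels and category string values below are hypothetical, assuming VariableCategory values mirror variable suffixes (e.g. FLOW_RATE.value == 'flow_rate') as in this patch:

# Hypothetical solution variables and element labels for illustration only.
solution_vars = ['Boiler(Q_th)|flow_rate', 'Boiler(Q_th)|size', 'Battery|size']
flow_labels = {'Boiler(Q_th)'}    # assumed flow labels
storage_labels = {'Battery'}      # assumed storage labels

def fallback_match(value: str) -> list[str]:
    """Match solution variables by suffix when no categories were stored."""
    if value == 'flow_size':  # FLOW_SIZE maps to the legacy '|size' suffix
        return [v for v in solution_vars
                if v.endswith('|size') and v.rsplit('|', 1)[0] in flow_labels]
    if value == 'storage_size':  # STORAGE_SIZE likewise
        return [v for v in solution_vars
                if v.endswith('|size') and v.rsplit('|', 1)[0] in storage_labels]
    return [v for v in solution_vars if v.endswith(f'|{value}')]

assert fallback_match('flow_rate') == ['Boiler(Q_th)|flow_rate']
assert fallback_match('flow_size') == ['Boiler(Q_th)|size']
assert fallback_match('storage_size') == ['Battery|size']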
@@ -1579,6 +1578,7 @@ def _invalidate_model(self) -> None: self._connected_and_transformed = False self._topology = None # Invalidate topology accessor (and its cached colors) self._flow_carriers = None # Invalidate flow-to-carrier mapping + self._variable_categories.clear() # Clear stale categories for segment expansion for element in self.values(): element.submodel = None element._variable_names = [] @@ -1932,10 +1932,19 @@ def __repr__(self) -> str: """Return a detailed string representation showing all containers.""" r = fx_io.format_title_with_underline('FlowSystem', '=') - # Timestep info - time_period = f'{self.timesteps[0].date()} to {self.timesteps[-1].date()}' - freq_str = str(self.timesteps.freq).replace('<', '').replace('>', '') if self.timesteps.freq else 'irregular' - r += f'Timesteps: {len(self.timesteps)} ({freq_str}) [{time_period}]\n' + # Timestep info - handle both DatetimeIndex and RangeIndex (segmented) + if self.is_segmented: + r += f'Timesteps: {len(self.timesteps)} segments (segmented)\n' + else: + time_period = f'{self.timesteps[0].date()} to {self.timesteps[-1].date()}' + freq_str = ( + str(self.timesteps.freq).replace('<', '').replace('>', '') if self.timesteps.freq else 'irregular' + ) + r += f'Timesteps: {len(self.timesteps)} ({freq_str}) [{time_period}]\n' + + # Add clusters if present + if self.clusters is not None: + r += f'Clusters: {len(self.clusters)}\n' # Add periods if present if self.periods is not None: @@ -2116,10 +2125,19 @@ def _cluster_timesteps_per_cluster(self) -> int | None: return len(self.timesteps) if self.clusters is not None else None @property - def _cluster_time_coords(self) -> pd.DatetimeIndex | None: + def _cluster_time_coords(self) -> pd.DatetimeIndex | pd.RangeIndex | None: """Get time coordinates for clustered system (same as timesteps).""" return self.timesteps if self.clusters is not None else None + @property + def is_segmented(self) -> bool: + """Check if this FlowSystem uses segmented time (RangeIndex instead of DatetimeIndex). + + Segmented systems have variable timestep durations stored in timestep_duration, + and use a RangeIndex for time coordinates instead of DatetimeIndex. 
+ """ + return isinstance(self.timesteps, pd.RangeIndex) + @property def n_timesteps(self) -> int: """Number of timesteps (within each cluster if clustered).""" diff --git a/flixopt/io.py b/flixopt/io.py index 7ab74c3e4..ad12e6893 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: import linopy + from .flow_system import FlowSystem from .types import Numeric_TPS logger = logging.getLogger('flixopt') @@ -560,14 +561,18 @@ def save_dataset_to_netcdf( ds.attrs = {'attrs': json.dumps(ds.attrs)} # Convert all DataArray attrs to JSON strings - for var_name, data_var in ds.data_vars.items(): - if data_var.attrs: # Only if there are attrs - ds[var_name].attrs = {'attrs': json.dumps(data_var.attrs)} + # Use ds.variables to avoid slow _construct_dataarray calls + variables = ds.variables + for var_name in ds.data_vars: + var = variables[var_name] + if var.attrs: # Only if there are attrs + var.attrs = {'attrs': json.dumps(var.attrs)} # Also handle coordinate attrs if they exist - for coord_name, coord_var in ds.coords.items(): - if hasattr(coord_var, 'attrs') and coord_var.attrs: - ds[coord_name].attrs = {'attrs': json.dumps(coord_var.attrs)} + for coord_name in ds.coords: + var = variables[coord_name] + if var.attrs: + var.attrs = {'attrs': json.dumps(var.attrs)} # Suppress numpy binary compatibility warnings from netCDF4 (numpy 1->2 transition) with warnings.catch_warnings(): @@ -601,25 +606,38 @@ def _reduce_constant_arrays(ds: xr.Dataset) -> xr.Dataset: Dataset with constant dimensions reduced. """ new_data_vars = {} + variables = ds.variables - for name, da in ds.data_vars.items(): - if not da.dims or da.size == 0: - new_data_vars[name] = da + for name in ds.data_vars: + var = variables[name] + dims = var.dims + data = var.values + + if not dims or data.size == 0: + new_data_vars[name] = var continue - # Try to reduce each dimension - reduced = da - for dim in list(da.dims): - if dim not in reduced.dims: + # Try to reduce each dimension using numpy operations + reduced_data = data + reduced_dims = list(dims) + + for _axis, dim in enumerate(dims): + if dim not in reduced_dims: continue # Already removed - # Check if constant along this dimension - first_slice = reduced.isel({dim: 0}) - is_constant = (reduced == first_slice).all() + + current_axis = reduced_dims.index(dim) + # Check if constant along this axis using numpy + first_slice = np.take(reduced_data, 0, axis=current_axis) + # Broadcast first_slice to compare + expanded = np.expand_dims(first_slice, axis=current_axis) + is_constant = np.allclose(reduced_data, expanded, equal_nan=True) + if is_constant: # Remove this dimension by taking first slice - reduced = first_slice + reduced_data = first_slice + reduced_dims.pop(current_axis) - new_data_vars[name] = reduced + new_data_vars[name] = xr.Variable(tuple(reduced_dims), reduced_data, attrs=var.attrs) return xr.Dataset(new_data_vars, coords=ds.coords, attrs=ds.attrs) @@ -644,24 +662,52 @@ def _stack_equal_vars(ds: xr.Dataset, stacked_dim: str = '__stacked__') -> xr.Da Stacked variables are named 'stacked_{dims}' and have a coordinate '{stacked_dim}_{dims}' containing the original variable names. 
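    Example (illustrative; two variables sharing the 'time' dimension):
        >>> ds = xr.Dataset({'a': ('time', [1, 2]), 'b': ('time', [3, 4])})
        >>> stacked = _stack_equal_vars(ds)
        >>> sorted(stacked.data_vars)
        ['stacked_time']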
""" + # Use ds.variables to avoid slow _construct_dataarray calls + variables = ds.variables + data_var_names = set(ds.data_vars) + + # Group variables by their dimensions groups = defaultdict(list) - for name, var in ds.data_vars.items(): + for name in data_var_names: + var = variables[name] groups[var.dims].append(name) new_data_vars = {} for dims, var_names in groups.items(): if len(var_names) == 1: - new_data_vars[var_names[0]] = ds[var_names[0]] + # Single variable - use Variable directly + new_data_vars[var_names[0]] = variables[var_names[0]] else: dim_suffix = '_'.join(dims) if dims else 'scalar' group_stacked_dim = f'{stacked_dim}_{dim_suffix}' - stacked = xr.concat([ds[name] for name in var_names], dim=group_stacked_dim) - stacked = stacked.assign_coords({group_stacked_dim: var_names}) + # Stack using numpy directly - much faster than xr.concat + # All variables in this group have the same dims/shape + arrays = [variables[name].values for name in var_names] + stacked_data = np.stack(arrays, axis=0) + + # Capture per-variable attrs before stacking + per_variable_attrs = {name: dict(variables[name].attrs) for name in var_names} - new_data_vars[f'stacked_{dim_suffix}'] = stacked + # Create new Variable with stacked dimension first + stacked_var = xr.Variable( + dims=(group_stacked_dim,) + dims, + data=stacked_data, + attrs={'__per_variable_attrs__': per_variable_attrs}, + ) + new_data_vars[f'stacked_{dim_suffix}'] = stacked_var + + # Build result dataset preserving coordinates + result = xr.Dataset(new_data_vars, coords=ds.coords, attrs=ds.attrs) + + # Add the stacking coordinates (variable names) + for dims, var_names in groups.items(): + if len(var_names) > 1: + dim_suffix = '_'.join(dims) if dims else 'scalar' + group_stacked_dim = f'{stacked_dim}_{dim_suffix}' + result = result.assign_coords({group_stacked_dim: var_names}) - return xr.Dataset(new_data_vars, attrs=ds.attrs) + return result def _unstack_vars(ds: xr.Dataset, stacked_prefix: str = '__stacked__') -> xr.Dataset: @@ -676,16 +722,38 @@ def _unstack_vars(ds: xr.Dataset, stacked_prefix: str = '__stacked__') -> xr.Dat Dataset with individual variables restored from stacked arrays. 
""" new_data_vars = {} - for name, var in ds.data_vars.items(): - stacked_dims = [d for d in var.dims if d.startswith(stacked_prefix)] - if stacked_dims: - stacked_dim = stacked_dims[0] - for label in var[stacked_dim].values: - new_data_vars[str(label)] = var.sel({stacked_dim: label}, drop=True) + variables = ds.variables + + for name in ds.data_vars: + var = variables[name] + # Find stacked dimension (if any) + stacked_dim = None + stacked_dim_idx = None + for i, d in enumerate(var.dims): + if d.startswith(stacked_prefix): + stacked_dim = d + stacked_dim_idx = i + break + + if stacked_dim is not None: + # Get labels from the stacked coordinate + labels = ds.coords[stacked_dim].values + # Get remaining dims (everything except stacked dim) + remaining_dims = var.dims[:stacked_dim_idx] + var.dims[stacked_dim_idx + 1 :] + # Get per-variable attrs if available + per_variable_attrs = var.attrs.get('__per_variable_attrs__', {}) + # Extract each slice using numpy indexing (much faster than .sel()) + data = var.values + for idx, label in enumerate(labels): + # Use numpy indexing to get the slice + sliced_data = np.take(data, idx, axis=stacked_dim_idx) + # Restore original attrs if available + restored_attrs = per_variable_attrs.get(str(label), {}) + new_data_vars[str(label)] = xr.Variable(remaining_dims, sliced_data, attrs=restored_attrs) else: new_data_vars[name] = var - return xr.Dataset(new_data_vars, attrs=ds.attrs) + return xr.Dataset(new_data_vars, coords=ds.coords, attrs=ds.attrs) def load_dataset_from_netcdf(path: str | pathlib.Path) -> xr.Dataset: @@ -711,14 +779,18 @@ def load_dataset_from_netcdf(path: str | pathlib.Path) -> xr.Dataset: ds.attrs = json.loads(ds.attrs['attrs']) # Restore DataArray attrs (before unstacking, as stacked vars have no individual attrs) - for var_name, data_var in ds.data_vars.items(): - if 'attrs' in data_var.attrs: - ds[var_name].attrs = json.loads(data_var.attrs['attrs']) + # Use ds.variables to avoid slow _construct_dataarray calls + variables = ds.variables + for var_name in ds.data_vars: + var = variables[var_name] + if 'attrs' in var.attrs: + var.attrs = json.loads(var.attrs['attrs']) # Restore coordinate attrs - for coord_name, coord_var in ds.coords.items(): - if hasattr(coord_var, 'attrs') and 'attrs' in coord_var.attrs: - ds[coord_name].attrs = json.loads(coord_var.attrs['attrs']) + for coord_name in ds.coords: + var = variables[coord_name] + if 'attrs' in var.attrs: + var.attrs = json.loads(var.attrs['attrs']) # Unstack variables if they were stacked during saving # Detection: check if any dataset dimension starts with '__stacked__' @@ -1428,3 +1500,481 @@ def suppress_output(): os.close(fd) except OSError: pass # FD already closed or invalid + + +# ============================================================================ +# FlowSystem Dataset I/O +# ============================================================================ + + +class FlowSystemDatasetIO: + """Unified I/O handler for FlowSystem dataset serialization and deserialization. + + This class provides optimized methods for converting FlowSystem objects to/from + xarray Datasets. It uses shared constants for variable prefixes and implements + fast DataArray construction to avoid xarray's slow _construct_dataarray method. 
+ + Constants: + SOLUTION_PREFIX: Prefix for solution variables ('solution|') + CLUSTERING_PREFIX: Prefix for clustering variables ('clustering|') + + Example: + # Serialization (FlowSystem -> Dataset) + ds = FlowSystemDatasetIO.to_dataset(flow_system, base_ds) + + # Deserialization (Dataset -> FlowSystem) + fs = FlowSystemDatasetIO.from_dataset(ds) + """ + + # Shared prefixes for variable namespacing + SOLUTION_PREFIX = 'solution|' + CLUSTERING_PREFIX = 'clustering|' + + # --- Deserialization (Dataset -> FlowSystem) --- + + @classmethod + def from_dataset(cls, ds: xr.Dataset) -> FlowSystem: + """Create FlowSystem from dataset. + + This is the main entry point for dataset restoration. + Called by FlowSystem.from_dataset(). + + If the dataset contains solution data (variables prefixed with 'solution|'), + the solution will be restored to the FlowSystem. Solution time coordinates + are renamed back from 'solution_time' to 'time'. + + Supports clustered datasets with (cluster, time) dimensions. When detected, + creates a synthetic DatetimeIndex for compatibility and stores the clustered + data structure for later use. + + Args: + ds: Dataset containing the FlowSystem data + + Returns: + FlowSystem instance with all components, buses, effects, and solution restored + """ + from .flow_system import FlowSystem + + # Parse dataset structure + reference_structure = dict(ds.attrs) + solution_var_names, config_var_names = cls._separate_variables(ds) + coord_cache = {k: ds.coords[k] for k in ds.coords} + arrays_dict = {name: cls._fast_get_dataarray(ds, name, coord_cache) for name in config_var_names} + + # Create and populate FlowSystem + flow_system = cls._create_flow_system(ds, reference_structure, arrays_dict, FlowSystem) + cls._restore_elements(flow_system, reference_structure, arrays_dict, FlowSystem) + cls._restore_solution(flow_system, ds, reference_structure, solution_var_names) + cls._restore_clustering(flow_system, ds, reference_structure, config_var_names, arrays_dict, FlowSystem) + cls._restore_metadata(flow_system, reference_structure, FlowSystem) + flow_system.connect_and_transform() + return flow_system + + @classmethod + def _separate_variables(cls, ds: xr.Dataset) -> tuple[dict[str, str], list[str]]: + """Separate solution variables from config variables. + + Args: + ds: Source dataset + + Returns: + Tuple of (solution_var_names dict, config_var_names list) + """ + solution_var_names: dict[str, str] = {} # Maps original_name -> ds_name + config_var_names: list[str] = [] + + for name in ds.data_vars: + if name.startswith(cls.SOLUTION_PREFIX): + solution_var_names[name[len(cls.SOLUTION_PREFIX) :]] = name + else: + config_var_names.append(name) + + return solution_var_names, config_var_names + + @staticmethod + def _fast_get_dataarray(ds: xr.Dataset, name: str, coord_cache: dict[str, xr.DataArray]) -> xr.DataArray: + """Construct DataArray from Variable without slow coordinate inference. + + This bypasses the slow _construct_dataarray method (~1.5ms -> ~0.1ms per var). 
+ + Args: + ds: Source dataset + name: Variable name + coord_cache: Pre-cached coordinate DataArrays + + Returns: + Constructed DataArray + """ + variable = ds.variables[name] + var_dims = set(variable.dims) + # Include coordinates whose dims are a subset of the variable's dims + # This preserves both dimension coordinates and auxiliary coordinates + coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} + return xr.DataArray(variable, coords=coords, name=name) + + @staticmethod + def _create_flow_system( + ds: xr.Dataset, + reference_structure: dict[str, Any], + arrays_dict: dict[str, xr.DataArray], + cls: type[FlowSystem], + ) -> FlowSystem: + """Create FlowSystem instance with constructor parameters.""" + # Extract cluster index if present (clustered FlowSystem) + clusters = ds.indexes.get('cluster') + + # For clustered datasets, cluster_weight is (cluster,) shaped - set separately + if clusters is not None: + cluster_weight_for_constructor = None + else: + cluster_weight_for_constructor = ( + cls._resolve_dataarray_reference(reference_structure['cluster_weight'], arrays_dict) + if 'cluster_weight' in reference_structure + else None + ) + + # Resolve scenario_weights only if scenario dimension exists + scenario_weights = None + if ds.indexes.get('scenario') is not None and 'scenario_weights' in reference_structure: + scenario_weights = cls._resolve_dataarray_reference(reference_structure['scenario_weights'], arrays_dict) + + # Resolve timestep_duration if present as DataArray reference + timestep_duration = None + if 'timestep_duration' in reference_structure: + ref_value = reference_structure['timestep_duration'] + if isinstance(ref_value, str) and ref_value.startswith(':::'): + timestep_duration = cls._resolve_dataarray_reference(ref_value, arrays_dict) + + # Get timesteps - convert integer index to RangeIndex for segmented systems + time_index = ds.indexes['time'] + if not isinstance(time_index, pd.DatetimeIndex): + time_index = pd.RangeIndex(len(time_index), name='time') + + return cls( + timesteps=time_index, + periods=ds.indexes.get('period'), + scenarios=ds.indexes.get('scenario'), + clusters=clusters, + hours_of_last_timestep=reference_structure.get('hours_of_last_timestep'), + hours_of_previous_timesteps=reference_structure.get('hours_of_previous_timesteps'), + weight_of_last_period=reference_structure.get('weight_of_last_period'), + scenario_weights=scenario_weights, + cluster_weight=cluster_weight_for_constructor, + scenario_independent_sizes=reference_structure.get('scenario_independent_sizes', True), + scenario_independent_flow_rates=reference_structure.get('scenario_independent_flow_rates', False), + name=reference_structure.get('name'), + timestep_duration=timestep_duration, + ) + + @staticmethod + def _restore_elements( + flow_system: FlowSystem, + reference_structure: dict[str, Any], + arrays_dict: dict[str, xr.DataArray], + cls: type[FlowSystem], + ) -> None: + """Restore components, buses, and effects to FlowSystem.""" + from .effects import Effect + from .elements import Bus, Component + + # Restore components + for comp_label, comp_data in reference_structure.get('components', {}).items(): + component = cls._resolve_reference_structure(comp_data, arrays_dict) + if not isinstance(component, Component): + logger.critical(f'Restoring component {comp_label} failed.') + flow_system._add_components(component) + + # Restore buses + for bus_label, bus_data in reference_structure.get('buses', {}).items(): + bus = 
cls._resolve_reference_structure(bus_data, arrays_dict) + if not isinstance(bus, Bus): + logger.critical(f'Restoring bus {bus_label} failed.') + flow_system._add_buses(bus) + + # Restore effects + for effect_label, effect_data in reference_structure.get('effects', {}).items(): + effect = cls._resolve_reference_structure(effect_data, arrays_dict) + if not isinstance(effect, Effect): + logger.critical(f'Restoring effect {effect_label} failed.') + flow_system._add_effects(effect) + + @classmethod + def _restore_solution( + cls, + flow_system: FlowSystem, + ds: xr.Dataset, + reference_structure: dict[str, Any], + solution_var_names: dict[str, str], + ) -> None: + """Restore solution dataset if present.""" + if not reference_structure.get('has_solution', False) or not solution_var_names: + return + + # Use dataset subsetting (faster than individual ds[name] access) + solution_ds_names = list(solution_var_names.values()) + solution_ds = ds[solution_ds_names] + # Rename variables to remove 'solution|' prefix + rename_map = {ds_name: orig_name for orig_name, ds_name in solution_var_names.items()} + solution_ds = solution_ds.rename(rename_map) + # Rename 'solution_time' back to 'time' if present + if 'solution_time' in solution_ds.dims: + solution_ds = solution_ds.rename({'solution_time': 'time'}) + flow_system.solution = solution_ds + + @classmethod + def _restore_clustering( + cls, + flow_system: FlowSystem, + ds: xr.Dataset, + reference_structure: dict[str, Any], + config_var_names: list[str], + arrays_dict: dict[str, xr.DataArray], + fs_cls: type[FlowSystem], + ) -> None: + """Restore Clustering object if present.""" + if 'clustering' not in reference_structure: + return + + clustering_structure = json.loads(reference_structure['clustering']) + + # Collect clustering arrays (prefixed with 'clustering|') + clustering_arrays: dict[str, xr.DataArray] = {} + main_var_names: list[str] = [] + + for name in config_var_names: + if name.startswith(cls.CLUSTERING_PREFIX): + arr = ds[name] + arr_name = name[len(cls.CLUSTERING_PREFIX) :] + clustering_arrays[arr_name] = arr.rename(arr_name) + else: + main_var_names.append(name) + + clustering = fs_cls._resolve_reference_structure(clustering_structure, clustering_arrays) + flow_system.clustering = clustering + + # Reconstruct aggregated_data from FlowSystem's main data arrays + if clustering.aggregated_data is None and main_var_names: + from .core import drop_constant_arrays + + main_vars = {name: arrays_dict[name] for name in main_var_names} + clustering.aggregated_data = drop_constant_arrays(xr.Dataset(main_vars), dim='time') + + # Restore cluster_weight from clustering's representative_weights + if hasattr(clustering, 'representative_weights'): + flow_system.cluster_weight = clustering.representative_weights + + @staticmethod + def _restore_metadata( + flow_system: FlowSystem, + reference_structure: dict[str, Any], + cls: type[FlowSystem], + ) -> None: + """Restore carriers and variable categories.""" + from .structure import VariableCategory + + # Restore carriers if present + if 'carriers' in reference_structure: + carriers_structure = json.loads(reference_structure['carriers']) + for carrier_data in carriers_structure.values(): + carrier = cls._resolve_reference_structure(carrier_data, {}) + flow_system._carriers.add(carrier) + + # Restore variable categories if present + if 'variable_categories' in reference_structure: + categories_dict = json.loads(reference_structure['variable_categories']) + restored_categories: dict[str, VariableCategory] = {} + 
for name, value in categories_dict.items(): + try: + restored_categories[name] = VariableCategory(value) + except ValueError: + logger.warning(f'Unknown VariableCategory value "{value}" for "{name}", skipping') + flow_system._variable_categories = restored_categories + + # --- Serialization (FlowSystem -> Dataset) --- + + @classmethod + def to_dataset( + cls, + flow_system: FlowSystem, + base_dataset: xr.Dataset, + include_solution: bool = True, + include_original_data: bool = True, + ) -> xr.Dataset: + """Convert FlowSystem-specific data to dataset. + + This function adds FlowSystem-specific data (solution, clustering, metadata) + to a base dataset created by the parent class's to_dataset() method. + + Args: + flow_system: The FlowSystem to serialize + base_dataset: Dataset from parent class with basic structure + include_solution: Whether to include optimization solution + include_original_data: Whether to include clustering.original_data + + Returns: + Complete dataset with all FlowSystem data + """ + from . import __version__ + + ds = base_dataset + + # Add solution data + ds = cls._add_solution_to_dataset(ds, flow_system.solution, include_solution) + + # Add carriers + ds = cls._add_carriers_to_dataset(ds, flow_system._carriers) + + # Add clustering + ds = cls._add_clustering_to_dataset(ds, flow_system.clustering, include_original_data) + + # Add variable categories + ds = cls._add_variable_categories_to_dataset(ds, flow_system._variable_categories) + + # Add version info + ds.attrs['flixopt_version'] = __version__ + + # Ensure model coordinates are present + ds = cls._add_model_coords(ds, flow_system) + + return ds + + @classmethod + def _add_solution_to_dataset( + cls, + ds: xr.Dataset, + solution: xr.Dataset | None, + include_solution: bool, + ) -> xr.Dataset: + """Add solution variables to dataset. + + Uses ds.variables directly for fast serialization (avoids _construct_dataarray). 
+ """ + if include_solution and solution is not None: + # Rename 'time' to 'solution_time' to preserve full solution + solution_renamed = solution.rename({'time': 'solution_time'}) if 'time' in solution.dims else solution + + # Use ds.variables directly to avoid slow _construct_dataarray calls + # Only include data variables (not coordinates) + data_var_names = set(solution_renamed.data_vars) + solution_vars = { + f'{cls.SOLUTION_PREFIX}{name}': var + for name, var in solution_renamed.variables.items() + if name in data_var_names + } + ds = ds.assign(solution_vars) + + # Add solution_time coordinate if it exists + if 'solution_time' in solution_renamed.coords: + ds = ds.assign_coords(solution_time=solution_renamed.coords['solution_time']) + + ds.attrs['has_solution'] = True + else: + ds.attrs['has_solution'] = False + + return ds + + @staticmethod + def _add_carriers_to_dataset(ds: xr.Dataset, carriers: Any) -> xr.Dataset: + """Add carrier definitions to dataset attributes.""" + if carriers: + carriers_structure = {} + for name, carrier in carriers.items(): + carrier_ref, _ = carrier._create_reference_structure() + carriers_structure[name] = carrier_ref + ds.attrs['carriers'] = json.dumps(carriers_structure) + + return ds + + @classmethod + def _add_clustering_to_dataset( + cls, + ds: xr.Dataset, + clustering: Any, + include_original_data: bool, + ) -> xr.Dataset: + """Add clustering object to dataset.""" + if clustering is not None: + clustering_ref, clustering_arrays = clustering._create_reference_structure( + include_original_data=include_original_data + ) + # Add clustering arrays with prefix using batch assignment + # (individual ds[name] = arr assignments are slow) + prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()} + ds = ds.assign(prefixed_arrays) + ds.attrs['clustering'] = json.dumps(clustering_ref) + + return ds + + @staticmethod + def _add_variable_categories_to_dataset( + ds: xr.Dataset, + variable_categories: dict, + ) -> xr.Dataset: + """Add variable categories to dataset attributes.""" + if variable_categories: + categories_dict = {name: cat.value for name, cat in variable_categories.items()} + ds.attrs['variable_categories'] = json.dumps(categories_dict) + + return ds + + @staticmethod + def _add_model_coords(ds: xr.Dataset, flow_system: FlowSystem) -> xr.Dataset: + """Ensure model coordinates are present in dataset.""" + model_coords = {'time': flow_system.timesteps} + if flow_system.periods is not None: + model_coords['period'] = flow_system.periods + if flow_system.scenarios is not None: + model_coords['scenario'] = flow_system.scenarios + if flow_system.clusters is not None: + model_coords['cluster'] = flow_system.clusters + + return ds.assign_coords(model_coords) + + +# ============================================================================= +# Public API Functions (delegate to FlowSystemDatasetIO class) +# ============================================================================= + + +def restore_flow_system_from_dataset(ds: xr.Dataset) -> FlowSystem: + """Create FlowSystem from dataset. + + This is the main entry point for dataset restoration. + Called by FlowSystem.from_dataset(). 
+ + Args: + ds: Dataset containing the FlowSystem data + + Returns: + FlowSystem instance with all components, buses, effects, and solution restored + + See Also: + FlowSystemDatasetIO: Class containing the implementation + """ + return FlowSystemDatasetIO.from_dataset(ds) + + +def flow_system_to_dataset( + flow_system: FlowSystem, + base_dataset: xr.Dataset, + include_solution: bool = True, + include_original_data: bool = True, +) -> xr.Dataset: + """Convert FlowSystem-specific data to dataset. + + This function adds FlowSystem-specific data (solution, clustering, metadata) + to a base dataset created by the parent class's to_dataset() method. + + Args: + flow_system: The FlowSystem to serialize + base_dataset: Dataset from parent class with basic structure + include_solution: Whether to include optimization solution + include_original_data: Whether to include clustering.original_data + + Returns: + Complete dataset with all FlowSystem data + + See Also: + FlowSystemDatasetIO: Class containing the implementation + """ + return FlowSystemDatasetIO.to_dataset(flow_system, base_dataset, include_solution, include_original_data) diff --git a/flixopt/modeling.py b/flixopt/modeling.py index a0abeec77..ff84c808f 100644 --- a/flixopt/modeling.py +++ b/flixopt/modeling.py @@ -6,7 +6,7 @@ import xarray as xr from .config import CONFIG -from .structure import Submodel +from .structure import Submodel, VariableCategory logger = logging.getLogger('flixopt') @@ -76,6 +76,27 @@ def _scalar_safe_reduce(data: xr.DataArray | Any, dim: str, method: str = 'mean' return data +def _xr_allclose(a: xr.DataArray, b: xr.DataArray, rtol: float = 1e-5, atol: float = 1e-8) -> bool: + """Check if two DataArrays are element-wise equal within tolerance. + + Args: + a: First DataArray + b: Second DataArray + rtol: Relative tolerance (default matches np.allclose) + atol: Absolute tolerance (default matches np.allclose) + + Returns: + True if all elements are close (including matching NaN positions) + """ + # Fast path: same dims and shape - use numpy directly + if a.dims == b.dims and a.shape == b.shape: + return np.allclose(a.values, b.values, rtol=rtol, atol=atol, equal_nan=True) + + # Slow path: broadcast to common shape, then use numpy + a_bc, b_bc = xr.broadcast(a, b) + return np.allclose(a_bc.values, b_bc.values, rtol=rtol, atol=atol, equal_nan=True) + + class ModelingUtilitiesAbstract: """Utility functions for modeling - leveraging xarray for temporal data""" @@ -270,6 +291,7 @@ def expression_tracking_variable( short_name: str = None, bounds: tuple[xr.DataArray, xr.DataArray] = None, coords: str | list[str] | None = None, + category: VariableCategory = None, ) -> tuple[linopy.Variable, linopy.Constraint]: """Creates a variable constrained to equal a given expression. @@ -284,6 +306,7 @@ def expression_tracking_variable( short_name: Short name for display purposes bounds: Optional (lower_bound, upper_bound) tuple for the tracker variable coords: Coordinate dimensions for the variable (None uses all model coords) + category: Category for segment expansion handling. See VariableCategory. 
Returns: Tuple of (tracker_variable, tracking_constraint) @@ -292,7 +315,9 @@ def expression_tracking_variable( raise ValueError('ModelingPrimitives.expression_tracking_variable() can only be used with a Submodel') if not bounds: - tracker = model.add_variables(name=name, coords=model.get_coords(coords), short_name=short_name) + tracker = model.add_variables( + name=name, coords=model.get_coords(coords), short_name=short_name, category=category + ) else: tracker = model.add_variables( lower=bounds[0] if bounds[0] is not None else -np.inf, @@ -300,6 +325,7 @@ def expression_tracking_variable( name=name, coords=model.get_coords(coords), short_name=short_name, + category=category, ) # Constraint: tracker = expression @@ -369,6 +395,7 @@ def consecutive_duration_tracking( coords=state.coords, name=name, short_name=short_name, + category=VariableCategory.DURATION, ) constraints = {} @@ -540,7 +567,7 @@ def bounds_with_state( lower_bound, upper_bound = bounds name = name or f'{variable.name}' - if np.allclose(lower_bound, upper_bound, atol=1e-10, equal_nan=True): + if _xr_allclose(lower_bound, upper_bound): fix_constraint = model.add_constraints(variable == state * upper_bound, name=f'{name}|fix') return [fix_constraint] @@ -582,7 +609,7 @@ def scaled_bounds( rel_lower, rel_upper = relative_bounds name = name or f'{variable.name}' - if np.allclose(rel_lower, rel_upper, atol=1e-10, equal_nan=True): + if _xr_allclose(rel_lower, rel_upper): return [model.add_constraints(variable == scaling_variable * rel_lower, name=f'{name}|fixed')] upper_constraint = model.add_constraints(variable <= scaling_variable * rel_upper, name=f'{name}|ub') diff --git a/flixopt/statistics_accessor.py b/flixopt/statistics_accessor.py index 90ad875b7..0092d4989 100644 --- a/flixopt/statistics_accessor.py +++ b/flixopt/statistics_accessor.py @@ -31,6 +31,7 @@ from .color_processing import ColorType, hex_to_rgba, process_colors from .config import CONFIG from .plot_result import PlotResult +from .structure import VariableCategory if TYPE_CHECKING: from .flow_system import FlowSystem @@ -523,12 +524,12 @@ def flow_rates(self) -> xr.Dataset: """ self._require_solution() if self._flow_rates is None: - flow_rate_vars = [v for v in self._fs.solution.data_vars if v.endswith('|flow_rate')] + flow_rate_vars = self._fs.get_variables_by_category(VariableCategory.FLOW_RATE) flow_carriers = self._fs.flow_carriers # Cached lookup carrier_units = self.carrier_units # Cached lookup data_vars = {} for v in flow_rate_vars: - flow_label = v.replace('|flow_rate', '') + flow_label = v.rsplit('|', 1)[0] # Extract label from 'label|flow_rate' da = self._fs.solution[v].copy() # Add carrier and unit as attributes carrier = flow_carriers.get(flow_label) @@ -567,11 +568,8 @@ def flow_sizes(self) -> xr.Dataset: """Flow sizes as a Dataset with flow labels as variable names.""" self._require_solution() if self._flow_sizes is None: - flow_labels = set(self._fs.flows.keys()) - size_vars = [ - v for v in self._fs.solution.data_vars if v.endswith('|size') and v.replace('|size', '') in flow_labels - ] - self._flow_sizes = xr.Dataset({v.replace('|size', ''): self._fs.solution[v] for v in size_vars}) + flow_size_vars = self._fs.get_variables_by_category(VariableCategory.FLOW_SIZE) + self._flow_sizes = xr.Dataset({v.rsplit('|', 1)[0]: self._fs.solution[v] for v in flow_size_vars}) return self._flow_sizes @property @@ -579,13 +577,8 @@ def storage_sizes(self) -> xr.Dataset: """Storage capacity sizes as a Dataset with storage labels as variable names.""" 
self._require_solution() if self._storage_sizes is None: - storage_labels = set(self._fs.storages.keys()) - size_vars = [ - v - for v in self._fs.solution.data_vars - if v.endswith('|size') and v.replace('|size', '') in storage_labels - ] - self._storage_sizes = xr.Dataset({v.replace('|size', ''): self._fs.solution[v] for v in size_vars}) + storage_size_vars = self._fs.get_variables_by_category(VariableCategory.STORAGE_SIZE) + self._storage_sizes = xr.Dataset({v.rsplit('|', 1)[0]: self._fs.solution[v] for v in storage_size_vars}) return self._storage_sizes @property @@ -600,10 +593,8 @@ def charge_states(self) -> xr.Dataset: """All storage charge states as a Dataset with storage labels as variable names.""" self._require_solution() if self._charge_states is None: - charge_vars = [v for v in self._fs.solution.data_vars if v.endswith('|charge_state')] - self._charge_states = xr.Dataset( - {v.replace('|charge_state', ''): self._fs.solution[v] for v in charge_vars} - ) + charge_vars = self._fs.get_variables_by_category(VariableCategory.CHARGE_STATE) + self._charge_states = xr.Dataset({v.rsplit('|', 1)[0]: self._fs.solution[v] for v in charge_vars}) return self._charge_states @property diff --git a/flixopt/structure.py b/flixopt/structure.py index 5333d37ae..d165667bb 100644 --- a/flixopt/structure.py +++ b/flixopt/structure.py @@ -13,6 +13,7 @@ import warnings from dataclasses import dataclass from difflib import get_close_matches +from enum import Enum from typing import ( TYPE_CHECKING, Any, @@ -78,6 +79,69 @@ def _ensure_coords( return data.broadcast_like(template) +class VariableCategory(Enum): + """Fine-grained variable categories - names mirror variable names. + + Each variable type has its own category for precise handling during + segment expansion and statistics calculation. 
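+
+    Example (illustrative; mirrors the value round-trip used when categories are
+    restored from a dataset):
+
+        >>> VariableCategory('flow_rate') is VariableCategory.FLOW_RATE
+        True
+        >>> VariableCategory.FLOW_RATE.value
+        'flow_rate'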
+ """ + + # === State variables === + CHARGE_STATE = 'charge_state' # Storage SOC (interpolate between boundaries) + SOC_BOUNDARY = 'soc_boundary' # Intercluster SOC boundaries + + # === Rate/Power variables === + FLOW_RATE = 'flow_rate' # Flow rate (kW) + NETTO_DISCHARGE = 'netto_discharge' # Storage net discharge + VIRTUAL_FLOW = 'virtual_flow' # Bus penalty slack variables + + # === Binary state === + STATUS = 'status' # On/off status (persists through segment) + INACTIVE = 'inactive' # Complementary inactive status + + # === Binary events === + STARTUP = 'startup' # Startup event + SHUTDOWN = 'shutdown' # Shutdown event + + # === Effect variables === + PER_TIMESTEP = 'per_timestep' # Effect per timestep + SHARE = 'share' # All temporal contributions (flow, active, startup) + TOTAL = 'total' # Effect total (per period/scenario) + TOTAL_OVER_PERIODS = 'total_over_periods' # Effect total over all periods + + # === Investment === + SIZE = 'size' # Generic investment size (for backwards compatibility) + FLOW_SIZE = 'flow_size' # Flow investment size + STORAGE_SIZE = 'storage_size' # Storage capacity size + INVESTED = 'invested' # Invested yes/no binary + + # === Counting/Duration === + STARTUP_COUNT = 'startup_count' # Count of startups + DURATION = 'duration' # Duration tracking (uptime/downtime) + + # === Piecewise linearization === + INSIDE_PIECE = 'inside_piece' # Binary segment selection + LAMBDA0 = 'lambda0' # Interpolation weight + LAMBDA1 = 'lambda1' # Interpolation weight + ZERO_POINT = 'zero_point' # Zero point handling + + # === Other === + OTHER = 'other' # Uncategorized + + +# === Logical Groupings for Segment Expansion === +# Default behavior (not listed): repeat value within segment + +EXPAND_INTERPOLATE: set[VariableCategory] = {VariableCategory.CHARGE_STATE} +"""State variables that should be interpolated between segment boundaries.""" + +EXPAND_DIVIDE: set[VariableCategory] = {VariableCategory.PER_TIMESTEP, VariableCategory.SHARE} +"""Segment totals that should be divided by expansion factor to preserve sums.""" + +EXPAND_FIRST_TIMESTEP: set[VariableCategory] = {VariableCategory.STARTUP, VariableCategory.SHUTDOWN} +"""Binary events that should appear only at the first timestep of the segment.""" + + CLASS_REGISTRY = {} @@ -135,6 +199,7 @@ def __init__(self, flow_system: FlowSystem): self.flow_system = flow_system self.effects: EffectCollectionModel | None = None self.submodels: Submodels = Submodels({}) + self.variable_categories: dict[str, VariableCategory] = {} def add_variables( self, @@ -833,8 +898,11 @@ def _resolve_dataarray_reference( array = arrays_dict[array_name] - # Handle null values with warning - if array.isnull().any(): + # Handle null values with warning (use numpy for performance - 200x faster than xarray) + has_nulls = (np.issubdtype(array.dtype, np.floating) and np.any(np.isnan(array.values))) or ( + array.dtype == object and pd.isna(array.values).any() + ) + if has_nulls: logger.error(f"DataArray '{array_name}' contains null values. 
Dropping all-null along present dims.") if 'time' in array.dims: array = array.dropna(dim='time', how='all') @@ -1048,7 +1116,17 @@ def from_dataset(cls, ds: xr.Dataset) -> Interface: reference_structure.pop('__class__', None) # Create arrays dictionary from dataset variables - arrays_dict = {name: array for name, array in ds.data_vars.items()} + # Use ds.variables with coord_cache for faster DataArray construction + variables = ds.variables + coord_cache = {k: ds.coords[k] for k in ds.coords} + arrays_dict = { + name: xr.DataArray( + variables[name], + coords={k: coord_cache[k] for k in variables[name].dims if k in coord_cache}, + name=name, + ) + for name in ds.data_vars + } # Resolve all references using the centralized method resolved_params = cls._resolve_reference_structure(reference_structure, arrays_dict) @@ -1659,8 +1737,22 @@ def __init__(self, model: FlowSystemModel, label_of_element: str, label_of_model logger.debug(f'Creating {self.__class__.__name__} "{self.label_full}"') self._do_modeling() - def add_variables(self, short_name: str = None, **kwargs) -> linopy.Variable: - """Create and register a variable in one step""" + def add_variables( + self, + short_name: str = None, + category: VariableCategory = None, + **kwargs: Any, + ) -> linopy.Variable: + """Create and register a variable in one step. + + Args: + short_name: Short name for the variable (used as suffix in full name). + category: Category for segment expansion handling. See VariableCategory. + **kwargs: Additional arguments passed to linopy.Model.add_variables(). + + Returns: + The created linopy Variable. + """ if kwargs.get('name') is None: if short_name is None: raise ValueError('Short name must be provided when no name is given') @@ -1668,6 +1760,11 @@ def add_variables(self, short_name: str = None, **kwargs) -> linopy.Variable: variable = self._model.add_variables(**kwargs) self.register_variable(variable, short_name) + + # Register category in FlowSystemModel for segment expansion handling + if category is not None: + self._model.variable_categories[variable.name] = category + return variable def add_constraints(self, expression, short_name: str = None, **kwargs) -> linopy.Constraint: diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 854b23525..e5bdb360b 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -17,8 +17,12 @@ import xarray as xr from .modeling import _scalar_safe_reduce +from .structure import EXPAND_DIVIDE, EXPAND_INTERPOLATE, VariableCategory if TYPE_CHECKING: + from tsam.config import ClusterConfig, ExtremeConfig, SegmentConfig + + from .clustering import Clustering from .flow_system import FlowSystem logger = logging.getLogger('flixopt') @@ -80,6 +84,577 @@ def _calculate_clustering_weights(ds) -> dict[str, float]: return weights + @staticmethod + def _build_cluster_config_with_weights( + cluster: ClusterConfig | None, + auto_weights: dict[str, float], + ) -> ClusterConfig: + """Merge auto-calculated weights into ClusterConfig. + + Args: + cluster: Optional user-provided ClusterConfig. + auto_weights: Automatically calculated weights based on data variance. + + Returns: + ClusterConfig with weights set (either user-provided or auto-calculated). 
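+
+        Example (illustrative sketch; ``TransformAccessor`` stands in for this
+        accessor class, and tsam's ``ClusterConfig`` defaults are assumed):
+
+            >>> from tsam.config import ClusterConfig
+            >>> auto = {'demand': 2.0, 'price': 1.0}
+            >>> TransformAccessor._build_cluster_config_with_weights(None, auto).weights == auto
+            True
+            >>> user = ClusterConfig(weights={'demand': 5.0})
+            >>> TransformAccessor._build_cluster_config_with_weights(user, auto) is user
+            True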
+ """ + from tsam.config import ClusterConfig + + # User provided ClusterConfig with weights - use as-is + if cluster is not None and cluster.weights is not None: + return cluster + + # No ClusterConfig provided - use defaults with auto-calculated weights + if cluster is None: + return ClusterConfig(weights=auto_weights) + + # ClusterConfig provided without weights - add auto-calculated weights + return ClusterConfig( + method=cluster.method, + representation=cluster.representation, + weights=auto_weights, + normalize_column_means=cluster.normalize_column_means, + use_duration_curves=cluster.use_duration_curves, + include_period_sums=cluster.include_period_sums, + solver=cluster.solver, + ) + + @staticmethod + def _accuracy_to_dataframe(accuracy) -> pd.DataFrame: + """Convert tsam AccuracyMetrics to DataFrame. + + Args: + accuracy: tsam AccuracyMetrics object. + + Returns: + DataFrame with RMSE, MAE, and RMSE_duration columns. + """ + return pd.DataFrame( + { + 'RMSE': accuracy.rmse, + 'MAE': accuracy.mae, + 'RMSE_duration': accuracy.rmse_duration, + } + ) + + def _build_cluster_weight_da( + self, + cluster_occurrences_all: dict[tuple, dict], + n_clusters: int, + cluster_coords: np.ndarray, + periods: list, + scenarios: list, + ) -> xr.DataArray: + """Build cluster_weight DataArray from occurrence counts. + + Args: + cluster_occurrences_all: Dict mapping (period, scenario) tuples to + dicts of {cluster_id: occurrence_count}. + n_clusters: Number of clusters. + cluster_coords: Cluster coordinate values. + periods: List of period labels ([None] if no periods dimension). + scenarios: List of scenario labels ([None] if no scenarios dimension). + + Returns: + DataArray with dims [cluster] or [cluster, period?, scenario?]. + """ + + def _weight_for_key(key: tuple) -> xr.DataArray: + occurrences = cluster_occurrences_all[key] + weights = np.array([occurrences.get(c, 1) for c in range(n_clusters)]) + return xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) + + weight_slices = {key: _weight_for_key(key) for key in cluster_occurrences_all} + return self._combine_slices_to_dataarray_generic( + weight_slices, ['cluster'], periods, scenarios, 'cluster_weight' + ) + + def _build_typical_das( + self, + tsam_aggregation_results: dict[tuple, Any], + actual_n_clusters: int, + n_time_points: int, + cluster_coords: np.ndarray, + time_coords: pd.DatetimeIndex | pd.RangeIndex, + is_segmented: bool = False, + ) -> dict[str, dict[tuple, xr.DataArray]]: + """Build typical periods DataArrays with (cluster, time) shape. + + Args: + tsam_aggregation_results: Dict mapping (period, scenario) to tsam results. + actual_n_clusters: Number of clusters. + n_time_points: Number of time points per cluster (timesteps or segments). + cluster_coords: Cluster coordinate values. + time_coords: Time coordinate values. + is_segmented: Whether segmentation was used. + + Returns: + Nested dict: {column_name: {(period, scenario): DataArray}}. 
+ """ + typical_das: dict[str, dict[tuple, xr.DataArray]] = {} + for key, tsam_result in tsam_aggregation_results.items(): + typical_df = tsam_result.cluster_representatives + if is_segmented: + # Segmented data: MultiIndex with cluster as first level + # Each cluster has exactly n_time_points rows (segments) + # Extract all data at once using numpy reshape, avoiding slow .loc calls + columns = typical_df.columns.tolist() + + # Get all values as numpy array: (n_clusters * n_time_points, n_columns) + all_values = typical_df.values + + # Reshape to (n_clusters, n_time_points, n_columns) + reshaped = all_values.reshape(actual_n_clusters, n_time_points, -1) + + for col_idx, col in enumerate(columns): + # reshaped[:, :, col_idx] selects all clusters, all time points, single column + # Result shape: (n_clusters, n_time_points) + typical_das.setdefault(col, {})[key] = xr.DataArray( + reshaped[:, :, col_idx], + dims=['cluster', 'time'], + coords={'cluster': cluster_coords, 'time': time_coords}, + ) + else: + # Non-segmented: flat data that can be reshaped + for col in typical_df.columns: + flat_data = typical_df[col].values + reshaped = flat_data.reshape(actual_n_clusters, n_time_points) + typical_das.setdefault(col, {})[key] = xr.DataArray( + reshaped, + dims=['cluster', 'time'], + coords={'cluster': cluster_coords, 'time': time_coords}, + ) + return typical_das + + def _build_segment_durations_da( + self, + tsam_aggregation_results: dict[tuple, Any], + actual_n_clusters: int, + n_segments: int, + cluster_coords: np.ndarray, + time_coords: pd.RangeIndex, + dt: float, + periods: list, + scenarios: list, + ) -> xr.DataArray: + """Build timestep_duration DataArray from segment durations. + + For segmented systems, each segment represents multiple original timesteps. + The duration is segment_duration_in_original_timesteps * dt (hours per original timestep). + + Args: + tsam_aggregation_results: Dict mapping (period, scenario) to tsam results. + actual_n_clusters: Number of clusters. + n_segments: Number of segments per cluster. + cluster_coords: Cluster coordinate values. + time_coords: Time coordinate values (RangeIndex for segments). + dt: Hours per original timestep. + periods: List of period labels ([None] if no periods dimension). + scenarios: List of scenario labels ([None] if no scenarios dimension). + + Returns: + DataArray with dims [cluster, time] or [cluster, time, period?, scenario?] + containing duration in hours for each segment. + """ + segment_duration_slices: dict[tuple, xr.DataArray] = {} + + for key, tsam_result in tsam_aggregation_results.items(): + # segment_durations is tuple of tuples: ((dur1, dur2, ...), (dur1, dur2, ...), ...) 
+ # Each inner tuple is durations for one cluster + seg_durs = tsam_result.segment_durations + + # Build 2D array (cluster, segment) of durations in hours + data = np.zeros((actual_n_clusters, n_segments)) + for cluster_id in range(actual_n_clusters): + cluster_seg_durs = seg_durs[cluster_id] + for seg_id in range(n_segments): + # Duration in hours = number of original timesteps * dt + data[cluster_id, seg_id] = cluster_seg_durs[seg_id] * dt + + segment_duration_slices[key] = xr.DataArray( + data, + dims=['cluster', 'time'], + coords={'cluster': cluster_coords, 'time': time_coords}, + ) + + return self._combine_slices_to_dataarray_generic( + segment_duration_slices, ['cluster', 'time'], periods, scenarios, 'timestep_duration' + ) + + def _build_clustering_metrics( + self, + clustering_metrics_all: dict[tuple, pd.DataFrame], + periods: list, + scenarios: list, + ) -> xr.Dataset: + """Build clustering metrics Dataset from per-slice DataFrames. + + Args: + clustering_metrics_all: Dict mapping (period, scenario) to metric DataFrames. + periods: List of period labels ([None] if no periods dimension). + scenarios: List of scenario labels ([None] if no scenarios dimension). + + Returns: + Dataset with RMSE, MAE, RMSE_duration metrics. + """ + non_empty_metrics = {k: v for k, v in clustering_metrics_all.items() if not v.empty} + + if not non_empty_metrics: + return xr.Dataset() + + first_key = (periods[0], scenarios[0]) + + if len(clustering_metrics_all) == 1 and len(non_empty_metrics) == 1: + metrics_df = non_empty_metrics.get(first_key) + if metrics_df is None: + metrics_df = next(iter(non_empty_metrics.values())) + return xr.Dataset( + { + col: xr.DataArray( + metrics_df[col].values, + dims=['time_series'], + coords={'time_series': metrics_df.index}, + ) + for col in metrics_df.columns + } + ) + + # Multi-dim case + sample_df = next(iter(non_empty_metrics.values())) + metric_names = list(sample_df.columns) + data_vars = {} + + for metric in metric_names: + slices = {} + for (p, s), df in clustering_metrics_all.items(): + if df.empty: + slices[(p, s)] = xr.DataArray( + np.full(len(sample_df.index), np.nan), + dims=['time_series'], + coords={'time_series': list(sample_df.index)}, + ) + else: + slices[(p, s)] = xr.DataArray( + df[metric].values, + dims=['time_series'], + coords={'time_series': list(df.index)}, + ) + data_vars[metric] = self._combine_slices_to_dataarray_generic( + slices, ['time_series'], periods, scenarios, metric + ) + + return xr.Dataset(data_vars) + + def _build_reduced_flow_system( + self, + ds: xr.Dataset, + tsam_aggregation_results: dict[tuple, Any], + cluster_occurrences_all: dict[tuple, dict], + clustering_metrics_all: dict[tuple, pd.DataFrame], + timesteps_per_cluster: int, + dt: float, + periods: list, + scenarios: list, + n_clusters_requested: int | None = None, + ) -> FlowSystem: + """Build a reduced FlowSystem from tsam aggregation results. + + This is the shared implementation used by both cluster() and apply_clustering(). + + Args: + ds: Original dataset. + tsam_aggregation_results: Dict mapping (period, scenario) to tsam AggregationResult. + cluster_occurrences_all: Dict mapping (period, scenario) to cluster occurrence counts. + clustering_metrics_all: Dict mapping (period, scenario) to accuracy metrics. + timesteps_per_cluster: Number of timesteps per cluster. + dt: Hours per timestep. + periods: List of period labels ([None] if no periods). + scenarios: List of scenario labels ([None] if no scenarios). 
n_clusters_requested: Requested number of clusters (for logging). None to skip.
+
+        Returns:
+            Reduced FlowSystem with clustering metadata attached.
+        """
+        from .clustering import Clustering
+        from .core import drop_constant_arrays
+        from .flow_system import FlowSystem
+
+        has_periods = periods != [None]
+        has_scenarios = scenarios != [None]
+
+        # Build dim_names for Clustering
+        dim_names = []
+        if has_periods:
+            dim_names.append('period')
+        if has_scenarios:
+            dim_names.append('scenario')
+
+        # Build dict keyed by (period?, scenario?) tuples (without None)
+        aggregation_results: dict[tuple, Any] = {}
+        for (p, s), result in tsam_aggregation_results.items():
+            key_parts = []
+            if has_periods:
+                key_parts.append(p)
+            if has_scenarios:
+                key_parts.append(s)
+            key = tuple(key_parts)
+            aggregation_results[key] = result
+
+        # Use first result for structure
+        first_key = (periods[0], scenarios[0])
+        first_tsam = tsam_aggregation_results[first_key]
+
+        # Build metrics
+        clustering_metrics = self._build_clustering_metrics(clustering_metrics_all, periods, scenarios)
+
+        n_reduced_timesteps = len(first_tsam.cluster_representatives)
+        actual_n_clusters = len(first_tsam.cluster_weights)
+
+        # Create coordinates for the 2D cluster structure
+        cluster_coords = np.arange(actual_n_clusters)
+
+        # Detect if segmentation was used
+        is_segmented = first_tsam.n_segments is not None
+        n_segments = first_tsam.n_segments if is_segmented else None
+
+        # Determine time dimension based on segmentation
+        if is_segmented:
+            n_time_points = n_segments
+            time_coords = pd.RangeIndex(n_time_points, name='time')
+        else:
+            n_time_points = timesteps_per_cluster
+            time_coords = pd.date_range(
+                start='2000-01-01',
+                periods=timesteps_per_cluster,
+                freq=pd.Timedelta(hours=dt),
+                name='time',
+            )
+
+        # Build cluster_weight
+        cluster_weight = self._build_cluster_weight_da(
+            cluster_occurrences_all, actual_n_clusters, cluster_coords, periods, scenarios
+        )
+
+        # Logging
+        if is_segmented:
+            logger.info(
+                f'Reduced from {len(self._fs.timesteps)} to {actual_n_clusters} clusters × {n_segments} segments'
+            )
+        else:
+            logger.info(
+                f'Reduced from {len(self._fs.timesteps)} to {actual_n_clusters} clusters × {timesteps_per_cluster} timesteps'
+            )
+        if n_clusters_requested is not None:
+            logger.info(f'Clusters: {actual_n_clusters} (requested: {n_clusters_requested})')
+
+        # Build typical periods DataArrays with (cluster, time) shape
+        typical_das = self._build_typical_das(
+            tsam_aggregation_results, actual_n_clusters, n_time_points, cluster_coords, time_coords, is_segmented
+        )
+
+        # Build reduced dataset with (cluster, time) dimensions
+        ds_new = self._build_reduced_dataset(
+            ds,
+            typical_das,
+            actual_n_clusters,
+            n_reduced_timesteps,
+            n_time_points,
+            cluster_coords,
+            time_coords,
+            periods,
+            scenarios,
+        )
+
+        # For segmented systems, build timestep_duration from segment_durations
+        if is_segmented:
+            segment_durations = self._build_segment_durations_da(
+                tsam_aggregation_results,
+                actual_n_clusters,
+                n_segments,
+                cluster_coords,
+                time_coords,
+                dt,
+                periods,
+                scenarios,
+            )
+            ds_new['timestep_duration'] = segment_durations
+
+        reduced_fs = FlowSystem.from_dataset(ds_new)
+        reduced_fs.cluster_weight = cluster_weight
+
+        # Remove 'equals_final' from storages - doesn't make sense on reduced timesteps
+        for storage in reduced_fs.storages.values():
+            ics = storage.initial_charge_state
+            if isinstance(ics, str) and ics == 'equals_final':
+                storage.initial_charge_state = None
+
+        # Create Clustering object with full AggregationResult access
+        # Only store time-varying data (constant arrays are clutter for plotting)
+
reduced_fs.clustering = Clustering( + original_timesteps=self._fs.timesteps, + original_data=drop_constant_arrays(ds, dim='time'), + aggregated_data=drop_constant_arrays(ds_new, dim='time'), + _metrics=clustering_metrics if clustering_metrics.data_vars else None, + _aggregation_results=aggregation_results, + _dim_names=dim_names, + ) + + return reduced_fs + + def _build_reduced_dataset( + self, + ds: xr.Dataset, + typical_das: dict[str, dict[tuple, xr.DataArray]], + actual_n_clusters: int, + n_reduced_timesteps: int, + n_time_points: int, + cluster_coords: np.ndarray, + time_coords: pd.DatetimeIndex | pd.RangeIndex, + periods: list, + scenarios: list, + ) -> xr.Dataset: + """Build the reduced dataset with (cluster, time) structure. + + Args: + ds: Original dataset. + typical_das: Typical periods DataArrays from _build_typical_das(). + actual_n_clusters: Number of clusters. + n_reduced_timesteps: Total reduced timesteps (n_clusters * n_time_points). + n_time_points: Number of time points per cluster (timesteps or segments). + cluster_coords: Cluster coordinate values. + time_coords: Time coordinate values. + periods: List of period labels. + scenarios: List of scenario labels. + + Returns: + Dataset with reduced timesteps and (cluster, time) structure. + """ + from .core import TimeSeriesData + + all_keys = {(p, s) for p in periods for s in scenarios} + ds_new_vars = {} + + # Use ds.variables to avoid _construct_dataarray overhead + variables = ds.variables + coord_cache = {k: ds.coords[k].values for k in ds.coords} + + for name in ds.data_vars: + var = variables[name] + if 'time' not in var.dims: + # No time dimension - wrap Variable in DataArray + coords = {d: coord_cache[d] for d in var.dims if d in coord_cache} + ds_new_vars[name] = xr.DataArray(var.values, dims=var.dims, coords=coords, attrs=var.attrs, name=name) + elif name not in typical_das: + # Time-dependent but constant: reshape to (cluster, time, ...) 
+ # Use numpy slicing instead of .isel() + time_idx = var.dims.index('time') + slices = [slice(None)] * len(var.dims) + slices[time_idx] = slice(0, n_reduced_timesteps) + sliced_values = var.values[tuple(slices)] + + other_dims = [d for d in var.dims if d != 'time'] + other_shape = [var.sizes[d] for d in other_dims] + new_shape = [actual_n_clusters, n_time_points] + other_shape + reshaped = sliced_values.reshape(new_shape) + new_coords = {'cluster': cluster_coords, 'time': time_coords} + for dim in other_dims: + if dim in coord_cache: + new_coords[dim] = coord_cache[dim] + ds_new_vars[name] = xr.DataArray( + reshaped, + dims=['cluster', 'time'] + other_dims, + coords=new_coords, + attrs=var.attrs, + ) + elif set(typical_das[name].keys()) != all_keys: + # Partial typical slices: fill missing keys with constant values + time_idx = var.dims.index('time') + slices_list = [slice(None)] * len(var.dims) + slices_list[time_idx] = slice(0, n_reduced_timesteps) + sliced_values = var.values[tuple(slices_list)] + + other_dims = [d for d in var.dims if d != 'time'] + other_shape = [var.sizes[d] for d in other_dims] + new_shape = [actual_n_clusters, n_time_points] + other_shape + reshaped_constant = sliced_values.reshape(new_shape) + + new_coords = {'cluster': cluster_coords, 'time': time_coords} + for dim in other_dims: + if dim in coord_cache: + new_coords[dim] = coord_cache[dim] + + # Build filled slices dict: use typical where available, constant otherwise + filled_slices = {} + for key in all_keys: + if key in typical_das[name]: + filled_slices[key] = typical_das[name][key] + else: + filled_slices[key] = xr.DataArray( + reshaped_constant, + dims=['cluster', 'time'] + other_dims, + coords=new_coords, + ) + + da = self._combine_slices_to_dataarray_2d( + slices=filled_slices, + attrs=var.attrs, + periods=periods, + scenarios=scenarios, + ) + if var.attrs.get('__timeseries_data__', False): + da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) + ds_new_vars[name] = da + else: + # Time-varying: combine per-(period, scenario) slices + da = self._combine_slices_to_dataarray_2d( + slices=typical_das[name], + attrs=var.attrs, + periods=periods, + scenarios=scenarios, + ) + if var.attrs.get('__timeseries_data__', False): + da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) + ds_new_vars[name] = da + + # Copy attrs but remove cluster_weight + new_attrs = dict(ds.attrs) + new_attrs.pop('cluster_weight', None) + return xr.Dataset(ds_new_vars, attrs=new_attrs) + + def _build_cluster_assignments_da( + self, + cluster_assignmentss: dict[tuple, np.ndarray], + periods: list, + scenarios: list, + ) -> xr.DataArray: + """Build cluster_assignments DataArray from cluster assignments. + + Args: + cluster_assignmentss: Dict mapping (period, scenario) to cluster assignment arrays. + periods: List of period labels ([None] if no periods dimension). + scenarios: List of scenario labels ([None] if no scenarios dimension). + + Returns: + DataArray with dims [original_cluster] or [original_cluster, period?, scenario?]. 
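+
+        Example:
+            With ``periods=[None]``, ``scenarios=[None]`` and
+            ``cluster_assignmentss={(None, None): np.array([0, 1, 0, 2])}``, the result
+            is a 1-D DataArray over ``original_cluster`` assigning original periods
+            0 and 2 to cluster 0, period 1 to cluster 1, and period 3 to cluster 2.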
+ """ + has_periods = periods != [None] + has_scenarios = scenarios != [None] + + if has_periods or has_scenarios: + # Multi-dimensional case + cluster_assignments_slices = {} + for p in periods: + for s in scenarios: + key = (p, s) + cluster_assignments_slices[key] = xr.DataArray( + cluster_assignmentss[key], dims=['original_cluster'], name='cluster_assignments' + ) + return self._combine_slices_to_dataarray_generic( + cluster_assignments_slices, ['original_cluster'], periods, scenarios, 'cluster_assignments' + ) + else: + # Simple case + first_key = (periods[0], scenarios[0]) + return xr.DataArray(cluster_assignmentss[first_key], dims=['original_cluster'], name='cluster_assignments') + def sel( self, time: str | slice | list[str] | pd.Timestamp | pd.DatetimeIndex | None = None, @@ -603,20 +1178,97 @@ def fix_sizes( return new_fs + def clustering_data( + self, + period: Any | None = None, + scenario: Any | None = None, + ) -> xr.Dataset: + """ + Get the time-varying data that would be used for clustering. + + This method extracts only the data arrays that vary over time, which is + the data that clustering algorithms use to identify typical periods. + Constant arrays (same value for all timesteps) are excluded since they + don't contribute to pattern identification. + + Use this to inspect or pre-process the data before clustering, or to + understand which variables influence the clustering result. + + Args: + period: Optional period label to select. If None and the FlowSystem + has multiple periods, returns data for all periods. + scenario: Optional scenario label to select. If None and the FlowSystem + has multiple scenarios, returns data for all scenarios. + + Returns: + xr.Dataset containing only time-varying data arrays. The dataset + includes arrays like demand profiles, price profiles, and other + time series that vary over the time dimension. + + Examples: + Inspect clustering input data: + + >>> data = flow_system.transform.clustering_data() + >>> print(f'Variables used for clustering: {list(data.data_vars)}') + >>> data['HeatDemand(Q)|fixed_relative_profile'].plot() + + Get data for a specific period/scenario: + + >>> data_2024 = flow_system.transform.clustering_data(period=2024) + >>> data_high = flow_system.transform.clustering_data(scenario='high') + + Convert to DataFrame for external tools: + + >>> df = flow_system.transform.clustering_data().to_dataframe() + """ + from .core import drop_constant_arrays + + if not self._fs.connected_and_transformed: + self._fs.connect_and_transform() + + ds = self._fs.to_dataset(include_solution=False) + + # Build selector for period/scenario + selector = {} + if period is not None: + selector['period'] = period + if scenario is not None: + selector['scenario'] = scenario + + # Apply selection if specified + if selector: + ds = ds.sel(**selector, drop=True) + + # Filter to only time-varying arrays + result = drop_constant_arrays(ds, dim='time') + + # Guard against empty dataset (all variables are constant) + if not result.data_vars: + selector_info = f' for {selector}' if selector else '' + raise ValueError( + f'No time-varying data found{selector_info}. ' + f'All variables are constant over time. Check your period/scenario filter or input data.' 
+ ) + + # Remove attrs for cleaner output + result.attrs = {} + for var in result.data_vars: + result[var].attrs = {} + + return result + def cluster( self, n_clusters: int, cluster_duration: str | float, - weights: dict[str, float] | None = None, - time_series_for_high_peaks: list[str] | None = None, - time_series_for_low_peaks: list[str] | None = None, - cluster_method: Literal['k_means', 'k_medoids', 'hierarchical', 'k_maxoids', 'averaging'] = 'hierarchical', - representation_method: Literal[ - 'meanRepresentation', 'medoidRepresentation', 'distributionAndMinMaxRepresentation' - ] = 'medoidRepresentation', - extreme_period_method: Literal['append', 'new_cluster_center', 'replace_cluster_center'] | None = None, - rescale_cluster_periods: bool = True, - predef_cluster_order: xr.DataArray | np.ndarray | list[int] | None = None, + data_vars: list[str] | None = None, + cluster: ClusterConfig | None = None, + extremes: ExtremeConfig | None = None, + segments: SegmentConfig | None = None, + preserve_column_means: bool = True, + rescale_exclude_columns: list[str] | None = None, + round_decimals: int | None = None, + numerical_tolerance: float = 1e-13, **tsam_kwargs: Any, ) -> FlowSystem: """ @@ -635,33 +1287,41 @@ def cluster( Use this for initial sizing optimization, then use ``fix_sizes()`` to re-optimize at full resolution for accurate dispatch results. + To reuse an existing clustering on different data, use ``apply_clustering()`` instead. + Args: n_clusters: Number of clusters (typical periods) to extract (e.g., 8 typical days). cluster_duration: Duration of each cluster. Can be a pandas-style string ('1D', '24h', '6h') or a numeric value in hours. - weights: Optional clustering weights per time series. Keys are time series labels. - time_series_for_high_peaks: Time series labels for explicitly selecting high-value - clusters. **Recommended** for demand time series to capture peak demand days. - time_series_for_low_peaks: Time series labels for explicitly selecting low-value clusters. - cluster_method: Clustering algorithm to use. Options: - ``'hierarchical'`` (default), ``'k_means'``, ``'k_medoids'``, - ``'k_maxoids'``, ``'averaging'``. - representation_method: How cluster representatives are computed. Options: - ``'medoidRepresentation'`` (default), ``'meanRepresentation'``, - ``'distributionAndMinMaxRepresentation'``. - extreme_period_method: How extreme periods (peaks) are integrated. Options: - ``None`` (default, no special handling), ``'append'``, - ``'new_cluster_center'``, ``'replace_cluster_center'``. - rescale_cluster_periods: If True (default), rescale cluster periods so their - weighted mean matches the original time series mean. - predef_cluster_order: Predefined cluster assignments for manual clustering. - Array of cluster indices (0 to n_clusters-1) for each original period. - If provided, clustering is skipped and these assignments are used directly. - For multi-dimensional FlowSystems, use an xr.DataArray with dims - ``[original_cluster, period?, scenario?]`` to specify different assignments - per period/scenario combination. - **tsam_kwargs: Additional keyword arguments passed to - ``tsam.TimeSeriesAggregation``. See tsam documentation for all options. + data_vars: Optional list of variable names to use for clustering. If specified, + only these variables are used to determine cluster assignments, but the + clustering is then applied to ALL time-varying data in the FlowSystem. + Use ``transform.clustering_data()`` to see available variables. 
+ Example: ``data_vars=['HeatDemand(Q)|fixed_relative_profile']`` to cluster + based only on heat demand patterns. + cluster: Optional tsam ``ClusterConfig`` object specifying clustering algorithm, + representation method, and weights. If None, uses default settings (hierarchical + clustering with medoid representation) and automatically calculated weights + based on data variance. + extremes: Optional tsam ``ExtremeConfig`` object specifying how to handle + extreme periods (peaks). Use this to ensure peak demand days are captured. + Example: ``ExtremeConfig(method='new_cluster', max_value=['demand'])``. + segments: Optional tsam ``SegmentConfig`` object specifying intra-period + segmentation. Segments divide each cluster period into variable-duration + sub-segments. Example: ``SegmentConfig(n_segments=4)``. + preserve_column_means: Rescale typical periods so each column's weighted mean + matches the original data's mean. Ensures total energy/load is preserved + when weights represent occurrence counts. Default is True. + rescale_exclude_columns: Column names to exclude from rescaling when + ``preserve_column_means=True``. Useful for binary/indicator columns (0/1 values) + that should not be rescaled. + round_decimals: Round output values to this many decimal places. + If None (default), no rounding is applied. + numerical_tolerance: Tolerance for numerical precision issues. Controls when + warnings are raised for aggregated values exceeding original time series bounds. + Default is 1e-13. + **tsam_kwargs: Additional keyword arguments passed to ``tsam.aggregate()`` + for forward compatibility. See tsam documentation for all options. Returns: A new FlowSystem with reduced timesteps (only typical clusters). @@ -672,37 +1332,42 @@ def cluster( ValueError: If cluster_duration is not a multiple of timestep size. Examples: - Two-stage sizing optimization: + Basic clustering with peak preservation: - >>> # Stage 1: Size with reduced timesteps (fast) - >>> fs_sizing = flow_system.transform.cluster( + >>> from tsam.config import ExtremeConfig + >>> fs_clustered = flow_system.transform.cluster( ... n_clusters=8, ... cluster_duration='1D', - ... time_series_for_high_peaks=['HeatDemand(Q_th)|fixed_relative_profile'], + ... extremes=ExtremeConfig( + ... method='new_cluster', + ... max_value=['HeatDemand(Q_th)|fixed_relative_profile'], + ... ), ... ) - >>> fs_sizing.optimize(solver) - >>> - >>> # Apply safety margin (typical clusters may smooth peaks) - >>> sizes_with_margin = { - ... name: float(size.item()) * 1.05 for name, size in fs_sizing.statistics.sizes.items() - ... } + >>> fs_clustered.optimize(solver) + + Clustering based on specific variables only: + + >>> # See available variables for clustering + >>> print(flow_system.transform.clustering_data().data_vars) >>> - >>> # Stage 2: Fix sizes and re-optimize at full resolution - >>> fs_dispatch = flow_system.transform.fix_sizes(sizes_with_margin) - >>> fs_dispatch.optimize(solver) + >>> # Cluster based only on demand profile + >>> fs_clustered = flow_system.transform.cluster( + ... n_clusters=8, + ... cluster_duration='1D', + ... data_vars=['HeatDemand(Q)|fixed_relative_profile'], + ... 
) Note: - This is best suited for initial sizing, not final dispatch optimization - - Use ``time_series_for_high_peaks`` to ensure peak demand clusters are captured + - Use ``extremes`` to ensure peak demand clusters are captured - A 5-10% safety margin on sizes is recommended for the dispatch stage - For seasonal storage (e.g., hydrogen, thermal storage), set ``Storage.cluster_mode='intercluster'`` or ``'intercluster_cyclic'`` """ - import tsam.timeseriesaggregation as tsam + import tsam - from .clustering import Clustering, ClusterResult, ClusterStructure - from .core import TimeSeriesData, drop_constant_arrays - from .flow_system import FlowSystem + from .clustering import ClusteringResults + from .core import drop_constant_arrays # Parse cluster_duration to hours hours_per_cluster = ( @@ -731,19 +1396,30 @@ def cluster( ds = self._fs.to_dataset(include_solution=False) + # Validate and prepare data_vars for clustering + if data_vars is not None: + missing = set(data_vars) - set(ds.data_vars) + if missing: + raise ValueError( + f'data_vars not found in FlowSystem: {missing}. ' + f'Available time-varying variables can be found via transform.clustering_data().' + ) + ds_for_clustering = ds[list(data_vars)] + else: + ds_for_clustering = ds + # Validate tsam_kwargs doesn't override explicit parameters reserved_tsam_keys = { - 'noTypicalPeriods', - 'hoursPerPeriod', - 'resolution', - 'clusterMethod', - 'extremePeriodMethod', - 'representationMethod', - 'rescaleClusterPeriods', - 'predefClusterOrder', - 'weightDict', - 'addPeakMax', - 'addPeakMin', + 'n_clusters', + 'period_duration', # exposed as cluster_duration + 'timestep_duration', # computed automatically + 'cluster', + 'segments', + 'extremes', + 'preserve_column_means', + 'rescale_exclude_columns', + 'round_decimals', + 'numerical_tolerance', } conflicts = reserved_tsam_keys & set(tsam_kwargs.keys()) if conflicts: @@ -752,22 +1428,10 @@ def cluster( f'Use the corresponding cluster() parameters instead.' ) - # Validate predef_cluster_order dimensions if it's a DataArray - if isinstance(predef_cluster_order, xr.DataArray): - expected_dims = {'original_cluster'} - if has_periods: - expected_dims.add('period') - if has_scenarios: - expected_dims.add('scenario') - if set(predef_cluster_order.dims) != expected_dims: - raise ValueError( - f'predef_cluster_order dimensions {set(predef_cluster_order.dims)} ' - f'do not match expected {expected_dims} for this FlowSystem.' 
- ) - # Cluster each (period, scenario) combination using tsam directly - tsam_results: dict[tuple, tsam.TimeSeriesAggregation] = {} - cluster_orders: dict[tuple, np.ndarray] = {} + tsam_aggregation_results: dict[tuple, Any] = {} # AggregationResult objects + tsam_clustering_results: dict[tuple, Any] = {} # ClusteringResult objects for persistence + cluster_assignmentss: dict[tuple, np.ndarray] = {} cluster_occurrences_all: dict[tuple, dict] = {} # Collect metrics per (period, scenario) slice @@ -777,370 +1441,225 @@ def cluster( for scenario_label in scenarios: key = (period_label, scenario_label) selector = {k: v for k, v in [('period', period_label), ('scenario', scenario_label)] if v is not None} - ds_slice = ds.sel(**selector, drop=True) if selector else ds - temporaly_changing_ds = drop_constant_arrays(ds_slice, dim='time') - df = temporaly_changing_ds.to_dataframe() + + # Select data for clustering (may be subset if data_vars specified) + ds_slice_for_clustering = ( + ds_for_clustering.sel(**selector, drop=True) if selector else ds_for_clustering + ) + temporaly_changing_ds_for_clustering = drop_constant_arrays(ds_slice_for_clustering, dim='time') + + # Guard against empty dataset after removing constant arrays + if not temporaly_changing_ds_for_clustering.data_vars: + filter_info = f'data_vars={data_vars}' if data_vars else 'all variables' + selector_info = f', selector={selector}' if selector else '' + raise ValueError( + f'No time-varying data found for clustering ({filter_info}{selector_info}). ' + f'All variables are constant over time. Check your data_vars filter or input data.' + ) + + df_for_clustering = temporaly_changing_ds_for_clustering.to_dataframe() if selector: logger.info(f'Clustering {", ".join(f"{k}={v}" for k, v in selector.items())}...') - # Handle predef_cluster_order for multi-dimensional case - predef_order_slice = None - if predef_cluster_order is not None: - if isinstance(predef_cluster_order, xr.DataArray): - # Extract slice for this (period, scenario) combination - predef_order_slice = predef_cluster_order.sel(**selector, drop=True).values - else: - # Simple array/list - use directly - predef_order_slice = predef_cluster_order - - # Use tsam directly - clustering_weights = weights or self._calculate_clustering_weights(temporaly_changing_ds) - # tsam expects 'None' as a string, not Python None - tsam_extreme_method = 'None' if extreme_period_method is None else extreme_period_method - tsam_agg = tsam.TimeSeriesAggregation( - df, - noTypicalPeriods=n_clusters, - hoursPerPeriod=hours_per_cluster, - resolution=dt, - clusterMethod=cluster_method, - extremePeriodMethod=tsam_extreme_method, - representationMethod=representation_method, - rescaleClusterPeriods=rescale_cluster_periods, - predefClusterOrder=predef_order_slice, - weightDict={name: w for name, w in clustering_weights.items() if name in df.columns}, - addPeakMax=time_series_for_high_peaks or [], - addPeakMin=time_series_for_low_peaks or [], - **tsam_kwargs, - ) # Suppress tsam warning about minimal value constraints (informational, not actionable) with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*') - tsam_agg.createTypicalPeriods() - tsam_results[key] = tsam_agg - cluster_orders[key] = tsam_agg.clusterOrder - cluster_occurrences_all[key] = tsam_agg.clusterPeriodNoOccur - # Compute accuracy metrics with error handling + # Build ClusterConfig with auto-calculated weights + clustering_weights = 
self._calculate_clustering_weights(temporaly_changing_ds_for_clustering) + filtered_weights = { + name: w for name, w in clustering_weights.items() if name in df_for_clustering.columns + } + cluster_config = self._build_cluster_config_with_weights(cluster, filtered_weights) + + # Perform clustering based on selected data_vars (or all if not specified) + tsam_result = tsam.aggregate( + df_for_clustering, + n_clusters=n_clusters, + period_duration=hours_per_cluster, + timestep_duration=dt, + cluster=cluster_config, + extremes=extremes, + segments=segments, + preserve_column_means=preserve_column_means, + rescale_exclude_columns=rescale_exclude_columns, + round_decimals=round_decimals, + numerical_tolerance=numerical_tolerance, + **tsam_kwargs, + ) + + tsam_aggregation_results[key] = tsam_result + tsam_clustering_results[key] = tsam_result.clustering + cluster_assignmentss[key] = tsam_result.cluster_assignments + cluster_occurrences_all[key] = tsam_result.cluster_weights try: - clustering_metrics_all[key] = tsam_agg.accuracyIndicators() + clustering_metrics_all[key] = self._accuracy_to_dataframe(tsam_result.accuracy) except Exception as e: logger.warning(f'Failed to compute clustering metrics for {key}: {e}') clustering_metrics_all[key] = pd.DataFrame() - # Use first result for structure - first_key = (periods[0], scenarios[0]) - first_tsam = tsam_results[first_key] - - # Convert metrics to xr.Dataset with period/scenario dims if multi-dimensional - # Filter out empty DataFrames (from failed accuracyIndicators calls) - non_empty_metrics = {k: v for k, v in clustering_metrics_all.items() if not v.empty} - if not non_empty_metrics: - # All metrics failed - create empty Dataset - clustering_metrics = xr.Dataset() - elif len(non_empty_metrics) == 1 or len(clustering_metrics_all) == 1: - # Simple case: convert single DataFrame to Dataset - metrics_df = non_empty_metrics.get(first_key) - if metrics_df is None: - metrics_df = next(iter(non_empty_metrics.values())) - clustering_metrics = xr.Dataset( - { - col: xr.DataArray( - metrics_df[col].values, dims=['time_series'], coords={'time_series': metrics_df.index} - ) - for col in metrics_df.columns - } + # If data_vars was specified, apply clustering to FULL data + if data_vars is not None: + # Build dim_names for ClusteringResults + dim_names = [] + if has_periods: + dim_names.append('period') + if has_scenarios: + dim_names.append('scenario') + + # Convert (period, scenario) keys to ClusteringResults format + def to_cr_key(p, s): + key_parts = [] + if has_periods: + key_parts.append(p) + if has_scenarios: + key_parts.append(s) + return tuple(key_parts) + + # Build ClusteringResults from subset clustering + clustering_results = ClusteringResults( + {to_cr_key(p, s): cr for (p, s), cr in tsam_clustering_results.items()}, + dim_names, ) - else: - # Multi-dim case: combine metrics into Dataset with period/scenario dims - # First, get the metric columns from any non-empty DataFrame - sample_df = next(iter(non_empty_metrics.values())) - metric_names = list(sample_df.columns) - - # Build DataArrays for each metric - data_vars = {} - for metric in metric_names: - # Shape: (time_series, period?, scenario?) 
- # Each slice needs its own coordinates since different periods/scenarios - # may have different time series (after drop_constant_arrays) - slices = {} - for (p, s), df in clustering_metrics_all.items(): - if df.empty: - # Use NaN for failed metrics - use sample_df index as fallback - slices[(p, s)] = xr.DataArray( - np.full(len(sample_df.index), np.nan), - dims=['time_series'], - coords={'time_series': list(sample_df.index)}, - ) - else: - # Use this DataFrame's own index as coordinates - slices[(p, s)] = xr.DataArray( - df[metric].values, dims=['time_series'], coords={'time_series': list(df.index)} - ) - - da = self._combine_slices_to_dataarray_generic(slices, ['time_series'], periods, scenarios, metric) - data_vars[metric] = da - clustering_metrics = xr.Dataset(data_vars) - n_reduced_timesteps = len(first_tsam.typicalPeriods) - actual_n_clusters = len(first_tsam.clusterPeriodNoOccur) - - # ═══════════════════════════════════════════════════════════════════════ - # TRUE (cluster, time) DIMENSIONS - # ═══════════════════════════════════════════════════════════════════════ - # Create coordinates for the 2D cluster structure - cluster_coords = np.arange(actual_n_clusters) - # Use DatetimeIndex for time within cluster (e.g., 00:00-23:00 for daily clustering) - time_coords = pd.date_range( - start='2000-01-01', - periods=timesteps_per_cluster, - freq=pd.Timedelta(hours=dt), - name='time', + # Apply to full data - this returns AggregationResults + agg_results = clustering_results.apply(ds) + + # Update tsam_aggregation_results with full data results + for cr_key, result in agg_results: + # Convert back to (period, scenario) format + if has_periods and has_scenarios: + full_key = (cr_key[0], cr_key[1]) + elif has_periods: + full_key = (cr_key[0], None) + elif has_scenarios: + full_key = (None, cr_key[0]) + else: + full_key = (None, None) + tsam_aggregation_results[full_key] = result + cluster_occurrences_all[full_key] = result.cluster_weights + + # Build and return the reduced FlowSystem + return self._build_reduced_flow_system( + ds=ds, + tsam_aggregation_results=tsam_aggregation_results, + cluster_occurrences_all=cluster_occurrences_all, + clustering_metrics_all=clustering_metrics_all, + timesteps_per_cluster=timesteps_per_cluster, + dt=dt, + periods=periods, + scenarios=scenarios, + n_clusters_requested=n_clusters, ) - # Create cluster_weight: shape (cluster,) - one weight per cluster - # This is the number of original periods each cluster represents - def _build_cluster_weight_for_key(key: tuple) -> xr.DataArray: - occurrences = cluster_occurrences_all[key] - weights = np.array([occurrences.get(c, 1) for c in range(actual_n_clusters)]) - return xr.DataArray(weights, dims=['cluster'], coords={'cluster': cluster_coords}) - - # Build cluster_weight - use _combine_slices_to_dataarray_generic for multi-dim handling - weight_slices = {key: _build_cluster_weight_for_key(key) for key in cluster_occurrences_all} - cluster_weight = self._combine_slices_to_dataarray_generic( - weight_slices, ['cluster'], periods, scenarios, 'cluster_weight' - ) + def apply_clustering( + self, + clustering: Clustering, + ) -> FlowSystem: + """ + Apply an existing clustering to this FlowSystem. - logger.info( - f'Reduced from {len(self._fs.timesteps)} to {actual_n_clusters} clusters × {timesteps_per_cluster} timesteps' - ) - logger.info(f'Clusters: {actual_n_clusters} (requested: {n_clusters})') + This method applies a previously computed clustering (from another FlowSystem) + to the current FlowSystem's data. 
The clustering structure (cluster assignments, + number of clusters, etc.) is preserved while the time series data is aggregated + according to the existing cluster assignments. - # Build typical periods DataArrays with (cluster, time) shape - typical_das: dict[str, dict[tuple, xr.DataArray]] = {} - for key, tsam_agg in tsam_results.items(): - typical_df = tsam_agg.typicalPeriods - for col in typical_df.columns: - # Reshape flat data to (cluster, time) - flat_data = typical_df[col].values - reshaped = flat_data.reshape(actual_n_clusters, timesteps_per_cluster) - typical_das.setdefault(col, {})[key] = xr.DataArray( - reshaped, - dims=['cluster', 'time'], - coords={'cluster': cluster_coords, 'time': time_coords}, - ) + Use this to: + - Compare different scenarios with identical cluster assignments + - Apply a reference clustering to new data - # Build reduced dataset with (cluster, time) dimensions - all_keys = {(p, s) for p in periods for s in scenarios} - ds_new_vars = {} - for name, original_da in ds.data_vars.items(): - if 'time' not in original_da.dims: - ds_new_vars[name] = original_da.copy() - elif name not in typical_das or set(typical_das[name].keys()) != all_keys: - # Time-dependent but constant: reshape to (cluster, time, ...) - sliced = original_da.isel(time=slice(0, n_reduced_timesteps)) - # Get the shape - time is first, other dims follow - other_dims = [d for d in sliced.dims if d != 'time'] - other_shape = [sliced.sizes[d] for d in other_dims] - # Reshape: (n_reduced_timesteps, ...) -> (n_clusters, timesteps_per_cluster, ...) - new_shape = [actual_n_clusters, timesteps_per_cluster] + other_shape - reshaped = sliced.values.reshape(new_shape) - # Build coords - new_coords = {'cluster': cluster_coords, 'time': time_coords} - for dim in other_dims: - new_coords[dim] = sliced.coords[dim].values - ds_new_vars[name] = xr.DataArray( - reshaped, - dims=['cluster', 'time'] + other_dims, - coords=new_coords, - attrs=original_da.attrs, - ) - else: - # Time-varying: combine per-(period, scenario) slices with (cluster, time) dims - da = self._combine_slices_to_dataarray_2d( - slices=typical_das[name], - original_da=original_da, - periods=periods, - scenarios=scenarios, - ) - if TimeSeriesData.is_timeseries_data(original_da): - da = TimeSeriesData.from_dataarray(da.assign_attrs(original_da.attrs)) - ds_new_vars[name] = da + Args: + clustering: A ``Clustering`` object from a previously clustered FlowSystem. + Obtain this via ``fs.clustering`` from a clustered FlowSystem. - # Copy attrs but remove cluster_weight - the clustered FlowSystem gets its own - # cluster_weight set after from_dataset (original reference has wrong shape) - new_attrs = dict(ds.attrs) - new_attrs.pop('cluster_weight', None) - ds_new = xr.Dataset(ds_new_vars, attrs=new_attrs) + Returns: + A new FlowSystem with reduced timesteps (only typical clusters). + The FlowSystem has metadata stored in ``clustering`` for expansion. - reduced_fs = FlowSystem.from_dataset(ds_new) - # Set cluster_weight - shape (cluster,) possibly with period/scenario dimensions - reduced_fs.cluster_weight = cluster_weight + Raises: + ValueError: If the clustering dimensions don't match this FlowSystem's + periods/scenarios. 
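+            ValueError: If timestep sizes are not uniform, or if the timestep count
+                does not equal ``n_original_clusters * timesteps_per_cluster`` of the
+                given clustering.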
- # Remove 'equals_final' from storages - doesn't make sense on reduced timesteps - # Set to None so initial SOC is free (handled by storage_mode constraints) - for storage in reduced_fs.storages.values(): - ics = storage.initial_charge_state - if isinstance(ics, str) and ics == 'equals_final': - storage.initial_charge_state = None + Examples: + Apply clustering from one FlowSystem to another: - # Build Clustering for inter-cluster linking and solution expansion - n_original_timesteps = len(self._fs.timesteps) - - # Build per-slice cluster_order and timestep_mapping as multi-dimensional DataArrays - # This is needed because each (period, scenario) combination may have different clustering - - def _build_timestep_mapping_for_key(key: tuple) -> np.ndarray: - """Build timestep_mapping for a single (period, scenario) slice.""" - mapping = np.zeros(n_original_timesteps, dtype=np.int32) - for period_idx, cluster_id in enumerate(cluster_orders[key]): - for pos in range(timesteps_per_cluster): - original_idx = period_idx * timesteps_per_cluster + pos - if original_idx < n_original_timesteps: - representative_idx = cluster_id * timesteps_per_cluster + pos - mapping[original_idx] = representative_idx - return mapping - - def _build_cluster_occurrences_for_key(key: tuple) -> np.ndarray: - """Build cluster_occurrences array for a single (period, scenario) slice.""" - occurrences = cluster_occurrences_all[key] - return np.array([occurrences.get(c, 0) for c in range(actual_n_clusters)]) + >>> fs_reference = fs_base.transform.cluster(n_clusters=8, cluster_duration='1D') + >>> fs_other = fs_high.transform.apply_clustering(fs_reference.clustering) + """ + # Validation + dt = float(self._fs.timestep_duration.min().item()) + if not np.isclose(dt, float(self._fs.timestep_duration.max().item())): + raise ValueError( + f'apply_clustering() requires uniform timestep sizes, got min={dt}h, ' + f'max={float(self._fs.timestep_duration.max().item())}h.' + ) - # Build multi-dimensional arrays - if has_periods or has_scenarios: - # Multi-dimensional case: build arrays for each (period, scenario) combination - # cluster_order: dims [original_cluster, period?, scenario?] 
-            cluster_order_slices = {}
-            timestep_mapping_slices = {}
-            cluster_occurrences_slices = {}
+        # Get timesteps_per_cluster from the clustering object (survives serialization)
+        timesteps_per_cluster = clustering.timesteps_per_cluster
+        has_periods = self._fs.periods is not None
+        has_scenarios = self._fs.scenarios is not None

-            # Use renamed timesteps as coordinates for multi-dimensional case
-            original_timesteps_coord = self._fs.timesteps.rename('original_time')
+        # Determine iteration dimensions
+        periods = list(self._fs.periods) if has_periods else [None]
+        scenarios = list(self._fs.scenarios) if has_scenarios else [None]

-            for p in periods:
-                for s in scenarios:
-                    key = (p, s)
-                    cluster_order_slices[key] = xr.DataArray(
-                        cluster_orders[key], dims=['original_cluster'], name='cluster_order'
-                    )
-                    timestep_mapping_slices[key] = xr.DataArray(
-                        _build_timestep_mapping_for_key(key),
-                        dims=['original_time'],
-                        coords={'original_time': original_timesteps_coord},
-                        name='timestep_mapping',
-                    )
-                    cluster_occurrences_slices[key] = xr.DataArray(
-                        _build_cluster_occurrences_for_key(key), dims=['cluster'], name='cluster_occurrences'
-                    )
+        ds = self._fs.to_dataset(include_solution=False)

-            # Combine slices into multi-dimensional DataArrays
-            cluster_order_da = self._combine_slices_to_dataarray_generic(
-                cluster_order_slices, ['original_cluster'], periods, scenarios, 'cluster_order'
-            )
-            timestep_mapping_da = self._combine_slices_to_dataarray_generic(
-                timestep_mapping_slices, ['original_time'], periods, scenarios, 'timestep_mapping'
-            )
-            cluster_occurrences_da = self._combine_slices_to_dataarray_generic(
-                cluster_occurrences_slices, ['cluster'], periods, scenarios, 'cluster_occurrences'
-            )
-        else:
-            # Simple case: single (None, None) slice
-            cluster_order_da = xr.DataArray(cluster_orders[first_key], dims=['original_cluster'], name='cluster_order')
-            # Use renamed timesteps as coordinates
-            original_timesteps_coord = self._fs.timesteps.rename('original_time')
-            timestep_mapping_da = xr.DataArray(
-                _build_timestep_mapping_for_key(first_key),
-                dims=['original_time'],
-                coords={'original_time': original_timesteps_coord},
-                name='timestep_mapping',
-            )
-            cluster_occurrences_da = xr.DataArray(
-                _build_cluster_occurrences_for_key(first_key), dims=['cluster'], name='cluster_occurrences'
+        # Validate that timesteps match the clustering expectations
+        current_timesteps = len(self._fs.timesteps)
+        expected_timesteps = clustering.n_original_clusters * clustering.timesteps_per_cluster
+        if current_timesteps != expected_timesteps:
+            raise ValueError(
+                f'Timestep count mismatch in apply_clustering(): '
+                f'FlowSystem has {current_timesteps} timesteps, but clustering expects '
+                f'{expected_timesteps} timesteps ({clustering.n_original_clusters} clusters × '
+                f'{clustering.timesteps_per_cluster} timesteps/cluster). '
+                f'Ensure self._fs.timesteps matches the original data the clustering was computed from.'
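+                # Illustrative arithmetic (editor's hypothetical numbers): clustering
+                # 8 days of hourly data with cluster_duration='1D' gives
+                # n_original_clusters=8 × timesteps_per_cluster=24 = 192 expected timesteps.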
) - cluster_structure = ClusterStructure( - cluster_order=cluster_order_da, - cluster_occurrences=cluster_occurrences_da, - n_clusters=actual_n_clusters, - timesteps_per_cluster=timesteps_per_cluster, - ) - - # Create representative_weights with (cluster,) dimension only - # Each cluster has one weight (same for all timesteps within it) - def _build_cluster_weights_for_key(key: tuple) -> xr.DataArray: - occurrences = cluster_occurrences_all[key] - # Shape: (n_clusters,) - one weight per cluster - weights = np.array([occurrences.get(c, 1) for c in range(actual_n_clusters)]) - return xr.DataArray(weights, dims=['cluster'], name='representative_weights') + # Apply existing clustering to all (period, scenario) combinations at once + logger.info('Applying clustering...') + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*') + agg_results = clustering.results.apply(ds) - weights_slices = {key: _build_cluster_weights_for_key(key) for key in cluster_occurrences_all} - representative_weights = self._combine_slices_to_dataarray_generic( - weights_slices, ['cluster'], periods, scenarios, 'representative_weights' - ) - - aggregation_result = ClusterResult( - timestep_mapping=timestep_mapping_da, - n_representatives=n_reduced_timesteps, - representative_weights=representative_weights, - cluster_structure=cluster_structure, - original_data=ds, - aggregated_data=ds_new, - ) + # Convert AggregationResults to the dict format expected by _build_reduced_flow_system + tsam_aggregation_results: dict[tuple, Any] = {} + cluster_occurrences_all: dict[tuple, dict] = {} + clustering_metrics_all: dict[tuple, pd.DataFrame] = {} - reduced_fs.clustering = Clustering( - result=aggregation_result, - backend_name='tsam', - metrics=clustering_metrics, + for cr_key, result in agg_results: + # Convert ClusteringResults key to (period, scenario) format + if has_periods and has_scenarios: + full_key = (cr_key[0], cr_key[1]) + elif has_periods: + full_key = (cr_key[0], None) + elif has_scenarios: + full_key = (None, cr_key[0]) + else: + full_key = (None, None) + + tsam_aggregation_results[full_key] = result + cluster_occurrences_all[full_key] = result.cluster_weights + try: + clustering_metrics_all[full_key] = self._accuracy_to_dataframe(result.accuracy) + except Exception as e: + logger.warning(f'Failed to compute clustering metrics for {full_key}: {e}') + clustering_metrics_all[full_key] = pd.DataFrame() + + # Build and return the reduced FlowSystem + return self._build_reduced_flow_system( + ds=ds, + tsam_aggregation_results=tsam_aggregation_results, + cluster_occurrences_all=cluster_occurrences_all, + clustering_metrics_all=clustering_metrics_all, + timesteps_per_cluster=timesteps_per_cluster, + dt=dt, + periods=periods, + scenarios=scenarios, ) - return reduced_fs - - @staticmethod - def _combine_slices_to_dataarray( - slices: dict[tuple, xr.DataArray], - original_da: xr.DataArray, - new_time_index: pd.DatetimeIndex, - periods: list, - scenarios: list, - ) -> xr.DataArray: - """Combine per-(period, scenario) slices into a multi-dimensional DataArray using xr.concat. - - Args: - slices: Dict mapping (period, scenario) tuples to 1D DataArrays (time only). - original_da: Original DataArray to get dimension order and attrs from. - new_time_index: New time coordinate for the output. - periods: List of period labels ([None] if no periods dimension). - scenarios: List of scenario labels ([None] if no scenarios dimension). 
- - Returns: - DataArray with dimensions matching original_da but reduced time. - """ - first_key = (periods[0], scenarios[0]) - has_periods = periods != [None] - has_scenarios = scenarios != [None] - - # Simple case: no period/scenario dimensions - if not has_periods and not has_scenarios: - return slices[first_key].assign_attrs(original_da.attrs) - - # Multi-dimensional: use xr.concat to stack along period/scenario dims - if has_periods and has_scenarios: - # Stack scenarios first, then periods - period_arrays = [] - for p in periods: - scenario_arrays = [slices[(p, s)] for s in scenarios] - period_arrays.append(xr.concat(scenario_arrays, dim=pd.Index(scenarios, name='scenario'))) - result = xr.concat(period_arrays, dim=pd.Index(periods, name='period')) - elif has_periods: - result = xr.concat([slices[(p, None)] for p in periods], dim=pd.Index(periods, name='period')) - else: - result = xr.concat([slices[(None, s)] for s in scenarios], dim=pd.Index(scenarios, name='scenario')) - - # Put time dimension first (standard order), preserve other dims - result = result.transpose('time', ...) - - return result.assign_attrs(original_da.attrs) - @staticmethod def _combine_slices_to_dataarray_generic( slices: dict[tuple, xr.DataArray], @@ -1208,7 +1727,7 @@ def _combine_slices_to_dataarray_generic( @staticmethod def _combine_slices_to_dataarray_2d( slices: dict[tuple, xr.DataArray], - original_da: xr.DataArray, + attrs: dict, periods: list, scenarios: list, ) -> xr.DataArray: @@ -1216,7 +1735,7 @@ def _combine_slices_to_dataarray_2d( Args: slices: Dict mapping (period, scenario) tuples to DataArrays with (cluster, time) dims. - original_da: Original DataArray to get attrs from. + attrs: Attributes to assign to the result. periods: List of period labels ([None] if no periods dimension). scenarios: List of scenario labels ([None] if no scenarios dimension). @@ -1229,7 +1748,7 @@ def _combine_slices_to_dataarray_2d( # Simple case: no period/scenario dimensions if not has_periods and not has_scenarios: - return slices[first_key].assign_attrs(original_da.attrs) + return slices[first_key].assign_attrs(attrs) # Multi-dimensional: use xr.concat to stack along period/scenario dims if has_periods and has_scenarios: @@ -1247,17 +1766,18 @@ def _combine_slices_to_dataarray_2d( # Put cluster and time first (standard order for clustered data) result = result.transpose('cluster', 'time', ...) - return result.assign_attrs(original_da.attrs) + return result.assign_attrs(attrs) - def _validate_for_expansion(self) -> tuple: + def _validate_for_expansion(self) -> Clustering: """Validate FlowSystem can be expanded and return clustering info. Returns: - Tuple of (clustering, cluster_structure). + The Clustering object. Raises: ValueError: If FlowSystem wasn't created with cluster() or has no solution. """ + if self._fs.clustering is None: raise ValueError( 'expand() requires a FlowSystem created with cluster(). This FlowSystem has no aggregation info.' @@ -1265,17 +1785,13 @@ def _validate_for_expansion(self) -> tuple: if self._fs.solution is None: raise ValueError('FlowSystem has no solution. 
Run optimize() or solve() first.') - cluster_structure = self._fs.clustering.result.cluster_structure - if cluster_structure is None: - raise ValueError('No cluster structure available for expansion.') - - return self._fs.clustering, cluster_structure + return self._fs.clustering def _combine_intercluster_charge_states( self, expanded_fs: FlowSystem, reduced_solution: xr.Dataset, - cluster_structure, + clustering: Clustering, original_timesteps_extra: pd.DatetimeIndex, timesteps_per_cluster: int, n_original_clusters: int, @@ -1290,13 +1806,13 @@ def _combine_intercluster_charge_states( Args: expanded_fs: The expanded FlowSystem (modified in-place). reduced_solution: The original reduced solution dataset. - cluster_structure: ClusterStructure with cluster order info. + clustering: Clustering with cluster order info. original_timesteps_extra: Original timesteps including the extra final timestep. timesteps_per_cluster: Number of timesteps per cluster. n_original_clusters: Number of original clusters before aggregation. """ n_original_timesteps_extra = len(original_timesteps_extra) - soc_boundary_vars = [name for name in reduced_solution.data_vars if name.endswith('|SOC_boundary')] + soc_boundary_vars = self._fs.get_variables_by_category(VariableCategory.SOC_BOUNDARY) for soc_boundary_name in soc_boundary_vars: storage_name = soc_boundary_name.rsplit('|', 1)[0] @@ -1322,7 +1838,7 @@ def _combine_intercluster_charge_states( soc_boundary_per_timestep = self._apply_soc_decay( soc_boundary_per_timestep, storage_name, - cluster_structure, + clustering, original_timesteps_extra, original_cluster_indices, timesteps_per_cluster, @@ -1343,7 +1859,7 @@ def _apply_soc_decay( self, soc_boundary_per_timestep: xr.DataArray, storage_name: str, - cluster_structure, + clustering: Clustering, original_timesteps_extra: pd.DatetimeIndex, original_cluster_indices: np.ndarray, timesteps_per_cluster: int, @@ -1353,7 +1869,7 @@ def _apply_soc_decay( Args: soc_boundary_per_timestep: SOC boundary values mapped to each timestep. storage_name: Name of the storage component. - cluster_structure: ClusterStructure with cluster order info. + clustering: Clustering with cluster order info. original_timesteps_extra: Original timesteps including final extra timestep. original_cluster_indices: Mapping of timesteps to original cluster indices. timesteps_per_cluster: Number of timesteps per cluster. @@ -1383,21 +1899,126 @@ def _apply_soc_decay( # Handle cluster dimension if present if 'cluster' in decay_da.dims: - cluster_order = cluster_structure.cluster_order - if cluster_order.ndim == 1: + cluster_assignments = clustering.cluster_assignments + if cluster_assignments.ndim == 1: cluster_per_timestep = xr.DataArray( - cluster_order.values[original_cluster_indices], + cluster_assignments.values[original_cluster_indices], dims=['time'], coords={'time': original_timesteps_extra}, ) else: - cluster_per_timestep = cluster_order.isel( + cluster_per_timestep = cluster_assignments.isel( original_cluster=xr.DataArray(original_cluster_indices, dims=['time']) ).assign_coords(time=original_timesteps_extra) decay_da = decay_da.isel(cluster=cluster_per_timestep).drop_vars('cluster', errors='ignore') return soc_boundary_per_timestep * decay_da + def _build_segment_total_varnames(self) -> set[str]: + """Build segment total variable names - BACKWARDS COMPATIBILITY FALLBACK. + + This method is only used when variable_categories is empty (old FlowSystems + saved before category registration was implemented). 
New FlowSystems use + the VariableCategory registry with EXPAND_DIVIDE categories (PER_TIMESTEP, SHARE). + + For segmented systems, these variables contain values that are summed over + segments. When expanded to hourly resolution, they need to be divided by + segment duration to get correct hourly rates. + + Returns: + Set of variable names that should be divided by expansion divisor. + """ + segment_total_vars: set[str] = set() + + # Get all effect names + effect_names = list(self._fs.effects.keys()) + + # 1. Per-timestep totals for each effect: {effect}(temporal)|per_timestep + for effect in effect_names: + segment_total_vars.add(f'{effect}(temporal)|per_timestep') + + # 2. Flow contributions to effects: {flow}->{effect}(temporal) + # (from effects_per_flow_hour on Flow elements) + for flow_label in self._fs.flows: + for effect in effect_names: + segment_total_vars.add(f'{flow_label}->{effect}(temporal)') + + # 3. Component contributions to effects: {component}->{effect}(temporal) + # (from effects_per_startup, effects_per_active_hour on OnOffParameters) + for component_label in self._fs.components: + for effect in effect_names: + segment_total_vars.add(f'{component_label}->{effect}(temporal)') + + # 4. Effect-to-effect contributions (from share_from_temporal) + # {source_effect}(temporal)->{target_effect}(temporal) + for target_effect_name, target_effect in self._fs.effects.items(): + if target_effect.share_from_temporal: + for source_effect_name in target_effect.share_from_temporal: + segment_total_vars.add(f'{source_effect_name}(temporal)->{target_effect_name}(temporal)') + + return segment_total_vars + + def _interpolate_charge_state_segmented( + self, + da: xr.DataArray, + clustering: Clustering, + original_timesteps: pd.DatetimeIndex, + ) -> xr.DataArray: + """Interpolate charge_state values within segments for segmented systems. + + For segmented systems, charge_state has values at segment boundaries (n_segments+1). + Instead of repeating the start boundary value for all timesteps in a segment, + this method interpolates between start and end boundary values to show the + actual charge trajectory as the storage charges/discharges. + + Uses vectorized xarray operations via Clustering class properties. + + Args: + da: charge_state DataArray with dims (cluster, time) where time has n_segments+1 entries. + clustering: Clustering object with segment info. + original_timesteps: Original timesteps to expand to. + + Returns: + Interpolated charge_state with dims (time, ...) for original timesteps. 
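+        Example:
+            Illustrative numbers (editor's sketch, not from the codebase): a segment
+            of duration 4 with boundary values start=10.0 and end=2.0 gives factors
+            (0.5, 1.5, 2.5, 3.5) / 4 at positions 0..3, hence interpolated values
+            9.0, 7.0, 5.0, 3.0 - a linear ramp sampled at the midpoint of each
+            underlying timestep within the segment.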
+ """ + # Get multi-dimensional properties from Clustering + timestep_mapping = clustering.timestep_mapping + segment_assignments = clustering.results.segment_assignments + segment_durations = clustering.results.segment_durations + position_within_segment = clustering.results.position_within_segment + + # Decode timestep_mapping into cluster and time indices + # timestep_mapping encodes original timestep -> (cluster, position_within_cluster) + # where position_within_cluster indexes into segment_assignments/position_within_segment + # which have shape (cluster, timesteps_per_cluster) + timesteps_per_cluster = clustering.timesteps_per_cluster + cluster_indices = timestep_mapping // timesteps_per_cluster + time_indices = timestep_mapping % timesteps_per_cluster + + # Get segment index and position for each original timestep + seg_indices = segment_assignments.isel(cluster=cluster_indices, time=time_indices) + positions = position_within_segment.isel(cluster=cluster_indices, time=time_indices) + durations = segment_durations.isel(cluster=cluster_indices, segment=seg_indices) + + # Calculate interpolation factor: position within segment (0 to 1) + # At position=0, factor=0.5/duration (start of segment) + # At position=duration-1, factor approaches 1 (end of segment) + factor = xr.where(durations > 1, (positions + 0.5) / durations, 0.5) + + # Get start and end boundary values from charge_state + # charge_state has dims (cluster, time) where time = segment boundaries (n_segments+1) + start_vals = da.isel(cluster=cluster_indices, time=seg_indices) + end_vals = da.isel(cluster=cluster_indices, time=seg_indices + 1) + + # Linear interpolation + interpolated = start_vals + (end_vals - start_vals) * factor + + # Clean up coordinate artifacts and rename + interpolated = interpolated.drop_vars(['cluster', 'time', 'segment'], errors='ignore') + interpolated = interpolated.rename({'original_time': 'time'}).assign_coords(time=original_timesteps) + + return interpolated.transpose('time', ...).assign_attrs(da.attrs) + def expand(self) -> FlowSystem: """Expand a clustered FlowSystem back to full original timesteps. @@ -1448,22 +2069,67 @@ def expand(self) -> FlowSystem: For accurate dispatch results, use ``fix_sizes()`` to fix the sizes from the reduced optimization and re-optimize at full resolution. + + **Segmented Systems Variable Handling:** + + For systems clustered with ``SegmentConfig``, special handling is applied + to time-varying solution variables. Variables without a ``time`` dimension + are unaffected by segment expansion. This includes: + + - Investment: ``{component}|size``, ``{component}|exists`` + - Storage boundaries: ``{storage}|SOC_boundary`` + - Aggregated totals: ``{flow}|total_flow_hours``, ``{flow}|active_hours`` + - Effect totals: ``{effect}``, ``{effect}(temporal)``, ``{effect}(periodic)`` + + Time-varying variables are categorized and handled as follows: + + 1. **State variables** - Interpolated within segments: + + - ``{storage}|charge_state``: Linear interpolation between segment + boundary values to show the charge trajectory during charge/discharge. + + 2. **Segment totals** - Divided by segment duration: + + These variables represent values summed over the segment. Division + converts them back to hourly rates for correct plotting and analysis. 
+ + - ``{effect}(temporal)|per_timestep``: Per-timestep effect contributions + - ``{flow}->{effect}(temporal)``: Flow contributions (includes both + ``effects_per_flow_hour`` and ``effects_per_startup``) + - ``{component}->{effect}(temporal)``: Component-level contributions + - ``{source}(temporal)->{target}(temporal)``: Effect-to-effect shares + + 3. **Rate/average variables** - Expanded as-is: + + These variables represent average values within the segment. tsam + already provides properly averaged values, so no correction needed. + + - ``{flow}|flow_rate``: Average flow rate during segment + - ``{storage}|netto_discharge``: Net discharge rate (discharge - charge) + + 4. **Binary status variables** - Constant within segment: + + These variables cannot be meaningfully interpolated. They indicate + the dominant state or whether an event occurred during the segment. + + - ``{flow}|status``: On/off status (0 or 1) + - ``{flow}|startup``: Startup event occurred in segment + - ``{flow}|shutdown``: Shutdown event occurred in segment """ from .flow_system import FlowSystem # Validate and extract clustering info - info, cluster_structure = self._validate_for_expansion() + clustering = self._validate_for_expansion() - timesteps_per_cluster = cluster_structure.timesteps_per_cluster - n_clusters = ( - int(cluster_structure.n_clusters) - if isinstance(cluster_structure.n_clusters, (int, np.integer)) - else int(cluster_structure.n_clusters.values) - ) - n_original_clusters = cluster_structure.n_original_clusters + timesteps_per_cluster = clustering.timesteps_per_cluster + # For segmented systems, the time dimension has n_segments entries + n_segments = clustering.n_segments + time_dim_size = n_segments if n_segments is not None else timesteps_per_cluster + n_clusters = clustering.n_clusters + n_original_clusters = clustering.n_original_clusters # Get original timesteps and dimensions - original_timesteps = info.original_timesteps + original_timesteps = clustering.original_timesteps n_original_timesteps = len(original_timesteps) original_timesteps_extra = FlowSystem._create_timesteps_with_extra(original_timesteps, None) @@ -1473,35 +2139,85 @@ def expand(self) -> FlowSystem: n_original_clusters - 1, ) - def expand_da(da: xr.DataArray, var_name: str = '') -> xr.DataArray: + # For segmented systems: build expansion divisor and identify segment total variables + expansion_divisor = None + segment_total_vars: set[str] = set() + variable_categories = getattr(self._fs, '_variable_categories', {}) + if clustering.is_segmented: + expansion_divisor = clustering.build_expansion_divisor(original_time=original_timesteps) + # Build segment total vars using registry first, fall back to pattern matching + segment_total_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_DIVIDE} + # Fall back to pattern matching for backwards compatibility (old FlowSystems without categories) + if not segment_total_vars: + segment_total_vars = self._build_segment_total_varnames() + + def _is_state_variable(var_name: str) -> bool: + """Check if a variable is a state variable (should be interpolated).""" + if var_name in variable_categories: + return variable_categories[var_name] in EXPAND_INTERPOLATE + # Fall back to pattern matching for backwards compatibility + return var_name.endswith('|charge_state') + + def _append_final_state(expanded: xr.DataArray, da: xr.DataArray) -> xr.DataArray: + """Append final state value from original data to expanded data.""" + cluster_assignments = clustering.cluster_assignments 
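+            # The appended value is the final charge state of whichever typical
+            # cluster the LAST original cluster maps to; when period/scenario
+            # dimensions are present, the lookup is vectorized via isel below.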
+ if cluster_assignments.ndim == 1: + last_cluster = int(cluster_assignments.values[last_original_cluster_idx]) + extra_val = da.isel(cluster=last_cluster, time=-1) + else: + last_clusters = cluster_assignments.isel(original_cluster=last_original_cluster_idx) + extra_val = da.isel(cluster=last_clusters, time=-1) + extra_val = extra_val.drop_vars(['cluster', 'time'], errors='ignore') + extra_val = extra_val.expand_dims(time=[original_timesteps_extra[-1]]) + return xr.concat([expanded, extra_val], dim='time') + + def expand_da(da: xr.DataArray, var_name: str = '', is_solution: bool = False) -> xr.DataArray: """Expand a DataArray from clustered to original timesteps.""" if 'time' not in da.dims: return da.copy() - expanded = info.result.expand_data(da, original_time=original_timesteps) - - # For charge_state with cluster dim, append the extra timestep value - if var_name.endswith('|charge_state') and 'cluster' in da.dims: - cluster_order = cluster_structure.cluster_order - if cluster_order.ndim == 1: - last_cluster = int(cluster_order[last_original_cluster_idx]) - extra_val = da.isel(cluster=last_cluster, time=-1) - else: - last_clusters = cluster_order.isel(original_cluster=last_original_cluster_idx) - extra_val = da.isel(cluster=last_clusters, time=-1) - extra_val = extra_val.drop_vars(['cluster', 'time'], errors='ignore') - extra_val = extra_val.expand_dims(time=[original_timesteps_extra[-1]]) - expanded = xr.concat([expanded, extra_val], dim='time') + + is_state = _is_state_variable(var_name) and 'cluster' in da.dims + + # State variables in segmented systems: interpolate within segments + if is_state and clustering.is_segmented: + expanded = self._interpolate_charge_state_segmented(da, clustering, original_timesteps) + return _append_final_state(expanded, da) + + expanded = clustering.expand_data(da, original_time=original_timesteps) + + # Segment totals: divide by expansion divisor + if is_solution and expansion_divisor is not None and var_name in segment_total_vars: + expanded = expanded / expansion_divisor + + # State variables: append final state + if is_state: + expanded = _append_final_state(expanded, da) return expanded + # Helper to construct DataArray without slow _construct_dataarray + def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: + variable = ds.variables[name] + var_dims = set(variable.dims) + coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} + return xr.DataArray(variable, coords=coords, name=name) + # 1. 
Expand FlowSystem data reduced_ds = self._fs.to_dataset(include_solution=False) clustering_attrs = {'is_clustered', 'n_clusters', 'timesteps_per_cluster', 'clustering', 'cluster_weight'} - data_vars = { - name: expand_da(da, name) - for name, da in reduced_ds.data_vars.items() - if name != 'cluster_weight' and not name.startswith('clustering|') - } + skip_vars = {'cluster_weight', 'timestep_duration'} # These have special handling + data_vars = {} + # Use ds.variables pattern to avoid slow _construct_dataarray calls + coord_cache = {k: v for k, v in reduced_ds.coords.items()} + for name in reduced_ds.data_vars: + if name in skip_vars or name.startswith('clustering|'): + continue + da = _fast_get_da(reduced_ds, name, coord_cache) + # Skip vars with cluster dim but no time dim - they don't make sense after expansion + # (e.g., representative_weights with dims ('cluster',) or ('cluster', 'period')) + if 'cluster' in da.dims and 'time' not in da.dims: + continue + data_vars[name] = expand_da(da, name) attrs = {k: v for k, v in reduced_ds.attrs.items() if k not in clustering_attrs} expanded_ds = xr.Dataset(data_vars, attrs=attrs) @@ -1511,19 +2227,22 @@ def expand_da(da: xr.DataArray, var_name: str = '') -> xr.DataArray: expanded_fs = FlowSystem.from_dataset(expanded_ds) - # 2. Expand solution + # 2. Expand solution (with segment total correction for segmented systems) reduced_solution = self._fs.solution - expanded_fs._solution = xr.Dataset( - {name: expand_da(da, name) for name, da in reduced_solution.data_vars.items()}, - attrs=reduced_solution.attrs, - ) + # Use ds.variables pattern to avoid slow _construct_dataarray calls + sol_coord_cache = {k: v for k, v in reduced_solution.coords.items()} + expanded_sol_vars = {} + for name in reduced_solution.data_vars: + da = _fast_get_da(reduced_solution, name, sol_coord_cache) + expanded_sol_vars[name] = expand_da(da, name, is_solution=True) + expanded_fs._solution = xr.Dataset(expanded_sol_vars, attrs=reduced_solution.attrs) expanded_fs._solution = expanded_fs._solution.reindex(time=original_timesteps_extra) # 3. Combine charge_state with SOC_boundary for intercluster storages self._combine_intercluster_charge_states( expanded_fs, reduced_solution, - cluster_structure, + clustering, original_timesteps_extra, timesteps_per_cluster, n_original_clusters, @@ -1535,10 +2254,11 @@ def expand_da(da: xr.DataArray, var_name: str = '') -> xr.DataArray: n_combinations = (len(self._fs.periods) if has_periods else 1) * ( len(self._fs.scenarios) if has_scenarios else 1 ) - n_reduced_timesteps = n_clusters * timesteps_per_cluster + n_reduced_timesteps = n_clusters * time_dim_size + segmented_info = f' ({n_segments} segments)' if n_segments else '' logger.info( f'Expanded FlowSystem from {n_reduced_timesteps} to {n_original_timesteps} timesteps ' - f'({n_clusters} clusters' + f'({n_clusters} clusters{segmented_info}' + ( f', {n_combinations} period/scenario combinations)' if n_combinations > 1 diff --git a/pyproject.toml b/pyproject.toml index d85ed29c0..d1dec9ea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ network_viz = [ # Full feature set (everything except dev tools) full = [ "pyvis==0.3.2", # Visualizing FlowSystem Network - "tsam >= 2.3.1, < 3", # Time series aggregation + "tsam @ git+https://github.com/FBumann/tsam.git@v3-rebased", # Time series aggregation (unreleased) "scipy >= 1.15.1, < 2", # Used by tsam. Prior versions have conflict with highspy. 
See https://github.com/scipy/scipy/issues/22257 "gurobipy >= 10.0.0, < 14; python_version < '3.14'", # No Python 3.14 wheels yet (expected Q1 2026) "dash >= 3.0.0, < 4", # Visualizing FlowSystem Network as app @@ -83,7 +83,7 @@ dev = [ "ruff==0.14.10", "pre-commit==4.3.0", "pyvis==0.3.2", - "tsam==2.3.9", + "tsam @ git+https://github.com/FBumann/tsam.git@v3-rebased", "scipy==1.16.3", # 1.16.1+ required for Python 3.14 wheels "gurobipy==12.0.3; python_version < '3.14'", # No Python 3.14 wheels yet "dash==3.3.0", diff --git a/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py b/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py index 1f2e13906..b174b5141 100644 --- a/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py +++ b/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py @@ -190,20 +190,24 @@ def get_solutions(optimizations: list, variable: str) -> xr.Dataset: optimizations.append(optimization) if aggregated: - # Use the new transform.cluster() API - # Note: time_series_for_high_peaks/low_peaks expect string labels matching dataset variables - time_series_for_high_peaks = ['Wärmelast(Q_th_Last)|fixed_relative_profile'] if keep_extreme_periods else None - time_series_for_low_peaks = ( - ['Stromlast(P_el_Last)|fixed_relative_profile', 'Wärmelast(Q_th_Last)|fixed_relative_profile'] - if keep_extreme_periods - else None - ) + # Use the transform.cluster() API with tsam 3.0 + from tsam.config import ExtremeConfig + + extremes = None + if keep_extreme_periods: + extremes = ExtremeConfig( + method='new_cluster', + max_value=['Wärmelast(Q_th_Last)|fixed_relative_profile'], + min_value=[ + 'Stromlast(P_el_Last)|fixed_relative_profile', + 'Wärmelast(Q_th_Last)|fixed_relative_profile', + ], + ) clustered_fs = flow_system.copy().transform.cluster( n_clusters=n_clusters, cluster_duration=cluster_duration, - time_series_for_high_peaks=time_series_for_high_peaks, - time_series_for_low_peaks=time_series_for_low_peaks, + extremes=extremes, ) t_start = timeit.default_timer() clustered_fs.optimize(fx.solvers.HighsSolver(0.01 / 100, 60)) diff --git a/tests/test_cluster_reduce_expand.py b/tests/test_cluster_reduce_expand.py index f09977e7b..b4900b3c9 100644 --- a/tests/test_cluster_reduce_expand.py +++ b/tests/test_cluster_reduce_expand.py @@ -62,7 +62,7 @@ def test_cluster_creates_reduced_timesteps(timesteps_8_days): assert len(fs_reduced.clusters) == 2 # Number of clusters assert len(fs_reduced.timesteps) * len(fs_reduced.clusters) == 48 # Total assert hasattr(fs_reduced, 'clustering') - assert fs_reduced.clustering.result.cluster_structure.n_clusters == 2 + assert fs_reduced.clustering.n_clusters == 2 def test_expand_restores_full_timesteps(solver_fixture, timesteps_8_days): @@ -120,10 +120,10 @@ def test_expand_maps_values_correctly(solver_fixture, timesteps_8_days): ) fs_reduced.optimize(solver_fixture) - # Get cluster_order to know mapping + # Get cluster_assignments to know mapping info = fs_reduced.clustering - cluster_order = info.result.cluster_structure.cluster_order.values - timesteps_per_cluster = info.result.cluster_structure.timesteps_per_cluster # 24 + cluster_assignments = info.cluster_assignments.values + timesteps_per_cluster = info.timesteps_per_cluster # 24 reduced_flow = fs_reduced.solution['Boiler(Q_th)|flow_rate'].values @@ -132,7 +132,7 @@ def test_expand_maps_values_correctly(solver_fixture, timesteps_8_days): # Check that values are correctly mapped # For each original segment, 
values should match the corresponding typical cluster - for orig_segment_idx, cluster_id in enumerate(cluster_order): + for orig_segment_idx, cluster_id in enumerate(cluster_assignments): orig_start = orig_segment_idx * timesteps_per_cluster orig_end = orig_start + timesteps_per_cluster @@ -291,8 +291,7 @@ def test_cluster_with_scenarios(timesteps_8_days, scenarios_2): # Should have aggregation info with cluster structure info = fs_reduced.clustering assert info is not None - assert info.result.cluster_structure is not None - assert info.result.cluster_structure.n_clusters == 2 + assert info.n_clusters == 2 # Clustered FlowSystem preserves scenarios assert fs_reduced.scenarios is not None assert len(fs_reduced.scenarios) == 2 @@ -336,23 +335,22 @@ def test_expand_maps_scenarios_independently(solver_fixture, timesteps_8_days, s fs_reduced.optimize(solver_fixture) info = fs_reduced.clustering - cluster_structure = info.result.cluster_structure - timesteps_per_cluster = cluster_structure.timesteps_per_cluster # 24 + timesteps_per_cluster = info.timesteps_per_cluster # 24 reduced_flow = fs_reduced.solution['Boiler(Q_th)|flow_rate'] fs_expanded = fs_reduced.transform.expand() expanded_flow = fs_expanded.solution['Boiler(Q_th)|flow_rate'] - # Check mapping for each scenario using its own cluster_order + # Check mapping for each scenario using its own cluster_assignments for scenario in scenarios_2: - # Get the cluster_order for THIS scenario - cluster_order = cluster_structure.get_cluster_order_for_slice(scenario=scenario) + # Get the cluster_assignments for THIS scenario + cluster_assignments = info.cluster_assignments.sel(scenario=scenario).values reduced_scenario = reduced_flow.sel(scenario=scenario).values expanded_scenario = expanded_flow.sel(scenario=scenario).values - # Verify mapping is correct for this scenario using its own cluster_order - for orig_segment_idx, cluster_id in enumerate(cluster_order): + # Verify mapping is correct for this scenario using its own cluster_assignments + for orig_segment_idx, cluster_id in enumerate(cluster_assignments): orig_start = orig_segment_idx * timesteps_per_cluster orig_end = orig_start + timesteps_per_cluster @@ -451,7 +449,7 @@ def test_storage_cluster_mode_intercluster(self, solver_fixture, timesteps_8_day assert 'cluster_boundary' in soc_boundary.dims # Number of boundaries = n_original_clusters + 1 - n_original_clusters = fs_clustered.clustering.result.cluster_structure.n_original_clusters + n_original_clusters = fs_clustered.clustering.n_original_clusters assert soc_boundary.sizes['cluster_boundary'] == n_original_clusters + 1 def test_storage_cluster_mode_intercluster_cyclic(self, solver_fixture, timesteps_8_days): @@ -535,16 +533,16 @@ def test_expanded_charge_state_matches_manual_calculation(self, solver_fixture, # Get values needed for manual calculation soc_boundary = fs_clustered.solution['Battery|SOC_boundary'] cs_clustered = fs_clustered.solution['Battery|charge_state'] - cluster_structure = fs_clustered.clustering.result.cluster_structure - cluster_order = cluster_structure.cluster_order.values - timesteps_per_cluster = cluster_structure.timesteps_per_cluster + clustering = fs_clustered.clustering + cluster_assignments = clustering.cluster_assignments.values + timesteps_per_cluster = clustering.timesteps_per_cluster fs_expanded = fs_clustered.transform.expand() cs_expanded = fs_expanded.solution['Battery|charge_state'] # Manual verification for first few timesteps of first period p = 0 # First period - cluster = int(cluster_order[p]) 
+ cluster = int(cluster_assignments[p]) soc_b = soc_boundary.isel(cluster_boundary=p).item() for t in [0, 5, 12, 23]: @@ -767,46 +765,52 @@ def create_system_with_peak_demand(timesteps: pd.DatetimeIndex) -> fx.FlowSystem class TestPeakSelection: - """Tests for time_series_for_high_peaks and time_series_for_low_peaks parameters.""" + """Tests for extremes config with max_value and min_value parameters.""" + + def test_extremes_max_value_parameter_accepted(self, timesteps_8_days): + """Verify extremes max_value parameter is accepted.""" + from tsam.config import ExtremeConfig - def test_time_series_for_high_peaks_parameter_accepted(self, timesteps_8_days): - """Verify time_series_for_high_peaks parameter is accepted.""" fs = create_system_with_peak_demand(timesteps_8_days) # Should not raise an error fs_clustered = fs.transform.cluster( n_clusters=2, cluster_duration='1D', - time_series_for_high_peaks=['HeatDemand(Q)|fixed_relative_profile'], + extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q)|fixed_relative_profile']), ) assert fs_clustered is not None assert len(fs_clustered.clusters) == 2 - def test_time_series_for_low_peaks_parameter_accepted(self, timesteps_8_days): - """Verify time_series_for_low_peaks parameter is accepted.""" + def test_extremes_min_value_parameter_accepted(self, timesteps_8_days): + """Verify extremes min_value parameter is accepted.""" + from tsam.config import ExtremeConfig + fs = create_system_with_peak_demand(timesteps_8_days) # Should not raise an error - # Note: tsam requires n_clusters >= 3 when using low_peaks to avoid index error + # Note: tsam requires n_clusters >= 3 when using min_value to avoid index error fs_clustered = fs.transform.cluster( n_clusters=3, cluster_duration='1D', - time_series_for_low_peaks=['HeatDemand(Q)|fixed_relative_profile'], + extremes=ExtremeConfig(method='new_cluster', min_value=['HeatDemand(Q)|fixed_relative_profile']), ) assert fs_clustered is not None assert len(fs_clustered.clusters) == 3 - def test_high_peaks_captures_extreme_demand_day(self, solver_fixture, timesteps_8_days): - """Verify high peak selection captures day with maximum demand.""" + def test_extremes_captures_extreme_demand_day(self, solver_fixture, timesteps_8_days): + """Verify extremes config captures day with maximum demand.""" + from tsam.config import ExtremeConfig + fs = create_system_with_peak_demand(timesteps_8_days) - # Cluster WITH high peak selection + # Cluster WITH extremes config fs_with_peaks = fs.transform.cluster( n_clusters=2, cluster_duration='1D', - time_series_for_high_peaks=['HeatDemand(Q)|fixed_relative_profile'], + extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q)|fixed_relative_profile']), ) fs_with_peaks.optimize(solver_fixture) @@ -818,18 +822,702 @@ def test_high_peaks_captures_extreme_demand_day(self, solver_fixture, timesteps_ max_flow = float(flow_rates.max()) assert max_flow >= 49, f'Peak demand not captured: max_flow={max_flow}' - def test_clustering_without_peaks_may_miss_extremes(self, solver_fixture, timesteps_8_days): - """Show that without peak selection, extreme days might be averaged out.""" + def test_clustering_without_extremes_may_miss_peaks(self, solver_fixture, timesteps_8_days): + """Show that without extremes config, extreme days might be averaged out.""" fs = create_system_with_peak_demand(timesteps_8_days) - # Cluster WITHOUT high peak selection (may or may not capture peak) + # Cluster WITHOUT extremes config (may or may not capture peak) fs_no_peaks = 
fs.transform.cluster( n_clusters=2, cluster_duration='1D', - # No time_series_for_high_peaks + # No extremes config ) fs_no_peaks.optimize(solver_fixture) # This test just verifies the clustering works # The peak may or may not be captured depending on clustering algorithm assert fs_no_peaks.solution is not None + + +# ==================== Data Vars Parameter Tests ==================== + + +class TestDataVarsParameter: + """Tests for data_vars parameter in cluster() method.""" + + def test_cluster_with_data_vars_subset(self, timesteps_8_days): + """Test clustering with a subset of variables.""" + # Create system with multiple time-varying data + hours = len(timesteps_8_days) + demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 + price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 # Different pattern + + fs = fx.FlowSystem(timesteps_8_days) + fs.add_elements( + fx.Bus('Heat'), + fx.Bus('Gas'), + fx.Effect('costs', '€', is_standard=True, is_objective=True), + fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), + fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), + fx.linear_converters.Boiler( + 'Boiler', + thermal_efficiency=0.9, + fuel_flow=fx.Flow('Q_fu', bus='Gas'), + thermal_flow=fx.Flow('Q_th', bus='Heat'), + ), + ) + + # Cluster based only on demand profile (not price) + fs_reduced = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + data_vars=['HeatDemand(Q)|fixed_relative_profile'], + ) + + # Should have clustered structure + assert len(fs_reduced.timesteps) == 24 + assert len(fs_reduced.clusters) == 2 + + def test_data_vars_validation_error(self, timesteps_8_days): + """Test that invalid data_vars raises ValueError.""" + fs = create_simple_system(timesteps_8_days) + + with pytest.raises(ValueError, match='data_vars not found'): + fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + data_vars=['NonExistentVariable'], + ) + + def test_data_vars_preserves_all_flowsystem_data(self, timesteps_8_days): + """Test that clustering with data_vars preserves all FlowSystem variables.""" + # Create system with multiple time-varying data + hours = len(timesteps_8_days) + demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 + price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 + + fs = fx.FlowSystem(timesteps_8_days) + fs.add_elements( + fx.Bus('Heat'), + fx.Bus('Gas'), + fx.Effect('costs', '€', is_standard=True, is_objective=True), + fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), + fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), + fx.linear_converters.Boiler( + 'Boiler', + thermal_efficiency=0.9, + fuel_flow=fx.Flow('Q_fu', bus='Gas'), + thermal_flow=fx.Flow('Q_th', bus='Heat'), + ), + ) + + # Cluster based only on demand profile + fs_reduced = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + data_vars=['HeatDemand(Q)|fixed_relative_profile'], + ) + + # Both demand and price should be preserved in the reduced FlowSystem + ds = fs_reduced.to_dataset() + assert 'HeatDemand(Q)|fixed_relative_profile' in ds.data_vars + assert 'GasSource(Gas)|costs|per_flow_hour' in ds.data_vars + + def test_data_vars_optimization_works(self, solver_fixture, timesteps_8_days): + """Test that FlowSystem clustered with data_vars can be optimized.""" + hours = len(timesteps_8_days) + demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 + price = 
np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 + + fs = fx.FlowSystem(timesteps_8_days) + fs.add_elements( + fx.Bus('Heat'), + fx.Bus('Gas'), + fx.Effect('costs', '€', is_standard=True, is_objective=True), + fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), + fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), + fx.linear_converters.Boiler( + 'Boiler', + thermal_efficiency=0.9, + fuel_flow=fx.Flow('Q_fu', bus='Gas'), + thermal_flow=fx.Flow('Q_th', bus='Heat'), + ), + ) + + fs_reduced = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + data_vars=['HeatDemand(Q)|fixed_relative_profile'], + ) + + # Should optimize successfully + fs_reduced.optimize(solver_fixture) + assert fs_reduced.solution is not None + assert 'Boiler(Q_th)|flow_rate' in fs_reduced.solution + + def test_data_vars_with_multiple_variables(self, timesteps_8_days): + """Test clustering with multiple selected variables.""" + hours = len(timesteps_8_days) + demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 + price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 + + fs = fx.FlowSystem(timesteps_8_days) + fs.add_elements( + fx.Bus('Heat'), + fx.Bus('Gas'), + fx.Effect('costs', '€', is_standard=True, is_objective=True), + fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), + fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), + fx.linear_converters.Boiler( + 'Boiler', + thermal_efficiency=0.9, + fuel_flow=fx.Flow('Q_fu', bus='Gas'), + thermal_flow=fx.Flow('Q_th', bus='Heat'), + ), + ) + + # Cluster based on both demand and price + fs_reduced = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + data_vars=[ + 'HeatDemand(Q)|fixed_relative_profile', + 'GasSource(Gas)|costs|per_flow_hour', + ], + ) + + assert len(fs_reduced.timesteps) == 24 + assert len(fs_reduced.clusters) == 2 + + +# ==================== Segmentation Tests ==================== + + +class TestSegmentation: + """Tests for intra-period segmentation (variable timestep durations within clusters).""" + + def test_segment_config_creates_segmented_system(self, timesteps_8_days): + """Test that SegmentConfig creates a segmented FlowSystem.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + # Cluster with 6 segments per day (instead of 24 hourly timesteps) + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + # Verify segmentation properties + assert fs_segmented.clustering.is_segmented is True + assert fs_segmented.clustering.n_segments == 6 + assert fs_segmented.clustering.timesteps_per_cluster == 24 # Original period length + + # Time dimension should have n_segments entries (not timesteps_per_cluster) + assert len(fs_segmented.timesteps) == 6 # 6 segments + + # Verify RangeIndex for segmented time + assert isinstance(fs_segmented.timesteps, pd.RangeIndex) + + def test_segmented_system_has_variable_timestep_durations(self, timesteps_8_days): + """Test that segmented systems have variable timestep durations.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + # Timestep duration should be a DataArray with cluster dimension + timestep_duration = 
fs_segmented.timestep_duration + assert 'cluster' in timestep_duration.dims + assert 'time' in timestep_duration.dims + + # Sum of durations per cluster should equal original period length (24 hours) + for cluster in fs_segmented.clusters: + cluster_duration_sum = timestep_duration.sel(cluster=cluster).sum().item() + assert_allclose(cluster_duration_sum, 24.0, rtol=1e-6) + + def test_segmented_system_optimizes(self, solver_fixture, timesteps_8_days): + """Test that segmented systems can be optimized.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + # Optimize + fs_segmented.optimize(solver_fixture) + + # Should have solution + assert fs_segmented.solution is not None + assert 'objective' in fs_segmented.solution + + # Flow rates should have (cluster, time) structure with 6 time points + flow_var = 'Boiler(Q_th)|flow_rate' + assert flow_var in fs_segmented.solution + # time dimension has n_segments + 1 (for previous_flow_rate pattern) + assert fs_segmented.solution[flow_var].sizes['time'] == 7 # 6 + 1 + + def test_segmented_expand_restores_original_timesteps(self, solver_fixture, timesteps_8_days): + """Test that expand() restores the original timestep count for segmented systems.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + # Cluster with segments + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + # Optimize and expand + fs_segmented.optimize(solver_fixture) + fs_expanded = fs_segmented.transform.expand() + + # Should have original timesteps restored + assert len(fs_expanded.timesteps) == 192 # 8 days * 24 hours + assert fs_expanded.clusters is None # No cluster dimension after expansion + + # Should have DatetimeIndex after expansion (not RangeIndex) + assert isinstance(fs_expanded.timesteps, pd.DatetimeIndex) + + def test_segmented_expand_preserves_objective(self, solver_fixture, timesteps_8_days): + """Test that expand() preserves the objective value for segmented systems.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + fs_segmented.optimize(solver_fixture) + segmented_objective = fs_segmented.solution['objective'].item() + + fs_expanded = fs_segmented.transform.expand() + expanded_objective = fs_expanded.solution['objective'].item() + + # Objectives should be equal (expand preserves solution) + assert_allclose(segmented_objective, expanded_objective, rtol=1e-6) + + def test_segmented_expand_has_correct_flow_rates(self, solver_fixture, timesteps_8_days): + """Test that expanded flow rates have correct timestep count.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + fs_segmented.optimize(solver_fixture) + fs_expanded = fs_segmented.transform.expand() + + # Check flow rates dimension + flow_var = 'Boiler(Q_th)|flow_rate' + flow_rates = fs_expanded.solution[flow_var] + + # Should have original time dimension + assert flow_rates.sizes['time'] == 193 # 192 + 1 (previous_flow_rate) + + def test_segmented_statistics_after_expand(self, solver_fixture, 
timesteps_8_days): + """Test that statistics accessor works after expanding segmented system.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + fs_segmented.optimize(solver_fixture) + fs_expanded = fs_segmented.transform.expand() + + # Statistics should work + stats = fs_expanded.statistics + assert hasattr(stats, 'flow_rates') + assert hasattr(stats, 'total_effects') + + # Flow rates should have correct dimensions + flow_rates = stats.flow_rates + assert 'time' in flow_rates.dims + + def test_segmented_timestep_mapping_uses_segment_assignments(self, timesteps_8_days): + """Test that timestep_mapping correctly maps original timesteps to segments.""" + from tsam.config import SegmentConfig + + fs = create_simple_system(timesteps_8_days) + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + mapping = fs_segmented.clustering.timestep_mapping + + # Mapping should have original timestep count + assert len(mapping.values) == 192 + + # Each mapped value should be in valid range: [0, n_clusters * n_segments) + max_valid_idx = 2 * 6 - 1 # n_clusters * n_segments - 1 + assert mapping.min().item() >= 0 + assert mapping.max().item() <= max_valid_idx + + @pytest.mark.parametrize('freq', ['1h', '2h']) + def test_segmented_total_effects_match_solution(self, solver_fixture, freq): + """Test that total_effects matches solution Cost after expand with segmentation. + + This is a regression test for the bug where expansion_divisor was computed + incorrectly for segmented systems, causing total_effects to not match the + solution's objective value. 
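+        Illustrative expectation (editor's hypothetical numbers): a 4-hour segment
+        whose Cost total is 200 must expand to 50 per hourly timestep, so that
+        summing the expanded per-timestep values reproduces the segment total.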
+ """ + from tsam.config import SegmentConfig + + # Create system with specified timestep frequency + n_timesteps = 72 if freq == '1h' else 36 # 3 days worth + timesteps = pd.date_range('2024-01-01', periods=n_timesteps, freq=freq) + fs = fx.FlowSystem(timesteps=timesteps) + + # Minimal components: effect + source + sink with varying demand + fs.add_elements(fx.Effect('Cost', unit='EUR', is_objective=True)) + fs.add_elements(fx.Bus('Heat')) + fs.add_elements( + fx.Source( + 'Boiler', + outputs=[fx.Flow('Q', bus='Heat', size=100, effects_per_flow_hour={'Cost': 50})], + ) + ) + demand_profile = np.tile([0.5, 1], n_timesteps // 2) + fs.add_elements( + fx.Sink('Demand', inputs=[fx.Flow('Q', bus='Heat', size=50, fixed_relative_profile=demand_profile)]) + ) + + # Cluster with segments -> solve -> expand + fs_clustered = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=4), + ) + fs_clustered.optimize(solver_fixture) + fs_expanded = fs_clustered.transform.expand() + + # Validate: total_effects must match solution objective + computed = fs_expanded.statistics.total_effects['Cost'].sum('contributor') + expected = fs_expanded.solution['Cost'] + assert np.allclose(computed.values, expected.values, rtol=1e-5), ( + f'total_effects mismatch: computed={float(computed):.2f}, expected={float(expected):.2f}' + ) + + +class TestSegmentationWithStorage: + """Tests for segmentation combined with storage components.""" + + def test_segmented_storage_optimizes(self, solver_fixture, timesteps_8_days): + """Test that segmented systems with storage can be optimized.""" + from tsam.config import SegmentConfig + + fs = create_system_with_storage(timesteps_8_days, cluster_mode='cyclic') + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + fs_segmented.optimize(solver_fixture) + + # Should have solution with charge_state + assert fs_segmented.solution is not None + assert 'Battery|charge_state' in fs_segmented.solution + + def test_segmented_storage_expand(self, solver_fixture, timesteps_8_days): + """Test that segmented storage systems can be expanded.""" + from tsam.config import SegmentConfig + + fs = create_system_with_storage(timesteps_8_days, cluster_mode='cyclic') + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + fs_segmented.optimize(solver_fixture) + fs_expanded = fs_segmented.transform.expand() + + # Charge state should be expanded to original timesteps + charge_state = fs_expanded.solution['Battery|charge_state'] + # charge_state has time dimension = n_original_timesteps + 1 + assert charge_state.sizes['time'] == 193 + + +class TestSegmentationWithPeriods: + """Tests for segmentation combined with multi-period systems.""" + + def test_segmented_with_periods(self, solver_fixture, timesteps_8_days, periods_2): + """Test segmentation with multiple periods.""" + from tsam.config import SegmentConfig + + fs = create_system_with_periods(timesteps_8_days, periods_2) + + fs_segmented = fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + segments=SegmentConfig(n_segments=6), + ) + + # Verify structure + assert fs_segmented.clustering.is_segmented is True + assert fs_segmented.periods is not None + assert len(fs_segmented.periods) == 2 + + # Optimize + fs_segmented.optimize(solver_fixture) + assert fs_segmented.solution is not None + + def test_segmented_with_periods_expand(self, solver_fixture, 
+
+
+class TestSegmentationWithStorage:
+    """Tests for segmentation combined with storage components."""
+
+    def test_segmented_storage_optimizes(self, solver_fixture, timesteps_8_days):
+        """Test that segmented systems with storage can be optimized."""
+        from tsam.config import SegmentConfig
+
+        fs = create_system_with_storage(timesteps_8_days, cluster_mode='cyclic')
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        fs_segmented.optimize(solver_fixture)
+
+        # Should have solution with charge_state
+        assert fs_segmented.solution is not None
+        assert 'Battery|charge_state' in fs_segmented.solution
+
+    def test_segmented_storage_expand(self, solver_fixture, timesteps_8_days):
+        """Test that segmented storage systems can be expanded."""
+        from tsam.config import SegmentConfig
+
+        fs = create_system_with_storage(timesteps_8_days, cluster_mode='cyclic')
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        fs_segmented.optimize(solver_fixture)
+        fs_expanded = fs_segmented.transform.expand()
+
+        # Charge state should be expanded to original timesteps
+        charge_state = fs_expanded.solution['Battery|charge_state']
+        # charge_state has time dimension = n_original_timesteps + 1
+        assert charge_state.sizes['time'] == 193
+
+
+class TestSegmentationWithPeriods:
+    """Tests for segmentation combined with multi-period systems."""
+
+    def test_segmented_with_periods(self, solver_fixture, timesteps_8_days, periods_2):
+        """Test segmentation with multiple periods."""
+        from tsam.config import SegmentConfig
+
+        fs = create_system_with_periods(timesteps_8_days, periods_2)
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        # Verify structure
+        assert fs_segmented.clustering.is_segmented is True
+        assert fs_segmented.periods is not None
+        assert len(fs_segmented.periods) == 2
+
+        # Optimize
+        fs_segmented.optimize(solver_fixture)
+        assert fs_segmented.solution is not None
+
+    def test_segmented_with_periods_expand(self, solver_fixture, timesteps_8_days, periods_2):
+        """Test expansion of segmented multi-period systems."""
+        from tsam.config import SegmentConfig
+
+        fs = create_system_with_periods(timesteps_8_days, periods_2)
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        fs_segmented.optimize(solver_fixture)
+        fs_expanded = fs_segmented.transform.expand()
+
+        # Should have original timesteps and periods preserved
+        assert len(fs_expanded.timesteps) == 192
+        assert fs_expanded.periods is not None
+        assert len(fs_expanded.periods) == 2
+
+        # Solution should have period dimension
+        flow_var = 'Boiler(Q_th)|flow_rate'
+        assert 'period' in fs_expanded.solution[flow_var].dims
+
+    def test_segmented_different_clustering_per_period(self, solver_fixture, timesteps_8_days, periods_2):
+        """Test that different periods can have different cluster assignments."""
+        from tsam.config import SegmentConfig
+
+        fs = create_system_with_periods(timesteps_8_days, periods_2)
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        # Verify cluster_assignments has period dimension
+        cluster_assignments = fs_segmented.clustering.cluster_assignments
+        assert 'period' in cluster_assignments.dims
+
+        # Each period should have independent cluster assignments
+        # (may or may not be different depending on data)
+        assert cluster_assignments.sizes['period'] == 2
+
+        fs_segmented.optimize(solver_fixture)
+        fs_expanded = fs_segmented.transform.expand()
+
+        # Expanded solution should preserve period dimension
+        flow_var = 'Boiler(Q_th)|flow_rate'
+        assert 'period' in fs_expanded.solution[flow_var].dims
+        assert fs_expanded.solution[flow_var].sizes['period'] == 2
+
+    def test_segmented_expand_maps_correctly_per_period(self, solver_fixture, timesteps_8_days, periods_2):
+        """Test that expand maps values correctly for each period independently."""
+        from tsam.config import SegmentConfig
+
+        fs = create_system_with_periods(timesteps_8_days, periods_2)
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        fs_segmented.optimize(solver_fixture)
+
+        # Get the timestep_mapping which should be multi-dimensional
+        mapping = fs_segmented.clustering.timestep_mapping
+
+        # Mapping should have period dimension
+        assert 'period' in mapping.dims
+        assert mapping.sizes['period'] == 2
+
+        # Expand and verify each period has correct number of timesteps
+        fs_expanded = fs_segmented.transform.expand()
+        flow_var = 'Boiler(Q_th)|flow_rate'
+        flow_rates = fs_expanded.solution[flow_var]
+
+        # Each period should have the original time dimension
+        # time = 193 (192 + 1 for previous_flow_rate pattern)
+        assert flow_rates.sizes['time'] == 193
+        assert flow_rates.sizes['period'] == 2
+
+
+class TestSegmentationIO:
+    """Tests for IO round-trip of segmented systems."""
+
+    def test_segmented_roundtrip(self, solver_fixture, timesteps_8_days, tmp_path):
+        """Test that segmented systems survive IO round-trip."""
+        from tsam.config import SegmentConfig
+
+        fs = create_simple_system(timesteps_8_days)
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        fs_segmented.optimize(solver_fixture)
+
+        # Save and load
+        path = tmp_path / 'segmented.nc4'
+        fs_segmented.to_netcdf(path)
+        fs_loaded = fx.FlowSystem.from_netcdf(path)
+
+        # Verify segmentation preserved
+        assert fs_loaded.clustering.is_segmented is True
+        assert fs_loaded.clustering.n_segments == 6
+
+        # Verify solution preserved
+        assert_allclose(
+            fs_loaded.solution['objective'].item(),
+            fs_segmented.solution['objective'].item(),
+            rtol=1e-6,
+        )
+
+    def test_segmented_expand_after_load(self, solver_fixture, timesteps_8_days, tmp_path):
+        """Test that expand works after loading segmented system."""
+        from tsam.config import SegmentConfig
+
+        fs = create_simple_system(timesteps_8_days)
+
+        fs_segmented = fs.transform.cluster(
+            n_clusters=2,
+            cluster_duration='1D',
+            segments=SegmentConfig(n_segments=6),
+        )
+
+        fs_segmented.optimize(solver_fixture)
+
+        # Save, load, and expand
+        path = tmp_path / 'segmented.nc4'
+        fs_segmented.to_netcdf(path)
+        fs_loaded = fx.FlowSystem.from_netcdf(path)
+        fs_expanded = fs_loaded.transform.expand()
+
+        # Should have original timesteps
+        assert len(fs_expanded.timesteps) == 192
+
+        # Objective should be preserved
+        assert_allclose(
+            fs_expanded.solution['objective'].item(),
+            fs_segmented.solution['objective'].item(),
+            rtol=1e-6,
+        )
+
+
+class TestCombineSlices:
+    """Tests for the combine_slices utility function."""
+
+    def test_single_dim(self):
+        """Test combining slices with a single extra dimension."""
+        from flixopt.clustering.base import combine_slices
+
+        slices = {
+            ('A',): np.array([1.0, 2.0, 3.0]),
+            ('B',): np.array([4.0, 5.0, 6.0]),
+        }
+        result = combine_slices(
+            slices,
+            extra_dims=['x'],
+            dim_coords={'x': ['A', 'B']},
+            output_dim='time',
+            output_coord=[0, 1, 2],
+        )
+
+        assert result.dims == ('time', 'x')
+        assert result.shape == (3, 2)
+        assert result.sel(x='A').values.tolist() == [1.0, 2.0, 3.0]
+        assert result.sel(x='B').values.tolist() == [4.0, 5.0, 6.0]
+
+    def test_two_dims(self):
+        """Test combining slices with two extra dimensions."""
+        from flixopt.clustering.base import combine_slices
+
+        slices = {
+            ('P1', 'base'): np.array([1.0, 2.0]),
+            ('P1', 'high'): np.array([3.0, 4.0]),
+            ('P2', 'base'): np.array([5.0, 6.0]),
+            ('P2', 'high'): np.array([7.0, 8.0]),
+        }
+        result = combine_slices(
+            slices,
+            extra_dims=['period', 'scenario'],
+            dim_coords={'period': ['P1', 'P2'], 'scenario': ['base', 'high']},
+            output_dim='time',
+            output_coord=[0, 1],
+        )
+
+        assert result.dims == ('time', 'period', 'scenario')
+        assert result.shape == (2, 2, 2)
+        assert result.sel(period='P1', scenario='base').values.tolist() == [1.0, 2.0]
+        assert result.sel(period='P2', scenario='high').values.tolist() == [7.0, 8.0]
+
+    def test_attrs_propagation(self):
+        """Test that attrs are propagated to the result."""
+        from flixopt.clustering.base import combine_slices
+
+        slices = {('A',): np.array([1.0, 2.0])}
+        result = combine_slices(
+            slices,
+            extra_dims=['x'],
+            dim_coords={'x': ['A']},
+            output_dim='time',
+            output_coord=[0, 1],
+            attrs={'units': 'kW', 'description': 'power'},
+        )
+
+        assert result.attrs['units'] == 'kW'
+        assert result.attrs['description'] == 'power'
+
+    def test_datetime_coords(self):
+        """Test with pandas DatetimeIndex as output coordinates."""
+        from flixopt.clustering.base import combine_slices
+
+        time_index = pd.date_range('2020-01-01', periods=3, freq='h')
+        slices = {('A',): np.array([1.0, 2.0, 3.0])}
+        result = combine_slices(
+            slices,
+            extra_dims=['x'],
+            dim_coords={'x': ['A']},
+            output_dim='time',
+            output_coord=time_index,
+        )
+
+        assert result.dims == ('time', 'x')
+        assert len(result.coords['time']) == 3
+        assert result.coords['time'][0].values == time_index[0]
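+
+
+# Illustrative sketch of the contract exercised by TestCombineSlices above.
+# This is NOT the flixopt implementation (that lives in flixopt.clustering.base
+# and may differ internally); it is a minimal reference satisfying the same
+# behaviour, kept here purely as documentation of the expected semantics.
+def _reference_combine_slices(slices, extra_dims, dim_coords, output_dim, output_coord, attrs=None):
+    """Stack 1-D slices keyed by extra-dim coordinate tuples into one DataArray."""
+    import xarray as xr
+
+    shape = (len(output_coord), *(len(dim_coords[d]) for d in extra_dims))
+    data = np.empty(shape)
+    for key, values in slices.items():
+        # Position of this slice along each extra dimension
+        idx = tuple(dim_coords[d].index(k) for d, k in zip(extra_dims, key))
+        data[(slice(None), *idx)] = values
+    return xr.DataArray(
+        data,
+        dims=[output_dim, *extra_dims],
+        coords={output_dim: output_coord, **{d: list(dim_coords[d]) for d in extra_dims}},
+        attrs=dict(attrs or {}),
+    )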
diff --git a/tests/test_clustering/test_base.py b/tests/test_clustering/test_base.py
index e1fffaa75..81afc2a97 100644
--- a/tests/test_clustering/test_base.py
+++ b/tests/test_clustering/test_base.py
@@ -1,141 +1,484 @@
 """Tests for flixopt.clustering.base module."""
 
 import numpy as np
+import pandas as pd
 import pytest
 import xarray as xr
 
-from flixopt.clustering import (
-    Clustering,
-    ClusterResult,
-    ClusterStructure,
-    create_cluster_structure_from_mapping,
-)
+from flixopt.clustering import Clustering, ClusteringResults
+from flixopt.clustering.base import _build_timestep_mapping, _cluster_occurrences
 
 
-class TestClusterStructure:
-    """Tests for ClusterStructure dataclass."""
+class TestHelperFunctions:
+    """Tests for helper functions."""
 
-    def test_basic_creation(self):
-        """Test basic ClusterStructure creation."""
-        cluster_order = xr.DataArray([0, 1, 0, 1, 2, 0], dims=['original_cluster'])
-        cluster_occurrences = xr.DataArray([3, 2, 1], dims=['cluster'])
+    @pytest.fixture
+    def mock_clustering_result(self):
+        """Create a mock tsam ClusteringResult-like object."""
 
-        structure = ClusterStructure(
-            cluster_order=cluster_order,
-            cluster_occurrences=cluster_occurrences,
-            n_clusters=3,
-            timesteps_per_cluster=24,
-        )
+        class MockClusteringResult:
+            n_clusters = 3
+            n_original_periods = 6
+            n_timesteps_per_period = 24
+            cluster_assignments = (0, 1, 0, 1, 2, 0)
+            period_duration = 24.0
+            n_segments = None  # None indicates non-segmented
+            segment_assignments = None  # None indicates non-segmented
+
+            def to_dict(self):
+                return {
+                    'n_clusters': self.n_clusters,
+                    'n_original_periods': self.n_original_periods,
+                    'n_timesteps_per_period': self.n_timesteps_per_period,
+                    'cluster_assignments': list(self.cluster_assignments),
+                    'period_duration': self.period_duration,
+                }
+
+            def apply(self, data):
+                """Mock apply method."""
+                return {'applied': True}
+
+        return MockClusteringResult()
+
+    def test_cluster_occurrences(self, mock_clustering_result):
+        """Test _cluster_occurrences helper."""
+        occurrences = _cluster_occurrences(mock_clustering_result)
+        # cluster 0: 3 occurrences (indices 0, 2, 5)
+        # cluster 1: 2 occurrences (indices 1, 3)
+        # cluster 2: 1 occurrence (index 4)
+        np.testing.assert_array_equal(occurrences, [3, 2, 1])
+
+    def test_build_timestep_mapping(self, mock_clustering_result):
+        """Test _build_timestep_mapping helper."""
+        mapping = _build_timestep_mapping(mock_clustering_result, n_timesteps=144)
+        assert len(mapping) == 144
+
+        # First 24 timesteps should map to cluster 0's representative (0-23)
+        np.testing.assert_array_equal(mapping[:24], np.arange(24))
+
+        # Second 24 timesteps (period 1 -> cluster 1) should map to cluster 1's representative (24-47)
+        np.testing.assert_array_equal(mapping[24:48], np.arange(24, 48))
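+
+    # For reference, the arithmetic exercised above: with cluster_assignments
+    # (0, 1, 0, 1, 2, 0) and 24 timesteps per period, original timestep t
+    # (in original period p = t // 24) maps to representative index
+    #
+    #     cluster_assignments[p] * 24 + (t % 24)
+    #
+    # e.g. t = 100 lies in period 4 -> cluster 2 -> index 2 * 24 + 4 = 52.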
+
+
+class TestClusteringResults:
+    """Tests for ClusteringResults collection class."""
+
+    @pytest.fixture
+    def mock_clustering_result_factory(self):
+        """Factory for creating mock ClusteringResult objects."""
+
+        def create_result(cluster_assignments, n_timesteps_per_period=24):
+            class MockClusteringResult:
+                n_clusters = max(cluster_assignments) + 1 if cluster_assignments else 0
+                n_original_periods = len(cluster_assignments)
+                period_duration = 24.0
+                n_segments = None  # None indicates non-segmented
+                segment_assignments = None  # None indicates non-segmented
+
+                def __init__(self, assignments, n_timesteps):
+                    self.cluster_assignments = tuple(assignments)
+                    self.n_timesteps_per_period = n_timesteps
+
+                def to_dict(self):
+                    return {
+                        'n_clusters': self.n_clusters,
+                        'n_original_periods': self.n_original_periods,
+                        'n_timesteps_per_period': self.n_timesteps_per_period,
+                        'cluster_assignments': list(self.cluster_assignments),
+                        'period_duration': self.period_duration,
+                    }
+
+                def apply(self, data):
+                    return {'applied': True}
+
+            return MockClusteringResult(cluster_assignments, n_timesteps_per_period)
+
+        return create_result
+
+    def test_single_result(self, mock_clustering_result_factory):
+        """Test ClusteringResults with single result."""
+        cr = mock_clustering_result_factory([0, 1, 0])
+        results = ClusteringResults({(): cr}, dim_names=[])
+
+        assert results.n_clusters == 2
+        assert results.timesteps_per_cluster == 24
+        assert len(results) == 1
+
+    def test_multi_period_results(self, mock_clustering_result_factory):
+        """Test ClusteringResults with multiple periods."""
+        cr_2020 = mock_clustering_result_factory([0, 1, 0])
+        cr_2030 = mock_clustering_result_factory([1, 0, 1])
-        assert structure.n_clusters == 3
-        assert structure.timesteps_per_cluster == 24
-        assert structure.n_original_clusters == 6
-
-    def test_creation_from_numpy(self):
-        """Test ClusterStructure creation from numpy arrays."""
-        structure = ClusterStructure(
-            cluster_order=np.array([0, 0, 1, 1, 0]),
-            cluster_occurrences=np.array([3, 2]),
-            n_clusters=2,
-            timesteps_per_cluster=12,
+        results = ClusteringResults(
+            {(2020,): cr_2020, (2030,): cr_2030},
+            dim_names=['period'],
         )
-        assert isinstance(structure.cluster_order, xr.DataArray)
-        assert isinstance(structure.cluster_occurrences, xr.DataArray)
-        assert structure.n_original_clusters == 5
+        assert results.n_clusters == 2
+        assert len(results) == 2
+        # Access by period
+        assert results.sel(period=2020) is cr_2020
+        assert results.sel(period=2030) is cr_2030
 
-class TestClusterResult:
-    """Tests for ClusterResult dataclass."""
+    def test_dims_property(self, mock_clustering_result_factory):
+        """Test dims property returns tuple (xarray-like)."""
+        cr = mock_clustering_result_factory([0, 1, 0])
+        results = ClusteringResults({(): cr}, dim_names=[])
+        assert results.dims == ()
 
-    def test_basic_creation(self):
-        """Test basic ClusterResult creation."""
-        result = ClusterResult(
-            timestep_mapping=xr.DataArray([0, 0, 1, 1, 2, 2], dims=['original_time']),
-            n_representatives=3,
-            representative_weights=xr.DataArray([2, 2, 2], dims=['time']),
+        cr_2020 = mock_clustering_result_factory([0, 1, 0])
+        cr_2030 = mock_clustering_result_factory([1, 0, 1])
+        results = ClusteringResults(
+            {(2020,): cr_2020, (2030,): cr_2030},
+            dim_names=['period'],
         )
+        assert results.dims == ('period',)
 
-        assert result.n_representatives == 3
-        assert result.n_original_timesteps == 6
+    def test_coords_property(self, mock_clustering_result_factory):
+        """Test coords property returns dict (xarray-like)."""
+        cr_2020 = mock_clustering_result_factory([0, 1, 0])
+        cr_2030 = mock_clustering_result_factory([1, 0, 1])
+        results = ClusteringResults(
+            {(2020,): cr_2020, (2030,): cr_2030},
+            dim_names=['period'],
+        )
+        assert results.coords == {'period': [2020, 2030]}
 
-    def test_creation_from_numpy(self):
-        """Test ClusterResult creation from numpy arrays."""
-        result = ClusterResult(
-            timestep_mapping=np.array([0, 1, 0, 1]),
-            n_representatives=2,
-            representative_weights=np.array([2.0, 2.0]),
+    def test_sel_method(self, mock_clustering_result_factory):
+        """Test sel() method (xarray-like selection)."""
+        cr_2020 = mock_clustering_result_factory([0, 1, 0])
+        cr_2030 = mock_clustering_result_factory([1, 0, 1])
+        results = ClusteringResults(
+            {(2020,): cr_2020, (2030,): cr_2030},
+            dim_names=['period'],
        )
+        assert results.sel(period=2020) is cr_2020
+        assert results.sel(period=2030) is cr_2030
+
+    def test_sel_invalid_key_raises(self, mock_clustering_result_factory):
+        """Test sel() raises KeyError for invalid key."""
+        cr = mock_clustering_result_factory([0, 1, 0])
+        results = ClusteringResults({(2020,): cr}, dim_names=['period'])
-        assert isinstance(result.timestep_mapping, xr.DataArray)
-        assert isinstance(result.representative_weights, xr.DataArray)
+        with pytest.raises(KeyError):
+            results.sel(period=2030)
 
-    def test_validation_success(self):
-        """Test validation passes for valid result."""
-        result = ClusterResult(
-            timestep_mapping=xr.DataArray([0, 1, 0, 1], dims=['original_time']),
-            n_representatives=2,
-            representative_weights=xr.DataArray([2.0, 2.0], dims=['time']),
+    def test_isel_method(self, mock_clustering_result_factory):
+        """Test isel() method (xarray-like integer selection)."""
+        cr_2020 = mock_clustering_result_factory([0, 1, 0])
+        cr_2030 = mock_clustering_result_factory([1, 0, 1])
+        results = ClusteringResults(
+            {(2020,): cr_2020, (2030,): cr_2030},
+            dim_names=['period'],
         )
+        assert results.isel(period=0) is cr_2020
+        assert results.isel(period=1) is cr_2030
+
+    def test_isel_invalid_index_raises(self, mock_clustering_result_factory):
+        """Test isel() raises IndexError for out-of-range index."""
+        cr = mock_clustering_result_factory([0, 1, 0])
+        results = ClusteringResults({(2020,): cr}, dim_names=['period'])
+
+        with pytest.raises(IndexError):
+            results.isel(period=5)
+
+    def test_cluster_assignments_dataarray(self, mock_clustering_result_factory):
+        """Test cluster_assignments returns correct DataArray."""
+        cr = mock_clustering_result_factory([0, 1, 0])
+        results = ClusteringResults({(): cr}, dim_names=[])
-        # Should not raise
-        result.validate()
+        cluster_assignments = results.cluster_assignments
+        assert isinstance(cluster_assignments, xr.DataArray)
+        assert 'original_cluster' in cluster_assignments.dims
+        np.testing.assert_array_equal(cluster_assignments.values, [0, 1, 0])
 
-    def test_validation_invalid_mapping(self):
-        """Test validation fails for out-of-range mapping."""
-        result = ClusterResult(
-            timestep_mapping=xr.DataArray([0, 5, 0, 1], dims=['original_time']),  # 5 is out of range
-            n_representatives=2,
-            representative_weights=xr.DataArray([2.0, 2.0], dims=['time']),
+    def test_cluster_occurrences_dataarray(self, mock_clustering_result_factory):
+        """Test cluster_occurrences returns correct DataArray."""
+        cr = mock_clustering_result_factory([0, 1, 0])  # 2 x cluster 0, 1 x cluster 1
+        results = ClusteringResults({(): cr}, dim_names=[])
+
+        occurrences = results.cluster_occurrences
+        assert isinstance(occurrences, xr.DataArray)
+        assert 'cluster' in occurrences.dims
+        np.testing.assert_array_equal(occurrences.values, [2, 1])
+
+
+class TestClustering:
+    """Tests for Clustering dataclass."""
+
+    @pytest.fixture
+    def basic_cluster_results(self):
+        """Create basic ClusteringResults for testing."""
+
+        class MockClusteringResult:
+            n_clusters = 3
+            n_original_periods = 6
+            n_timesteps_per_period = 24
+            cluster_assignments = (0, 1, 0, 1, 2, 0)
+            period_duration = 24.0
+            n_segments = None  # None indicates non-segmented
+            segment_assignments = None  # None indicates non-segmented
+
+            def to_dict(self):
+                return {
+                    'n_clusters': self.n_clusters,
+                    'n_original_periods': self.n_original_periods,
+                    'n_timesteps_per_period': self.n_timesteps_per_period,
+                    'cluster_assignments': list(self.cluster_assignments),
+                    'period_duration': self.period_duration,
+                }
+
+            def apply(self, data):
+                return {'applied': True}
+
+        mock_cr = MockClusteringResult()
+        return ClusteringResults({(): mock_cr}, dim_names=[])
+
+    @pytest.fixture
+    def basic_clustering(self, basic_cluster_results):
+        """Create a basic Clustering instance for testing."""
+        original_timesteps = pd.date_range('2024-01-01', periods=144, freq='h')
+
+        return Clustering(
+            results=basic_cluster_results,
+            original_timesteps=original_timesteps,
        )
-        with pytest.raises(ValueError, match='timestep_mapping contains index'):
-            result.validate()
+    def test_basic_creation(self, basic_clustering):
+        """Test basic Clustering creation."""
+        assert basic_clustering.n_clusters == 3
+        assert basic_clustering.timesteps_per_cluster == 24
+        assert basic_clustering.n_original_clusters == 6
+
+    def test_n_representatives(self, basic_clustering):
+        """Test n_representatives property."""
+        assert basic_clustering.n_representatives == 72  # 3 * 24
+
+    def test_cluster_occurrences(self, basic_clustering):
+        """Test cluster_occurrences property returns correct values."""
+        occurrences = basic_clustering.cluster_occurrences
+        assert isinstance(occurrences, xr.DataArray)
+        assert 'cluster' in occurrences.dims
+        # cluster 0: 3 occurrences, cluster 1: 2 occurrences, cluster 2: 1 occurrence
+        assert occurrences.sel(cluster=0).item() == 3
+        assert occurrences.sel(cluster=1).item() == 2
+        assert occurrences.sel(cluster=2).item() == 1
 
-    def test_get_expansion_mapping(self):
-        """Test get_expansion_mapping returns named DataArray."""
-        result = ClusterResult(
-            timestep_mapping=xr.DataArray([0, 1, 0], dims=['original_time']),
-            n_representatives=2,
-            representative_weights=xr.DataArray([2.0, 1.0], dims=['time']),
+    def test_representative_weights(self, basic_clustering):
+        """Test representative_weights is the same as cluster_occurrences."""
+        weights = basic_clustering.representative_weights
+        occurrences = basic_clustering.cluster_occurrences
+        xr.testing.assert_equal(
+            weights.drop_vars('cluster', errors='ignore'),
+            occurrences.drop_vars('cluster', errors='ignore'),
         )
-        mapping = result.get_expansion_mapping()
-        assert mapping.name == 'expansion_mapping'
+    def test_timestep_mapping(self, basic_clustering):
+        """Test timestep_mapping property."""
+        mapping = basic_clustering.timestep_mapping
+        assert isinstance(mapping, xr.DataArray)
+        assert 'original_time' in mapping.dims
+        assert len(mapping) == 144  # Original timesteps
+
+    def test_metrics(self, basic_clustering):
+        """Test metrics property returns empty Dataset when no metrics."""
+        metrics = basic_clustering.metrics
+        assert isinstance(metrics, xr.Dataset)
+        # No metrics provided, so should be empty
+        assert len(metrics.data_vars) == 0
 
-class TestCreateClusterStructureFromMapping:
-    """Tests for create_cluster_structure_from_mapping function."""
+    def test_cluster_start_positions(self, basic_clustering):
+        """Test cluster_start_positions property."""
+        positions = basic_clustering.cluster_start_positions
+        np.testing.assert_array_equal(positions, [0, 24, 48])
 
-    def test_basic_creation(self):
-        """Test creating ClusterStructure from timestep mapping."""
-        # 12 original timesteps, 4 per period, 3 periods
-        # Mapping: period 0 -> cluster 0, period 1 -> cluster 1, period 2 -> cluster 0
-        mapping = xr.DataArray(
-            [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3],  # First and third period map to cluster 0
-            dims=['original_time'],
+    def test_empty_results_raises(self):
+        """Test that empty results raises ValueError."""
+        with pytest.raises(ValueError, match='cannot be empty'):
+            ClusteringResults({}, dim_names=[])
+
+    def test_repr(self, basic_clustering):
+        """Test string representation."""
+        repr_str = repr(basic_clustering)
+        assert 'Clustering' in repr_str
+        assert '6 periods' in repr_str
+        assert '3 clusters' in repr_str
+
+
+class TestClusteringMultiDim:
+    """Tests for Clustering with period/scenario dimensions."""
+
+    @pytest.fixture
+    def mock_clustering_result_factory(self):
+        """Factory for creating mock ClusteringResult objects."""
+
+        def create_result(cluster_assignments, n_timesteps_per_period=24):
+            class MockClusteringResult:
+                n_clusters = max(cluster_assignments) + 1 if cluster_assignments else 0
+                n_original_periods = len(cluster_assignments)
+                period_duration = 24.0
+                n_segments = None  # None indicates non-segmented
+                segment_assignments = None  # None indicates non-segmented
+
+                def __init__(self, assignments, n_timesteps):
+                    self.cluster_assignments = tuple(assignments)
+                    self.n_timesteps_per_period = n_timesteps
+
+                def to_dict(self):
+                    return {
+                        'n_clusters': self.n_clusters,
+                        'n_original_periods': self.n_original_periods,
+                        'n_timesteps_per_period': self.n_timesteps_per_period,
+                        'cluster_assignments': list(self.cluster_assignments),
+                        'period_duration': self.period_duration,
+                    }
+
+                def apply(self, data):
+                    return {'applied': True}
+
+            return MockClusteringResult(cluster_assignments, n_timesteps_per_period)
+
+        return create_result
+
+    def test_multi_period_clustering(self, mock_clustering_result_factory):
+        """Test Clustering with multiple periods."""
+        cr_2020 = mock_clustering_result_factory([0, 1, 0])
+        cr_2030 = mock_clustering_result_factory([1, 0, 1])
+
+        results = ClusteringResults(
+            {(2020,): cr_2020, (2030,): cr_2030},
+            dim_names=['period'],
        )
+        original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
-        structure = create_cluster_structure_from_mapping(mapping, timesteps_per_cluster=4)
+        clustering = Clustering(
+            results=results,
+            original_timesteps=original_timesteps,
+        )
-        assert structure.timesteps_per_cluster == 4
-        assert structure.n_original_clusters == 3
+        assert clustering.n_clusters == 2
+        assert 'period' in clustering.cluster_occurrences.dims
+
+    def test_get_result(self, mock_clustering_result_factory):
+        """Test get_result method."""
+        cr = mock_clustering_result_factory([0, 1, 0])
+        results = ClusteringResults({(): cr}, dim_names=[])
+        original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
-class TestClustering:
-    """Tests for Clustering dataclass."""
+        clustering = Clustering(
+            results=results,
+            original_timesteps=original_timesteps,
+        )
+
+        retrieved = clustering.get_result()
+        assert retrieved is cr
+
+    def test_get_result_invalid_key(self, mock_clustering_result_factory):
+        """Test get_result with invalid key raises KeyError."""
+        cr = mock_clustering_result_factory([0, 1, 0])
+        results = ClusteringResults({(2020,): cr}, dim_names=['period'])
+        original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
+
+        clustering = Clustering(
+            results=results,
+            original_timesteps=original_timesteps,
+        )
+
+        with pytest.raises(KeyError):
+            clustering.get_result(period=2030)
+
+
+class TestClusteringPlotAccessor:
+    """Tests for ClusteringPlotAccessor."""
+
+    @pytest.fixture
+    def clustering_with_data(self):
+        """Create Clustering with original and aggregated data."""
+
+        class MockClusteringResult:
+            n_clusters = 2
+            n_original_periods = 3
+            n_timesteps_per_period = 24
+            cluster_assignments = (0, 1, 0)
+            period_duration = 24.0
+
+            def to_dict(self):
+                return {
+                    'n_clusters': self.n_clusters,
+                    'n_original_periods': self.n_original_periods,
+                    'n_timesteps_per_period': self.n_timesteps_per_period,
+                    'cluster_assignments': list(self.cluster_assignments),
+                    'period_duration': self.period_duration,
+                }
+
+            def apply(self, data):
+                return {'applied': True}
+
+        mock_cr = MockClusteringResult()
+        results = ClusteringResults({(): mock_cr}, dim_names=[])
+
+        original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
-    def test_creation(self):
-        """Test Clustering creation."""
-        result = ClusterResult(
-            timestep_mapping=xr.DataArray([0, 1], dims=['original_time']),
-            n_representatives=2,
-            representative_weights=xr.DataArray([1.0, 1.0], dims=['time']),
+        original_data = xr.Dataset(
+            {
+                'col1': xr.DataArray(np.random.randn(72), dims=['time'], coords={'time': original_timesteps}),
+            }
        )
+        aggregated_data = xr.Dataset(
+            {
+                'col1': xr.DataArray(
+                    np.random.randn(2, 24),
+                    dims=['cluster', 'time'],
+                    coords={'cluster': [0, 1], 'time': pd.date_range('2000-01-01', periods=24, freq='h')},
+                ),
+            }
+        )
+
+        return Clustering(
+            results=results,
+            original_timesteps=original_timesteps,
+            original_data=original_data,
+            aggregated_data=aggregated_data,
+        )
+
+    def test_plot_accessor_exists(self, clustering_with_data):
+        """Test that plot accessor is available."""
+        assert hasattr(clustering_with_data, 'plot')
+        assert hasattr(clustering_with_data.plot, 'compare')
+        assert hasattr(clustering_with_data.plot, 'heatmap')
+        assert hasattr(clustering_with_data.plot, 'clusters')
+
+    def test_compare_requires_data(self):
+        """Test compare() raises when no data available."""
+
+        class MockClusteringResult:
+            n_clusters = 2
+            n_original_periods = 2
+            n_timesteps_per_period = 24
+            cluster_assignments = (0, 1)
+            period_duration = 24.0
+
+            def to_dict(self):
+                return {
+                    'n_clusters': self.n_clusters,
+                    'n_original_periods': self.n_original_periods,
+                    'n_timesteps_per_period': self.n_timesteps_per_period,
+                    'cluster_assignments': list(self.cluster_assignments),
+                    'period_duration': self.period_duration,
+                }
+
+            def apply(self, data):
+                return {'applied': True}
+
+        mock_cr = MockClusteringResult()
+        results = ClusteringResults({(): mock_cr}, dim_names=[])
+        original_timesteps = pd.date_range('2024-01-01', periods=48, freq='h')
-        info = Clustering(
-            result=result,
-            backend_name='tsam',
+        clustering = Clustering(
+            results=results,
+            original_timesteps=original_timesteps,
        )
-        assert info.backend_name == 'tsam'
+        with pytest.raises(ValueError, match='No original/aggregated data'):
+            clustering.plot.compare()
diff --git a/tests/test_clustering/test_integration.py b/tests/test_clustering/test_integration.py
index 16c638c95..ea947b4fd 100644
--- a/tests/test_clustering/test_integration.py
+++ b/tests/test_clustering/test_integration.py
@@ -122,6 +122,97 @@ def test_weights_with_cluster_weight(self):
         np.testing.assert_array_almost_equal(fs.temporal_weight.values, expected.values)
 
 
+class TestClusteringData:
+    """Tests for FlowSystem.transform.clustering_data method."""
+
+    def test_clustering_data_method_exists(self):
+        """Test that transform.clustering_data method exists."""
+        fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=48, freq='h'))
+
+        assert hasattr(fs.transform, 'clustering_data')
+        assert callable(fs.transform.clustering_data)
+
+    def test_clustering_data_returns_dataset(self):
+        """Test that clustering_data returns an xr.Dataset."""
+        from flixopt import Bus, Flow, Sink, Source
+
+        n_hours = 48
+        fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'))
+
+        # Add components with time-varying data
+        demand_data = np.sin(np.linspace(0, 4 * np.pi, n_hours)) + 2
+        bus = Bus('electricity')
+        source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)])
+        sink = Sink(
+            'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)]
+        )
+        fs.add_elements(source, sink, bus)
+
+        clustering_data = fs.transform.clustering_data()
+
+        assert isinstance(clustering_data, xr.Dataset)
+
+    def test_clustering_data_contains_only_time_varying(self):
+        """Test that clustering_data returns only time-varying data."""
+        from flixopt import Bus, Flow, Sink, Source
+
+        n_hours = 48
+        fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'))
+
+        # Add components with time-varying and constant data
+        demand_data = np.sin(np.linspace(0, 4 * np.pi, n_hours)) + 2
+        bus = Bus('electricity')
+        source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)])
+        sink = Sink(
+            'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)]
+        )
+        fs.add_elements(source, sink, bus)
+
+        clustering_data = fs.transform.clustering_data()
+
+        # Should contain the demand profile
+        assert 'demand(demand_out)|fixed_relative_profile' in clustering_data.data_vars
+
+        # All arrays should have 'time' dimension
+        for var in clustering_data.data_vars:
+            assert 'time' in clustering_data[var].dims
+
+    def test_clustering_data_with_periods(self):
+        """Test clustering_data with multi-period system."""
+        from flixopt import Bus, Effect, Flow, Sink, Source
+
+        n_hours = 48
+        periods = pd.Index([2024, 2030], name='period')
+        fs = FlowSystem(
+            timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'),
+            periods=periods,
+        )
+
+        # Add components
+        demand_data = xr.DataArray(
+            np.random.rand(n_hours, 2),
+            dims=['time', 'period'],
+            coords={'time': fs.timesteps, 'period': periods},
+        )
+        bus = Bus('electricity')
+        effect = Effect('costs', '€', is_objective=True)
+        source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)])
+        sink = Sink(
+            'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)]
+        )
+        fs.add_elements(source, sink, bus, effect)
+
+        # Get data for specific period
+        data_2024 = fs.transform.clustering_data(period=2024)
+
+        # Should not have period dimension (it was selected)
+        assert 'period' not in data_2024.dims
+
+        # Get data for all periods
+        data_all = fs.transform.clustering_data()
+        assert 'period' in data_all.dims
+
+
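+# For orientation, the usage pattern pinned down above (clustering_data is the
+# accessor under test; everything else is plain xarray):
+#
+#     data = fs.transform.clustering_data()                  # all time-varying inputs
+#     data_2024 = fs.transform.clustering_data(period=2024)  # one period, no 'period' dim
+#
+# Presumably this Dataset is also what transform.cluster() hands to tsam, since
+# only time-varying inputs are relevant for clustering.
+
+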
 class TestClusterMethod:
     """Tests for FlowSystem.transform.cluster method."""
 
@@ -194,10 +285,12 @@ def basic_flow_system(self):
         fs.add_elements(source, sink, bus)
         return fs
 
-    def test_cluster_method_parameter(self, basic_flow_system):
-        """Test that cluster_method parameter works."""
+    def test_cluster_config_parameter(self, basic_flow_system):
+        """Test that cluster config parameter works."""
+        from tsam.config import ClusterConfig
+
         fs_clustered = basic_flow_system.transform.cluster(
-            n_clusters=2, cluster_duration='1D', cluster_method='hierarchical'
+            n_clusters=2, cluster_duration='1D', cluster=ClusterConfig(method='hierarchical')
         )
 
         assert len(fs_clustered.clusters) == 2
 
@@ -207,7 +300,7 @@ def test_hierarchical_is_deterministic(self, basic_flow_system):
         fs2 = basic_flow_system.transform.cluster(n_clusters=2, cluster_duration='1D')
 
         # Hierarchical clustering should produce identical cluster orders
-        xr.testing.assert_equal(fs1.clustering.cluster_order, fs2.clustering.cluster_order)
+        xr.testing.assert_equal(fs1.clustering.cluster_assignments, fs2.clustering.cluster_assignments)
 
     def test_metrics_available(self, basic_flow_system):
         """Test that clustering metrics are available after clustering."""
@@ -219,23 +312,27 @@
         assert len(fs_clustered.clustering.metrics.data_vars) > 0
 
     def test_representation_method_parameter(self, basic_flow_system):
-        """Test that representation_method parameter works."""
+        """Test that representation method via ClusterConfig works."""
+        from tsam.config import ClusterConfig
+
         fs_clustered = basic_flow_system.transform.cluster(
-            n_clusters=2, cluster_duration='1D', representation_method='medoidRepresentation'
+            n_clusters=2, cluster_duration='1D', cluster=ClusterConfig(representation='medoid')
         )
 
         assert len(fs_clustered.clusters) == 2
 
-    def test_rescale_cluster_periods_parameter(self, basic_flow_system):
-        """Test that rescale_cluster_periods parameter works."""
+    def test_preserve_column_means_parameter(self, basic_flow_system):
+        """Test that preserve_column_means parameter works via tsam_kwargs."""
         fs_clustered = basic_flow_system.transform.cluster(
-            n_clusters=2, cluster_duration='1D', rescale_cluster_periods=False
+            n_clusters=2, cluster_duration='1D', preserve_column_means=False
         )
 
         assert len(fs_clustered.clusters) == 2
 
     def test_tsam_kwargs_passthrough(self, basic_flow_system):
         """Test that additional kwargs are passed to tsam."""
-        # sameMean is a valid tsam parameter
-        fs_clustered = basic_flow_system.transform.cluster(n_clusters=2, cluster_duration='1D', sameMean=True)
+        # preserve_column_means is a valid tsam.aggregate() parameter
+        fs_clustered = basic_flow_system.transform.cluster(
+            n_clusters=2, cluster_duration='1D', preserve_column_means=False
+        )
 
         assert len(fs_clustered.clusters) == 2
 
     def test_metrics_with_periods(self):
@@ -275,12 +372,4 @@ def test_import_from_flixopt(self):
         """Test that clustering module can be imported from flixopt."""
         from flixopt import clustering
 
-        assert hasattr(clustering, 'ClusterResult')
-        assert hasattr(clustering, 'ClusterStructure')
         assert hasattr(clustering, 'Clustering')
-
-    def test_create_cluster_structure_from_mapping_available(self):
-        """Test that create_cluster_structure_from_mapping is available."""
-        from flixopt.clustering import create_cluster_structure_from_mapping
-
-        assert callable(create_cluster_structure_from_mapping)
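+
+# Summary of the API migration exercised by these tests (old -> new):
+#
+#     cluster_method='hierarchical'                 -> cluster=ClusterConfig(method='hierarchical')
+#     representation_method='medoidRepresentation'  -> cluster=ClusterConfig(representation='medoid')
+#     rescale_cluster_periods=False                 -> preserve_column_means=False
+#     sameMean=True                                 -> preserve_column_means (tsam.aggregate() kwarg)
+#
+# ClusterResult, ClusterStructure and create_cluster_structure_from_mapping are
+# gone from flixopt.clustering; Clustering (with ClusteringResults) remains.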
diff --git a/tests/test_clustering_io.py b/tests/test_clustering_io.py
index c1b211034..e3bfa6c1d 100644
--- a/tests/test_clustering_io.py
+++ b/tests/test_clustering_io.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import xarray as xr
 
 import flixopt as fx
 
@@ -78,6 +79,8 @@ def test_clustering_to_dataset_has_clustering_attrs(self, simple_system_8_days):
 
     def test_clustering_roundtrip_preserves_clustering_object(self, simple_system_8_days):
         """Clustering object should be restored after roundtrip."""
+        from flixopt.clustering import Clustering
+
         fs = simple_system_8_days
         fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
 
@@ -85,9 +88,9 @@ def test_clustering_roundtrip_preserves_clustering_object(self, simple_system_8_
         ds = fs_clustered.to_dataset(include_solution=False)
         fs_restored = fx.FlowSystem.from_dataset(ds)
 
-        # Clustering should be restored
+        # Clustering should be restored as proper Clustering instance
         assert fs_restored.clustering is not None
-        assert fs_restored.clustering.backend_name == 'tsam'
+        assert isinstance(fs_restored.clustering, Clustering)
 
     def test_clustering_roundtrip_preserves_n_clusters(self, simple_system_8_days):
         """Number of clusters should be preserved after roundtrip."""
@@ -118,7 +121,8 @@ def test_clustering_roundtrip_preserves_original_timesteps(self, simple_system_8
         ds = fs_clustered.to_dataset(include_solution=False)
         fs_restored = fx.FlowSystem.from_dataset(ds)
 
-        pd.testing.assert_index_equal(fs_restored.clustering.original_timesteps, original_timesteps)
+        # check_names=False because index name may be lost during serialization
+        pd.testing.assert_index_equal(fs_restored.clustering.original_timesteps, original_timesteps, check_names=False)
 
     def test_clustering_roundtrip_preserves_timestep_mapping(self, simple_system_8_days):
         """Timestep mapping should be preserved after roundtrip."""
@@ -534,3 +538,187 @@ def test_clustering_preserves_component_labels(self, simple_system_8_days, solve
 
         # Component labels should be preserved
         assert 'demand' in fs_expanded.components
         assert 'source' in fs_expanded.components
+
+
+class TestMultiDimensionalClusteringIO:
+    """Test IO for clustering with both periods and scenarios (multi-dimensional)."""
+
+    @pytest.fixture
+    def system_with_periods_and_scenarios(self):
+        """Create a flow system with both periods and scenarios, with different demand patterns."""
+        n_days = 3
+        hours = 24 * n_days
+        timesteps = pd.date_range('2024-01-01', periods=hours, freq='h')
+        periods = pd.Index([2024, 2025], name='period')
+        scenarios = pd.Index(['high', 'low'], name='scenario')
+
+        # Create DIFFERENT demand patterns per period/scenario to get different cluster assignments
+        # Pattern structure: (base_mean, amplitude) for each day
+        patterns = {
+            (2024, 'high'): [(100, 40), (100, 40), (50, 20)],  # Days 0&1 similar
+            (2024, 'low'): [(50, 20), (100, 40), (100, 40)],  # Days 1&2 similar
+            (2025, 'high'): [(100, 40), (50, 20), (100, 40)],  # Days 0&2 similar
+            (2025, 'low'): [(50, 20), (50, 20), (100, 40)],  # Days 0&1 similar
+        }
+
+        demand_values = np.zeros((hours, len(periods), len(scenarios)))
+        for pi, period in enumerate(periods):
+            for si, scenario in enumerate(scenarios):
+                base = np.zeros(hours)
+                for d, (mean, amp) in enumerate(patterns[(period, scenario)]):
+                    start = d * 24
+                    base[start : start + 24] = mean + amp * np.sin(np.linspace(0, 2 * np.pi, 24))
+                demand_values[:, pi, si] = base
+
+        demand = xr.DataArray(
+            demand_values,
+            dims=['time', 'period', 'scenario'],
+            coords={'time': timesteps, 'period': periods, 'scenario': scenarios},
+        )
+
+        fs = fx.FlowSystem(timesteps, periods=periods, scenarios=scenarios)
+        fs.add_elements(
+            fx.Bus('heat'),
+            fx.Effect('costs', unit='EUR', description='costs', is_objective=True, is_standard=True),
+            fx.Sink('demand', inputs=[fx.Flow('in', bus='heat', fixed_relative_profile=demand, size=1)]),
+            fx.Source('source', outputs=[fx.Flow('out', bus='heat', size=200, effects_per_flow_hour={'costs': 0.05})]),
+        )
+        return fs
+
+    def test_cluster_assignments_has_correct_dimensions(self, system_with_periods_and_scenarios):
+        """cluster_assignments should have dimensions for original_cluster, period, and scenario."""
+        fs = system_with_periods_and_scenarios
+        fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+
+        cluster_assignments = fs_clustered.clustering.cluster_assignments
+        assert 'original_cluster' in cluster_assignments.dims
+        assert 'period' in cluster_assignments.dims
+        assert 'scenario' in cluster_assignments.dims
+        assert cluster_assignments.shape == (3, 2, 2)  # 3 days, 2 periods, 2 scenarios
+
+    def test_different_assignments_per_period_scenario(self, system_with_periods_and_scenarios):
+        """Different period/scenario combinations should have different cluster assignments."""
+        fs = system_with_periods_and_scenarios
+        fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+
+        # Collect all unique assignment patterns
+        assignments = set()
+        for period in fs_clustered.periods:
+            for scenario in fs_clustered.scenarios:
+                order = tuple(fs_clustered.clustering.cluster_assignments.sel(period=period, scenario=scenario).values)
+                assignments.add(order)
+
+        # We expect at least 2 different patterns (the demand was designed to create different patterns)
+        assert len(assignments) >= 2, f'Expected at least 2 unique patterns, got {len(assignments)}'
+
+    def test_cluster_assignments_preserved_after_roundtrip(self, system_with_periods_and_scenarios, tmp_path):
+        """cluster_assignments should be exactly preserved after netcdf roundtrip."""
+        fs = system_with_periods_and_scenarios
+        fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+
+        # Store original cluster_assignments
+        original_cluster_assignments = fs_clustered.clustering.cluster_assignments.copy()
+
+        # Roundtrip via netcdf
+        nc_path = tmp_path / 'multi_dim_clustering.nc'
+        fs_clustered.to_netcdf(nc_path)
+        fs_restored = fx.FlowSystem.from_netcdf(nc_path)
+
+        # cluster_assignments should be exactly preserved
+        xr.testing.assert_equal(original_cluster_assignments, fs_restored.clustering.cluster_assignments)
+
+    def test_results_preserved_after_load(self, system_with_periods_and_scenarios, tmp_path):
+        """ClusteringResults should be preserved after loading (via ClusteringResults.to_dict())."""
+        fs = system_with_periods_and_scenarios
+        fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+
+        # Before save, results exists
+        assert fs_clustered.clustering.results is not None
+
+        # Roundtrip
+        nc_path = tmp_path / 'multi_dim_clustering.nc'
+        fs_clustered.to_netcdf(nc_path)
+        fs_restored = fx.FlowSystem.from_netcdf(nc_path)
+
+        # After load, results should be reconstructed
+        assert fs_restored.clustering.results is not None
+        # The restored results should have the same structure
+        assert len(fs_restored.clustering.results) == len(fs_clustered.clustering.results)
+
+    def test_derived_properties_work_after_load(self, system_with_periods_and_scenarios, tmp_path):
+        """Derived properties should work correctly after loading (computed from cluster_assignments)."""
+        fs = system_with_periods_and_scenarios
+        fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+
+        # Roundtrip
+        nc_path = tmp_path / 'multi_dim_clustering.nc'
+        fs_clustered.to_netcdf(nc_path)
+        fs_restored = fx.FlowSystem.from_netcdf(nc_path)
+
+        # These properties should work correctly after roundtrip
+        assert fs_restored.clustering.n_clusters == 2
+        assert fs_restored.clustering.timesteps_per_cluster == 24
+
+        # cluster_occurrences should be derived from cluster_assignments
+        occurrences = fs_restored.clustering.cluster_occurrences
+        assert occurrences is not None
+        # For each period/scenario, occurrences should sum to n_original_clusters (3 days)
+        for period in fs_restored.periods:
+            for scenario in fs_restored.scenarios:
+                occ = occurrences.sel(period=period, scenario=scenario)
+                assert occ.sum().item() == 3
+
+    def test_apply_clustering_after_load(self, system_with_periods_and_scenarios, tmp_path):
+        """apply_clustering should work with a clustering loaded from netcdf."""
+        fs = system_with_periods_and_scenarios
+        fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+
+        # Save clustered system
+        nc_path = tmp_path / 'multi_dim_clustering.nc'
+        fs_clustered.to_netcdf(nc_path)
+
+        # Load the full FlowSystem with clustering
+        fs_loaded = fx.FlowSystem.from_netcdf(nc_path)
+        clustering_loaded = fs_loaded.clustering
+        # ClusteringResults should be fully preserved after load
+        assert clustering_loaded.results is not None
+
+        # Create a fresh FlowSystem (copy the original, unclustered one)
+        fs_fresh = fs.copy()
+
+        # Apply the loaded clustering to the fresh FlowSystem
+        fs_new_clustered = fs_fresh.transform.apply_clustering(clustering_loaded)
+
+        # Should have same cluster structure
+        assert fs_new_clustered.clustering.n_clusters == 2
+        # Clustered FlowSystem has 'cluster' and 'time' dimensions
+        # timesteps gives time dimension (24 hours per cluster), cluster is separate
+        assert len(fs_new_clustered.timesteps) == 24  # 24 hours per typical period
+        assert 'cluster' in fs_new_clustered.dims
+        assert len(fs_new_clustered.indexes['cluster']) == 2  # 2 clusters
+
+        # cluster_assignments should match
+        xr.testing.assert_equal(
+            fs_clustered.clustering.cluster_assignments, fs_new_clustered.clustering.cluster_assignments
+        )
+
+    def test_expand_after_load_and_optimize(self, system_with_periods_and_scenarios, tmp_path, solver_fixture):
+        """expand() should work correctly after loading a solved clustered system."""
+        fs = system_with_periods_and_scenarios
+        fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+        fs_clustered.optimize(solver_fixture)
+
+        # Roundtrip
+        nc_path = tmp_path / 'multi_dim_clustering_solved.nc'
+        fs_clustered.to_netcdf(nc_path)
+        fs_restored = fx.FlowSystem.from_netcdf(nc_path)
+
+        # expand should work
+        fs_expanded = fs_restored.transform.expand()
+
+        # Should have original number of timesteps
+        assert len(fs_expanded.timesteps) == 24 * 3  # 3 days × 24 hours
+
+        # Solution should be expanded
+        assert fs_expanded.solution is not None
+        assert 'source(out)|flow_rate' in fs_expanded.solution
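+
+
+# End-to-end sketch of the round-trip these IO tests guard (names as used in
+# the tests above; the solver choice is illustrative):
+#
+#     fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
+#     fs_clustered.optimize(solver)
+#     fs_clustered.to_netcdf('clustered.nc')
+#
+#     fs_restored = fx.FlowSystem.from_netcdf('clustered.nc')
+#     fs_expanded = fs_restored.transform.expand()  # back to original timesteps
+#     fs_expanded.solution                          # expanded, incl. period/scenario dims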