Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/322.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Reading timeseries from a CSV now defaults to ``low_memory=False`` for the C parser, so column dtype inference is a single deterministic pass.
This stops a mostly-null or mixed-type metadata column from loading as ``object`` on one run and ``float64`` on another, and silences the accompanying ``DtypeWarning``.
An explicitly supplied ``low_memory`` is still respected, and the option is left untouched for the ``python`` and ``pyarrow`` engines.
7 changes: 7 additions & 0 deletions src/scmdata/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,13 @@ def _read_pandas(

else:
_logger.debug("Reading with pandas read_csv")
# pandas' default ``low_memory=True`` reads the file in chunks and infers
# each column's dtype per chunk. For mostly-null or mixed-type metadata
# columns this is non-deterministic across runs and emits a ``DtypeWarning``.
# Reading the whole column at once makes inference deterministic and quiet.
# ``low_memory`` is only accepted by the C parser, so leave other engines be.
if (kwargs.get("engine") or "c") == "c":
kwargs.setdefault("low_memory", False)
dateframe = pd.read_csv(fname, *args, **kwargs)

def _to_lower(c):
Expand Down
54 changes: 54 additions & 0 deletions tests/unit/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3367,6 +3367,60 @@ def test_read_from_disk(test_file, test_kwargs, test_data_path, use_pathlib):
)


def _write_two_timeseries_csv(tmp_path):
    """
    Write a minimal two-row timeseries CSV into ``tmp_path``.

    The file has the metadata columns ``model``, ``scenario``, ``region``,
    ``variable`` and ``unit`` plus a single ``2020`` value column. The two
    rows differ only in ``region`` ("World", "Europe") and value (1.0, 2.0).

    Parameters
    ----------
    tmp_path
        Directory to write into; must support the ``/`` path operator
        (e.g. a :class:`pathlib.Path`).

    Returns
    -------
    Path of the written file, ``tmp_path / "two_timeseries.csv"``.
    """
    df = pd.DataFrame(
        {
            # Scalars are broadcast by pandas to the length of the list columns.
            "model": "idealised",
            "scenario": "idealised",
            "region": ["World", "Europe"],
            "variable": "Emissions|CO2",
            "unit": "GtC / yr",
            "2020": [1.0, 2.0],
        }
    )
    fname = tmp_path / "two_timeseries.csv"
    # index=False keeps the pandas row index out of the file.
    df.to_csv(fname, index=False)
    return fname


def _capture_read_csv_kwargs(call):
    """
    Run ``call`` while spying on ``pd.read_csv``; return the kwargs it received.

    Parameters
    ----------
    call
        Zero-argument callable that is expected to invoke ``pd.read_csv``.

    Returns
    -------
    dict
        Keyword arguments passed to ``pd.read_csv``. If it is called more than
        once the kwargs are merged, with later calls overriding earlier ones.

    Raises
    ------
    AssertionError
        If ``call`` returns without ever invoking ``pd.read_csv``. Without
        this check an empty dict would be returned and a caller asserting
        that some kwarg is *absent* would pass without testing anything.
    """
    captured = {}
    n_calls = 0
    # Keep a handle on the real function so the spy can delegate to it.
    real_read_csv = pd.read_csv

    def _spy(*args, **kwargs):
        nonlocal n_calls
        n_calls += 1
        captured.update(kwargs)
        return real_read_csv(*args, **kwargs)

    # patch.object restores the real ``pd.read_csv`` on exit, even on error.
    with patch.object(pd, "read_csv", _spy):
        call()

    if n_calls == 0:
        raise AssertionError("pd.read_csv was never called")
    return captured


def test_read_csv_defaults_to_low_memory_false(tmp_path):
    # With pandas' default low_memory=True the C parser works chunk by chunk
    # and infers dtypes per chunk; for mostly-null / mixed-type metadata
    # columns that is non-deterministic and raises a DtypeWarning. Loading a
    # CSV through ScmRun should therefore ask for low_memory=False.
    csv_path = _write_two_timeseries_csv(tmp_path)

    def _load():
        return ScmRun(csv_path)

    seen_kwargs = _capture_read_csv_kwargs(_load)
    assert seen_kwargs["low_memory"] is False


def test_read_csv_respects_explicit_low_memory(tmp_path):
    # A caller-supplied low_memory takes precedence over the injected default.
    csv_path = _write_two_timeseries_csv(tmp_path)

    def _load():
        return ScmRun(csv_path, low_memory=True)

    seen_kwargs = _capture_read_csv_kwargs(_load)
    assert seen_kwargs["low_memory"] is True


def test_read_csv_python_engine_omits_low_memory(tmp_path):
    # low_memory is rejected by the python parser engine, so it must not be
    # injected when the caller selects it -- otherwise read_csv raises ValueError.
    fname = _write_two_timeseries_csv(tmp_path)
    captured = _capture_read_csv_kwargs(lambda: ScmRun(fname, engine="python"))
    # Positive check first: proves read_csv was reached with the requested
    # engine, so the absence check below cannot pass vacuously on an empty
    # capture.
    assert captured.get("engine") == "python"
    assert "low_memory" not in captured


def test_read_from_disk_different_number_of_digits_years(test_data_path):
loaded = ScmRun(
os.path.join(test_data_path, "different_number_of_digits_years.csv")
Expand Down
Loading