diff --git a/pyreadstat/_readstat_parser.pyx b/pyreadstat/_readstat_parser.pyx
index b9733bc..725f9c6 100644
--- a/pyreadstat/_readstat_parser.pyx
+++ b/pyreadstat/_readstat_parser.pyx
@@ -20,6 +20,7 @@ from cpython.datetime cimport import_datetime, timedelta_new, datetime_new, tota
 from cpython.exc cimport PyErr_Occurred
 from cpython.object cimport PyObject
 from libc.math cimport floor #NAN,
+from libc.string cimport memcpy
 
 from collections import OrderedDict
 import datetime
@@ -877,9 +878,54 @@ cdef void check_exit_status(readstat_error_t retcode) except *:
         raise ReadstatError(err_message)
 
 
-cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset) except *:
+# File-object I/O handlers: bridge Python file object methods to readstat's C callbacks
+cdef int pyobject_open_handler(const char *path, void *io_ctx) noexcept:
+    """The file object is already open, so there is nothing to do."""
+    return 0  # Success
+
+cdef ssize_t pyobject_read_handler(void *buf, size_t nbyte, void *io_ctx) noexcept:
+    """Read nbyte bytes from the Python file object into buf."""
+    cdef object file_obj = <object>io_ctx
+    cdef size_t data_len
+    cdef const unsigned char[:] data_view
+
+    try:
+        data = file_obj.read(nbyte)
+        if isinstance(data, bytes):
+            data_len = len(data)
+            if data_len > 0:
+                data_view = data
+                memcpy(buf, &data_view[0], data_len)
+            return data_len
+        return 0
+    except:
+        return -1  # Error
+
+cdef readstat_off_t pyobject_seek_handler(readstat_off_t offset, readstat_io_flags_t whence, void *io_ctx) noexcept:
+    """Seek in the Python file object."""
+    cdef object file_obj = <object>io_ctx
+    cdef int py_whence = 0  # SEEK_SET
+    if whence == READSTAT_SEEK_CUR:
+        py_whence = 1
+    elif whence == READSTAT_SEEK_END:
+        py_whence = 2
+    try:
+        return file_obj.seek(offset, py_whence)
+    except:
+        return -1  # Error
+
+cdef int pyobject_close_handler(void *io_ctx) noexcept:
+    """Do not close: the caller manages the file object's lifetime."""
+    return 0  # Success
+
+
+cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset, object file_obj=None) except *:
     """
-    Runs the parsing of the file by readstat library
+    Runs the parsing of the file with the readstat library.
+    Supports both file paths and file-like objects.
+
+    If file_obj is provided, custom I/O handlers are used to stream from it.
+    Otherwise, standard file path parsing is used.
     """
     cdef readstat_parser_t *parser
@@ -889,71 +935,97 @@ cdef void run_readstat_parser(char * filename, data_container data, py_file_exte
     cdef readstat_value_handler value_handler
     cdef readstat_value_label_handler value_label_handler
     cdef readstat_note_handler note_handler
     cdef void *ctx
-    cdef str err_message
     cdef PyObject *pyerr
     cdef bint metaonly
-    cdef char *err_readstat
-    cdef bytes encoding_byte
+    cdef bytes encoding_bytes
+    cdef bint is_file_object = file_obj is not None
 
     metaonly = data.metaonly
     ctx = data
 
-    #readstat_error_t error = READSTAT_OK;
+    # Initialize parser
    parser = readstat_parser_init()
+
+    # Set up I/O handlers based on the input type
+    if is_file_object:
+        # Custom I/O handlers for file objects
+        readstat_set_open_handler(parser, pyobject_open_handler)
+        readstat_set_read_handler(parser, pyobject_read_handler)
+        readstat_set_seek_handler(parser, pyobject_seek_handler)
+        readstat_set_close_handler(parser, pyobject_close_handler)
+        readstat_set_io_ctx(parser, <void *>file_obj)
+    elif os.name == "nt":
+        # On Windows, a custom open handler is needed to deal with international characters in the path
+        open_handler = handle_open
+        readstat_set_open_handler(parser, open_handler)
+
+    # Set data handlers (common to both paths and file objects)
     metadata_handler = handle_metadata
     variable_handler = handle_variable
     value_handler = handle_value
     value_label_handler = handle_value_label
     note_handler = handle_note
-
     check_exit_status(readstat_set_metadata_handler(parser, metadata_handler))
     check_exit_status(readstat_set_variable_handler(parser, variable_handler))
     check_exit_status(readstat_set_value_label_handler(parser, value_label_handler))
     check_exit_status(readstat_set_note_handler(parser, note_handler))
 
-    # on windows we need a custom open handler in order to deal with internation characters in the path.
-    if os.name == "nt":
-        open_handler = handle_open
-        readstat_set_open_handler(parser, open_handler)
-
     if not metaonly:
         check_exit_status(readstat_set_value_handler(parser, value_handler))
 
-    # if the user set the encoding manually
+    # Set the encoding if the user specified one
     if data.user_encoding:
         encoding_bytes = data.user_encoding.encode("utf-8")
         readstat_set_file_character_encoding(parser, encoding_bytes)
 
+    # Set row limit and offset if specified
     if row_limit:
         check_exit_status(readstat_set_row_limit(parser, row_limit))
     if row_offset:
         check_exit_status(readstat_set_row_offset(parser, row_offset))
 
-    # parse!
+    # Parse with the appropriate path (an empty dummy path is passed for file objects,
+    # since the custom open handler ignores it)
     if file_extension == FILE_EXT_SAV:
-        error = readstat_parse_sav(parser, filename, ctx);
+        if is_file_object:
+            error = readstat_parse_sav(parser, b"", ctx)
+        else:
+            error = readstat_parse_sav(parser, filename, ctx)
     elif file_extension == FILE_EXT_SAS7BDAT:
-        error = readstat_parse_sas7bdat(parser, filename, ctx);
+        if is_file_object:
+            error = readstat_parse_sas7bdat(parser, b"", ctx)
+        else:
+            error = readstat_parse_sas7bdat(parser, filename, ctx)
     elif file_extension == FILE_EXT_DTA:
-        error = readstat_parse_dta(parser, filename, ctx);
+        if is_file_object:
+            error = readstat_parse_dta(parser, b"", ctx)
+        else:
+            error = readstat_parse_dta(parser, filename, ctx)
     elif file_extension == FILE_EXT_XPORT:
-        error = readstat_parse_xport(parser, filename, ctx);
+        if is_file_object:
+            error = readstat_parse_xport(parser, b"", ctx)
+        else:
+            error = readstat_parse_xport(parser, filename, ctx)
     elif file_extension == FILE_EXT_POR:
-        error = readstat_parse_por(parser, filename, ctx);
+        if is_file_object:
+            error = readstat_parse_por(parser, b"", ctx)
+        else:
+            error = readstat_parse_por(parser, filename, ctx)
     elif file_extension == FILE_EXT_SAS7BCAT:
-        error = readstat_parse_sas7bcat(parser, filename, ctx);
-    #error = parse_func(parser, filename, ctx);
+        if is_file_object:
+            error = readstat_parse_sas7bcat(parser, b"", ctx)
+        else:
+            error = readstat_parse_sas7bcat(parser, filename, ctx)
+
     readstat_parser_free(parser)
-    # check if a python error ocurred, if yes, it will be printed by the interpreter,
-    # if not, make sure that the return from parse_func is OK, if not print
+
+    # If a Python error occurred it will be raised by the interpreter;
+    # otherwise make sure the readstat return code is OK
     pyerr = PyErr_Occurred()
     if pyerr == NULL:
         check_exit_status(error)
-    
+
 
 cdef object data_container_to_dict(data_container data):
     """
@@ -1172,27 +1244,34 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
     cdef object data_dict
     cdef object data_frame
 
-    if hasattr(os, 'fsencode'):
-        try:
-            filename_bytes = os.fsencode(filename_path)
-        except UnicodeError:
-            warnings.warn("file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
-            filename_bytes = os.fsdecode(filename_path).encode("utf-8", "surrogateescape")
-    else:
-        if type(filename_path) == str:
-            filename_bytes = filename_path.encode('utf-8')
-        elif type(filename_path) == bytes:
-            filename_bytes = filename_path
-        else:
-            raise PyreadstatError("path must be either str or bytes")
-        if type(filename_path) not in (str, bytes, unicode):
-            raise PyreadstatError("path must be str, bytes or unicode")
-            filename_bytes = filename_path.encode('utf-8')
-
-    filename_bytes = os.path.expanduser(filename_bytes)
-    if not os.path.isfile(filename_bytes):
-        raise PyreadstatError("File {0} does not exist!".format(filename_path))
+    # Check if filename_path is a file-like object
+    cdef bint is_file_object = False
+    if hasattr(filename_path, 'read') and hasattr(filename_path, 'seek'):
+        is_file_object = True
+        # Skip path encoding for file objects; they are handled separately below
+
+    # Path-based file handling
+    if not is_file_object:
+        if hasattr(os, 'fsencode'):
+            try:
+                filename_bytes = os.fsencode(filename_path)
+            except UnicodeError:
+                warnings.warn("file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
+                filename_bytes = os.fsdecode(filename_path).encode("utf-8", "surrogateescape")
+        else:
+            if type(filename_path) == str:
+                filename_bytes = filename_path.encode('utf-8')
+            elif type(filename_path) == bytes:
+                filename_bytes = filename_path
+            else:
+                raise PyreadstatError("path must be either str or bytes")
+        if type(filename_path) not in (str, bytes, unicode):
+            raise PyreadstatError("path must be str, bytes or unicode")
+            filename_bytes = filename_path.encode('utf-8')
+
+        filename_bytes = os.path.expanduser(filename_bytes)
+        if not os.path.isfile(filename_bytes):
+            raise PyreadstatError("File {0} does not exist!".format(filename_path))
 
     if output_format is None:
         output_format = 'pandas'
@@ -1243,7 +1322,9 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
     stata_all_formats = stata_date_formats + stata_datetime_formats + stata_time_formats
     spss_all_formats = spss_date_formats + spss_datetime_formats + spss_time_formats
 
-    filename = filename_bytes
+    # Only set filename for path-based reads
+    if not is_file_object:
+        filename = filename_bytes
 
     data = data_container()
     ctx = data
@@ -1278,8 +1359,12 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
     data.usernan = usernan
     data.no_datetime_conversion = no_datetime_conversion
 
-    # go!
-    run_readstat_parser(filename, data, file_extension, row_limit, row_offset)
+    # Parse the file (handles both paths and file objects)
+    if is_file_object:
+        run_readstat_parser(b"", data, file_extension, row_limit, row_offset, file_obj=filename_path)
+    else:
+        run_readstat_parser(filename, data, file_extension, row_limit, row_offset)
+
     data_dict = data_container_to_dict(data)
     if output_format == 'dict':
         data_frame = data_dict
diff --git a/pyreadstat/pyreadstat.pyx b/pyreadstat/pyreadstat.pyx
index 5f51d51..2d9009b 100644
--- a/pyreadstat/pyreadstat.pyx
+++ b/pyreadstat/pyreadstat.pyx
@@ -34,6 +34,32 @@ from pyfunctions import set_value_labels, set_catalog_to_sas
 
 # Parsing functions
 
+# Helper function for file-object handling
+def _read_fileobj(file_obj, file_format, file_extension, encoding, metadataonly,
+                  dates_as_pandas_datetime, usecols, user_missing,
+                  disable_datetime_conversion, row_limit, row_offset,
+                  output_format, extra_datetime_formats, extra_date_formats,
+                  extra_time_formats):
+    """
+    Read from a file-like object instead of a path.
+    Used by the read_* entry points when they are given a file handle.
+    """
+    # Convert Python booleans to C bint (same as the path-based code)
+    cdef bint metaonly_c = 1 if metadataonly else 0
+    cdef bint dates_as_pandas = 1 if dates_as_pandas_datetime else 0
+    cdef bint usernan = 1 if user_missing else 0
+    cdef bint no_datetime_conversion = 1 if disable_datetime_conversion else 0
+
+    # Delegate to run_conversion, passing the file object through as filename_path
+    data_frame, metadata = run_conversion(
+        file_obj, file_format, file_extension, encoding, metaonly_c,
+        dates_as_pandas, usecols, usernan, no_datetime_conversion,
+        row_limit, row_offset,
+        output_format, extra_datetime_formats, extra_date_formats, extra_time_formats
+    )
+
+    return data_frame, metadata
+
 
 def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=False, catalog_file=None,
                   formats_as_category=True, formats_as_ordered_category=False, str encoding=None, list usecols=None,
                   user_missing=False, disable_datetime_conversion=False, int row_limit=0, int row_offset=0, str output_format=None,
@@ -370,6 +396,19 @@ def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False,
         object with metadata. Look at the documentation for more information.
     """
 
+    # Check if filename_path is a file-like object
+    if hasattr(filename_path, 'read') and hasattr(filename_path, 'seek'):
+        # Route to the file-object handler
+        return _read_fileobj(
+            filename_path,
+            _readstat_parser.FILE_FORMAT_SPSS,
+            _readstat_parser.FILE_EXT_SAV,
+            encoding, metadataonly, dates_as_pandas_datetime,
+            usecols, user_missing, disable_datetime_conversion,
+            row_limit, row_offset, output_format,
+            extra_datetime_formats, extra_date_formats, extra_time_formats
+        )
+
     cdef bint metaonly = 0
     if metadataonly:
         metaonly = 1
diff --git a/pyreadstat/readstat_api.pxd b/pyreadstat/readstat_api.pxd
index 9aefbc7..1995328 100644
--- a/pyreadstat/readstat_api.pxd
+++ b/pyreadstat/readstat_api.pxd
@@ -138,7 +138,11 @@ cdef extern from "readstat.h":
         READSTAT_COMPRESS_NONE,
         READSTAT_COMPRESS_ROWS,
         READSTAT_COMPRESS_BINARY
-
+
+    ctypedef enum readstat_io_flags_t:
+        READSTAT_SEEK_SET,
+        READSTAT_SEEK_CUR,
+        READSTAT_SEEK_END
 
     ctypedef off_t readstat_off_t
 
@@ -146,7 +150,9 @@ cdef extern from "readstat.h":
     cdef void readstat_parser_free(readstat_parser_t *parser)
 
     ctypedef int (*readstat_open_handler)(const char *path, void *io_ctx);
+    ctypedef int (*readstat_close_handler)(void *io_ctx);
     ctypedef readstat_off_t (*readstat_seek_handler)(readstat_off_t offset, readstat_io_flags_t whence, void *io_ctx);
+    ctypedef ssize_t (*readstat_read_handler)(void *buf, size_t nbyte, void *io_ctx);
     ctypedef int (*readstat_metadata_handler)(readstat_metadata_t *metadata, void *ctx);
     ctypedef int (*readstat_variable_handler)(int index, readstat_variable_t *variable, char *val_labels, void *ctx);
     ctypedef int (*readstat_value_handler)(int obs_index, readstat_variable_t *variable, readstat_value_t value, void *ctx);
@@ -154,7 +160,10 @@ cdef extern from "readstat.h":
     ctypedef int (*readstat_note_handler)(int note_index, const char *note, void *ctx);
 
     cdef readstat_error_t readstat_set_open_handler(readstat_parser_t *parser, readstat_open_handler open_handler);
+    cdef readstat_error_t readstat_set_close_handler(readstat_parser_t *parser, readstat_close_handler close_handler);
     cdef readstat_error_t readstat_set_seek_handler(readstat_parser_t *parser, readstat_seek_handler seek_handler);
+    cdef readstat_error_t readstat_set_read_handler(readstat_parser_t *parser, readstat_read_handler read_handler);
readstat_set_io_ctx(readstat_parser_t *parser, void *io_ctx); cdef readstat_error_t readstat_set_metadata_handler(readstat_parser_t *parser, readstat_metadata_handler metadata_handler); cdef readstat_error_t readstat_set_note_handler(readstat_parser_t *parser, readstat_note_handler note_handler); cdef readstat_error_t readstat_set_variable_handler(readstat_parser_t *parser, readstat_variable_handler variable_handler) diff --git a/tests/test_file_handle.py b/tests/test_file_handle.py new file mode 100644 index 0000000..444d4a8 --- /dev/null +++ b/tests/test_file_handle.py @@ -0,0 +1,54 @@ +""" +Test file handle support + +Tests reading statistical data files from file-like objects (e.g., zip archives) +without extracting them to disk. +""" + +import os +import zipfile +import tempfile +import pyreadstat + + +script_folder = os.path.dirname(os.path.realpath(__file__)) +parent_folder = os.path.split(script_folder)[0] +data_folder = os.path.join(parent_folder, "test_data", "multiple_response") + + +def test_read_sav_from_zip_file_handle(): + """ + Test reading SAV file directly from zip archive without extraction. + + This tests the main use case: reading large files from zip archives + without needing to extract them to temporary disk storage. + """ + test_file = os.path.join(data_folder, "simple_alltypes.sav") + + with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: + # Create zip archive with test file + with zipfile.ZipFile(tmp.name, "w", zipfile.ZIP_DEFLATED) as zf: + zf.write(test_file, "simple_alltypes.sav") + + # Read from zip without extraction + with zipfile.ZipFile(tmp.name, "r") as zf: + with zf.open("simple_alltypes.sav", "r") as file_handle: + df, meta = pyreadstat.read_sav(file_handle) + + expected_columns = [ + "x", + "y", + "z", + "str", + "bool1", + "bool2", + "bool3", + "ca_subvar_1", + "ca_subvar_2", + "ca_subvar_3", + "date", + "quarter", + ] + + assert len(df) == 6 + assert list(df.columns) == expected_columns