Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 132 additions & 47 deletions pyreadstat/_readstat_parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ from cpython.datetime cimport import_datetime, timedelta_new, datetime_new, tota
from cpython.exc cimport PyErr_Occurred
from cpython.object cimport PyObject
from libc.math cimport floor #NAN,
from libc.string cimport memcpy

from collections import OrderedDict
import datetime
Expand Down Expand Up @@ -877,9 +878,54 @@ cdef void check_exit_status(readstat_error_t retcode) except *:
raise ReadstatError(err_message)


cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset) except *:
# File object I/O handlers - bridge Python file object methods to C callbacks
cdef int pyobject_open_handler(const char *path, void *io_ctx) noexcept:
    """
    readstat "open" callback used when the input is a Python file object.

    The object handed in by the caller is already open, so there is nothing
    to do here: both arguments are ignored and 0 (success) is always returned.
    """
    return 0

cdef ssize_t pyobject_read_handler(void *buf, size_t nbyte, void *io_ctx) noexcept:
    """
    readstat "read" callback: copy up to ``nbyte`` bytes from a Python file
    object into ``buf``.

    Parameters
    ----------
    buf : destination buffer supplied by the caller; at most ``nbyte`` bytes
        are written to it.
    nbyte : number of bytes requested.
    io_ctx : the Python file object, passed as an opaque pointer.

    Returns
    -------
    The number of bytes copied into ``buf`` (less than ``nbyte`` only when the
    file object signals end of data by returning ``b""``), or -1 on error.
    Errors are: ``read`` raising, ``read`` returning something that is not
    ``bytes`` (e.g. ``str`` from a text-mode file, or ``None``), or ``read``
    returning more bytes than were asked for.
    """
    cdef object file_obj = <object>io_ctx
    cdef object chunk
    cdef const unsigned char[:] chunk_view
    cdef size_t chunk_len
    cdef size_t remaining
    cdef size_t total = 0

    try:
        # A single read() on a raw/unbuffered stream may legitimately return
        # fewer bytes than requested, so keep reading until the request is
        # satisfied or the stream reports end of data.
        while total < nbyte:
            remaining = nbyte - total
            chunk = file_obj.read(remaining)
            if not isinstance(chunk, bytes):
                # Not binary data: report an error rather than a fake EOF.
                return -1
            chunk_len = len(chunk)
            if chunk_len == 0:
                break  # end of data
            if chunk_len > remaining:
                # Never copy past the end of the caller's buffer.
                return -1
            chunk_view = chunk
            memcpy(<char *>buf + total, &chunk_view[0], chunk_len)
            total += chunk_len
        return <ssize_t>total
    except:
        # This function is noexcept, so nothing may propagate to the C caller;
        # every Python-level failure is mapped to the -1 error return.
        return -1

cdef readstat_off_t pyobject_seek_handler(readstat_off_t offset, readstat_io_flags_t whence, void *io_ctx) noexcept:
    """
    readstat "seek" callback: reposition the Python file object held in
    ``io_ctx``.

    ``whence`` is translated to the integer passed as the second argument of
    ``file_obj.seek``: READSTAT_SEEK_CUR -> 1, READSTAT_SEEK_END -> 2 and
    anything else -> 0. Returns whatever ``seek`` returns, or -1 if the call
    (or the conversion of its result) raises.
    """
    cdef object stream = <object>io_ctx
    cdef int origin

    if whence == READSTAT_SEEK_END:
        origin = 2
    elif whence == READSTAT_SEEK_CUR:
        origin = 1
    else:
        origin = 0  # SEEK_SET

    try:
        return stream.seek(offset, origin)
    except:
        # noexcept function: map any Python-level failure to the error return.
        return -1

cdef int pyobject_close_handler(void *io_ctx) noexcept:
    """
    readstat "close" callback used when the input is a Python file object.

    Deliberately leaves the object open, since its lifetime is managed by the
    user who passed it in; ``io_ctx`` is ignored and 0 (success) is always
    returned.
    """
    return 0


cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset, object file_obj=None) except *:
"""
Runs the parsing of the file by readstat library
Runs the parsing of the file by readstat library.
Supports both file paths and file-like objects.

If file_obj is provided, custom I/O handlers are used for streaming.
Otherwise, standard file path parsing is used.
"""

cdef readstat_parser_t *parser
Expand All @@ -889,71 +935,97 @@ cdef void run_readstat_parser(char * filename, data_container data, py_file_exte
cdef readstat_value_handler value_handler
cdef readstat_value_label_handler value_label_handler
cdef readstat_note_handler note_handler

cdef void *ctx
cdef str err_message
cdef PyObject *pyerr
cdef bint metaonly
cdef char *err_readstat
cdef bytes encoding_byte
cdef bytes encoding_bytes
cdef bint is_file_object = file_obj is not None

metaonly = data.metaonly
ctx = <void *>data

#readstat_error_t error = READSTAT_OK;
# Initialize parser
parser = readstat_parser_init()

# Set up I/O handlers based on input type
if is_file_object:
# Custom I/O handlers for file objects
readstat_set_open_handler(parser, pyobject_open_handler)
readstat_set_read_handler(parser, pyobject_read_handler)
readstat_set_seek_handler(parser, pyobject_seek_handler)
readstat_set_close_handler(parser, pyobject_close_handler)
readstat_set_io_ctx(parser, <void*>file_obj)
elif os.name == "nt":
# On Windows, use custom open handler for international characters
open_handler = <readstat_open_handler> handle_open
readstat_set_open_handler(parser, open_handler)

# Set data handlers (common for both paths and file objects)
metadata_handler = <readstat_metadata_handler> handle_metadata
variable_handler = <readstat_variable_handler> handle_variable
value_handler = <readstat_value_handler> handle_value
value_label_handler = <readstat_value_label_handler> handle_value_label
note_handler = <readstat_note_handler> handle_note


check_exit_status(readstat_set_metadata_handler(parser, metadata_handler))
check_exit_status(readstat_set_variable_handler(parser, variable_handler))
check_exit_status(readstat_set_value_label_handler(parser, value_label_handler))
check_exit_status(readstat_set_note_handler(parser, note_handler))

# On Windows we need a custom open handler in order to deal with international characters in the path.
if os.name == "nt":
open_handler = <readstat_open_handler> handle_open
readstat_set_open_handler(parser, open_handler)

if not metaonly:
check_exit_status(readstat_set_value_handler(parser, value_handler))

# if the user set the encoding manually
# Set encoding if specified
if data.user_encoding:
encoding_bytes = data.user_encoding.encode("utf-8")
readstat_set_file_character_encoding(parser, <char *> encoding_bytes)

# Set row limits if specified
if row_limit:
check_exit_status(readstat_set_row_limit(parser, row_limit))

if row_offset:
check_exit_status(readstat_set_row_offset(parser, row_offset))

# parse!
# Parse with appropriate filename (empty string for file objects)
if file_extension == FILE_EXT_SAV:
error = readstat_parse_sav(parser, filename, ctx);
if is_file_object:
error = readstat_parse_sav(parser, "", ctx)
else:
error = readstat_parse_sav(parser, filename, ctx)
elif file_extension == FILE_EXT_SAS7BDAT:
error = readstat_parse_sas7bdat(parser, filename, ctx);
if is_file_object:
error = readstat_parse_sas7bdat(parser, "", ctx)
else:
error = readstat_parse_sas7bdat(parser, filename, ctx)
elif file_extension == FILE_EXT_DTA:
error = readstat_parse_dta(parser, filename, ctx);
if is_file_object:
error = readstat_parse_dta(parser, "", ctx)
else:
error = readstat_parse_dta(parser, filename, ctx)
elif file_extension == FILE_EXT_XPORT:
error = readstat_parse_xport(parser, filename, ctx);
if is_file_object:
error = readstat_parse_xport(parser, "", ctx)
else:
error = readstat_parse_xport(parser, filename, ctx)
elif file_extension == FILE_EXT_POR:
error = readstat_parse_por(parser, filename, ctx);
if is_file_object:
error = readstat_parse_por(parser, "", ctx)
else:
error = readstat_parse_por(parser, filename, ctx)
elif file_extension == FILE_EXT_SAS7BCAT:
error = readstat_parse_sas7bcat(parser, filename, ctx);
#error = parse_func(parser, filename, ctx);
if is_file_object:
error = readstat_parse_sas7bcat(parser, "", ctx)
else:
error = readstat_parse_sas7bcat(parser, filename, ctx)

readstat_parser_free(parser)
# check if a python error occurred, if yes, it will be printed by the interpreter,
# if not, make sure that the return from parse_func is OK, if not print

# Check for errors
pyerr = PyErr_Occurred()
if <void *>pyerr == NULL:
check_exit_status(error)


cdef object data_container_to_dict(data_container data):
"""
Expand Down Expand Up @@ -1172,27 +1244,34 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
cdef object data_dict
cdef object data_frame

if hasattr(os, 'fsencode'):
try:
filename_bytes = os.fsencode(filename_path)
except UnicodeError:
warnings.warn("file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
filename_bytes = os.fsdecode(filename_path).encode("utf-8", "surrogateescape")
else:
if type(filename_path) == str:
filename_bytes = filename_path.encode('utf-8')
elif type(filename_path) == bytes:
filename_bytes = filename_path
# Check if filename_path is a file-like object
cdef bint is_file_object = False
if hasattr(filename_path, 'read') and hasattr(filename_path, 'seek'):
is_file_object = True
# Skip path encoding for file objects - we'll handle them differently

# Path-based file handling
if not is_file_object:
if hasattr(os, 'fsencode'):
try:
filename_bytes = os.fsencode(filename_path)
except UnicodeError:
warnings.warn("file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
filename_bytes = os.fsdecode(filename_path).encode("utf-8", "surrogateescape")
else:
raise PyreadstatError("path must be either str or bytes")
if type(filename_path) not in (str, bytes, unicode):
raise PyreadstatError("path must be str, bytes or unicode")
filename_bytes = filename_path.encode('utf-8')

if type(filename_path) == str:
filename_bytes = filename_path.encode('utf-8')
elif type(filename_path) == bytes:
filename_bytes = filename_path
else:
raise PyreadstatError("path must be either str or bytes")
if type(filename_path) not in (str, bytes, unicode):
raise PyreadstatError("path must be str, bytes or unicode")
filename_bytes = filename_path.encode('utf-8')

filename_bytes = os.path.expanduser(filename_bytes)
if not os.path.isfile(filename_bytes):
raise PyreadstatError("File {0} does not exist!".format(filename_path))
filename_bytes = os.path.expanduser(filename_bytes)
if not os.path.isfile(filename_bytes):
raise PyreadstatError("File {0} does not exist!".format(filename_path))

if output_format is None:
output_format = 'pandas'
Expand Down Expand Up @@ -1243,7 +1322,9 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
stata_all_formats = stata_date_formats + stata_datetime_formats + stata_time_formats
spss_all_formats = spss_date_formats + spss_datetime_formats + spss_time_formats

filename = <char *> filename_bytes
# Only set filename for path-based reads
if not is_file_object:
filename = <char *> filename_bytes

data = data_container()
ctx = <void *>data
Expand Down Expand Up @@ -1278,8 +1359,12 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
data.usernan = usernan
data.no_datetime_conversion = no_datetime_conversion

# go!
run_readstat_parser(filename, data, file_extension, row_limit, row_offset)
# Parse file (handles both paths and file objects)
if is_file_object:
run_readstat_parser("", data, file_extension, row_limit, row_offset, file_obj=filename_path)
else:
run_readstat_parser(filename, data, file_extension, row_limit, row_offset)

data_dict = data_container_to_dict(data)
if output_format == 'dict':
data_frame = data_dict
Expand Down
39 changes: 39 additions & 0 deletions pyreadstat/pyreadstat.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,32 @@ from pyfunctions import set_value_labels, set_catalog_to_sas

# Parsing functions

# Helper function for file object handling
def _read_fileobj(file_obj, file_format, file_extension, encoding, metadataonly,
                  dates_as_pandas_datetime, usecols, user_missing,
                  disable_datetime_conversion, row_limit, row_offset,
                  output_format, extra_datetime_formats, extra_date_formats,
                  extra_time_formats):
    """
    Read from a file-like object instead of a path.

    Converts the Python-level boolean options to C ``bint`` flags, casts
    ``row_limit`` and ``row_offset`` to ``long``, and forwards everything to
    ``run_conversion`` with ``file_obj`` in the position where a path would
    normally be given.

    Returns
    -------
    tuple
        The two values produced by ``run_conversion``: the data and the
        metadata object.
    """
    cdef bint metaonly_c = 0
    cdef bint dates_as_pandas = 0
    cdef bint usernan = 0
    cdef bint no_datetime_conversion = 0

    # Same truthiness-based flag conversion as the path-based readers.
    if metadataonly:
        metaonly_c = 1
    if dates_as_pandas_datetime:
        dates_as_pandas = 1
    if user_missing:
        usernan = 1
    if disable_datetime_conversion:
        no_datetime_conversion = 1

    frame, meta = run_conversion(
        file_obj,
        file_format,
        file_extension,
        encoding,
        metaonly_c,
        dates_as_pandas,
        usecols,
        usernan,
        no_datetime_conversion,
        <long>row_limit,
        <long>row_offset,
        output_format,
        extra_datetime_formats,
        extra_date_formats,
        extra_time_formats,
    )

    return frame, meta

def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=False, catalog_file=None,
formats_as_category=True, formats_as_ordered_category=False, str encoding=None, list usecols=None, user_missing=False,
disable_datetime_conversion=False, int row_limit=0, int row_offset=0, str output_format=None,
Expand Down Expand Up @@ -370,6 +396,19 @@ def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False,
object with metadata. Look at the documentation for more information.
"""

# Check if filename_path is a file-like object
if hasattr(filename_path, 'read') and hasattr(filename_path, 'seek'):
# Route to file object handler
return _read_fileobj(
filename_path,
_readstat_parser.FILE_FORMAT_SPSS,
_readstat_parser.FILE_EXT_SAV,
encoding, metadataonly, dates_as_pandas_datetime,
usecols, user_missing, disable_datetime_conversion,
row_limit, row_offset, output_format,
extra_datetime_formats, extra_date_formats, extra_time_formats
)

cdef bint metaonly = 0
if metadataonly:
metaonly = 1
Expand Down
11 changes: 10 additions & 1 deletion pyreadstat/readstat_api.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -138,23 +138,32 @@ cdef extern from "readstat.h":
READSTAT_COMPRESS_NONE,
READSTAT_COMPRESS_ROWS,
READSTAT_COMPRESS_BINARY


ctypedef enum readstat_io_flags_t:
READSTAT_SEEK_SET,
READSTAT_SEEK_CUR,
READSTAT_SEEK_END

ctypedef off_t readstat_off_t

cdef readstat_parser_t *readstat_parser_init()
cdef void readstat_parser_free(readstat_parser_t *parser)

ctypedef int (*readstat_open_handler)(const char *path, void *io_ctx);
ctypedef int (*readstat_close_handler)(void *io_ctx);
ctypedef readstat_off_t (*readstat_seek_handler)(readstat_off_t offset, readstat_io_flags_t whence, void *io_ctx);
ctypedef ssize_t (*readstat_read_handler)(void *buf, size_t nbyte, void *io_ctx);
ctypedef int (*readstat_metadata_handler)(readstat_metadata_t *metadata, void *ctx);
ctypedef int (*readstat_variable_handler)(int index, readstat_variable_t *variable, char *val_labels, void *ctx);
ctypedef int (*readstat_value_handler)(int obs_index, readstat_variable_t *variable, readstat_value_t value, void *ctx);
ctypedef int (*readstat_value_label_handler)(const char *val_labels, readstat_value_t value, const char *label, void *ctx);
ctypedef int (*readstat_note_handler)(int note_index, const char *note, void *ctx);

cdef readstat_error_t readstat_set_open_handler(readstat_parser_t *parser, readstat_open_handler open_handler);
cdef readstat_error_t readstat_set_close_handler(readstat_parser_t *parser, readstat_close_handler close_handler);
cdef readstat_error_t readstat_set_seek_handler(readstat_parser_t *parser, readstat_seek_handler seek_handler);
cdef readstat_error_t readstat_set_read_handler(readstat_parser_t *parser, readstat_read_handler read_handler);
cdef readstat_error_t readstat_set_io_ctx(readstat_parser_t *parser, void *io_ctx);
cdef readstat_error_t readstat_set_metadata_handler(readstat_parser_t *parser, readstat_metadata_handler metadata_handler);
cdef readstat_error_t readstat_set_note_handler(readstat_parser_t *parser, readstat_note_handler note_handler);
cdef readstat_error_t readstat_set_variable_handler(readstat_parser_t *parser, readstat_variable_handler variable_handler)
Expand Down
Loading