Skip to content

Commit bc17fd2

Browse files
committed
Implement file handlers processing
1 parent ac40cdd commit bc17fd2

File tree

3 files changed

+181
-48
lines changed

3 files changed

+181
-48
lines changed

pyreadstat/_readstat_parser.pyx

Lines changed: 132 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ from cpython.datetime cimport import_datetime, timedelta_new, datetime_new, tota
2020
from cpython.exc cimport PyErr_Occurred
2121
from cpython.object cimport PyObject
2222
from libc.math cimport floor #NAN,
23+
from libc.string cimport memcpy
2324

2425
from collections import OrderedDict
2526
import datetime
@@ -877,9 +878,54 @@ cdef void check_exit_status(readstat_error_t retcode) except *:
877878
raise ReadstatError(err_message)
878879

879880

880-
cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset) except *:
881+
# File object I/O handlers - bridge Python file object methods to C callbacks
882+
cdef int pyobject_open_handler(const char *path, void *io_ctx) noexcept:
    """
    Open callback used when the input is an already-open Python file object.

    Nothing is opened here: both ``path`` and ``io_ctx`` are ignored and the
    function always returns 0.
    """
    return 0
885+
886+
cdef ssize_t pyobject_read_handler(void *buf, size_t nbyte, void *io_ctx) noexcept:
    """
    Read callback that fills a C buffer from a Python file object.

    Parameters
    ----------
    buf : destination buffer; at most ``nbyte`` bytes are written to it.
    nbyte : number of bytes requested.
    io_ctx : the Python file object, passed through as an opaque pointer.

    Returns
    -------
    The number of bytes copied into ``buf`` (0 when ``read`` returns an empty
    bytes-like object), or -1 on any failure:
      * ``read`` raised an exception,
      * ``read`` returned something that is not bytes-like (e.g. ``str`` from
        a file opened in text mode, or ``None``),
      * ``read`` returned more data than was requested.
    """
    cdef object file_obj = <object>io_ctx
    cdef size_t data_len
    # [::1] demands a C-contiguous buffer, which is what memcpy needs; a
    # strided or non-byte buffer raises on assignment and is reported as -1.
    cdef const unsigned char[::1] data_view

    try:
        data = file_obj.read(nbyte)
        if not isinstance(data, (bytes, bytearray, memoryview)):
            # Not binary data: report an error rather than pretending EOF.
            return -1
        data_view = data
        data_len = data_view.shape[0]
        if data_len > nbyte:
            # Copying would overflow buf, and truncating would silently drop
            # bytes the stream has already consumed, so fail instead.
            return -1
        if data_len > 0:
            memcpy(buf, &data_view[0], data_len)
        return data_len
    except:
        # This is a noexcept C callback: no exception may escape, so every
        # failure is reported through the -1 return value.
        return -1
903+
904+
cdef readstat_off_t pyobject_seek_handler(readstat_off_t offset, readstat_io_flags_t whence, void *io_ctx) noexcept:
    """
    Seek callback that forwards to the ``seek`` method of a Python file object.

    ``whence`` is translated to the integer passed as the second argument of
    ``seek``: READSTAT_SEEK_CUR becomes 1, READSTAT_SEEK_END becomes 2 and any
    other value becomes 0.

    Returns the value produced by ``seek`` converted to ``readstat_off_t``,
    or -1 if the call or the conversion raises.
    """
    cdef object stream = <object>io_ctx
    cdef readstat_off_t new_position
    cdef int python_whence

    if whence == READSTAT_SEEK_END:
        python_whence = 2
    elif whence == READSTAT_SEEK_CUR:
        python_whence = 1
    else:
        python_whence = 0

    try:
        # The conversion to readstat_off_t happens on assignment, so it is
        # covered by the same except clause as the call itself.
        new_position = stream.seek(offset, python_whence)
    except:
        return -1
    return new_position
916+
917+
cdef int pyobject_close_handler(void *io_ctx) noexcept:
    """
    Close callback used when the input is a Python file object.

    The file object is deliberately left open so that whoever supplied it
    remains responsible for closing it; the function always returns 0.
    """
    return 0
920+
921+
922+
cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset, object file_obj=None) except *:
881923
"""
882-
Runs the parsing of the file by readstat library
924+
Runs the parsing of the file by readstat library.
925+
Supports both file paths and file-like objects.
926+
927+
If file_obj is provided, custom I/O handlers are used for streaming.
928+
Otherwise, standard file path parsing is used.
883929
"""
884930

885931
cdef readstat_parser_t *parser
@@ -889,71 +935,97 @@ cdef void run_readstat_parser(char * filename, data_container data, py_file_exte
889935
cdef readstat_value_handler value_handler
890936
cdef readstat_value_label_handler value_label_handler
891937
cdef readstat_note_handler note_handler
892-
893938
cdef void *ctx
894-
cdef str err_message
895939
cdef PyObject *pyerr
896940
cdef bint metaonly
897-
cdef char *err_readstat
898-
cdef bytes encoding_byte
941+
cdef bytes encoding_bytes
942+
cdef bint is_file_object = file_obj is not None
899943

900944
metaonly = data.metaonly
901945
ctx = <void *>data
902946

903-
#readstat_error_t error = READSTAT_OK;
947+
# Initialize parser
904948
parser = readstat_parser_init()
949+
950+
# Set up I/O handlers based on input type
951+
if is_file_object:
952+
# Custom I/O handlers for file objects
953+
readstat_set_open_handler(parser, pyobject_open_handler)
954+
readstat_set_read_handler(parser, pyobject_read_handler)
955+
readstat_set_seek_handler(parser, pyobject_seek_handler)
956+
readstat_set_close_handler(parser, pyobject_close_handler)
957+
readstat_set_io_ctx(parser, <void*>file_obj)
958+
elif os.name == "nt":
959+
# On Windows, use custom open handler for international characters
960+
open_handler = <readstat_open_handler> handle_open
961+
readstat_set_open_handler(parser, open_handler)
962+
963+
# Set data handlers (common for both paths and file objects)
905964
metadata_handler = <readstat_metadata_handler> handle_metadata
906965
variable_handler = <readstat_variable_handler> handle_variable
907966
value_handler = <readstat_value_handler> handle_value
908967
value_label_handler = <readstat_value_label_handler> handle_value_label
909968
note_handler = <readstat_note_handler> handle_note
910969

911-
912970
check_exit_status(readstat_set_metadata_handler(parser, metadata_handler))
913971
check_exit_status(readstat_set_variable_handler(parser, variable_handler))
914972
check_exit_status(readstat_set_value_label_handler(parser, value_label_handler))
915973
check_exit_status(readstat_set_note_handler(parser, note_handler))
916974

917-
# on windows we need a custom open handler in order to deal with internation characters in the path.
918-
if os.name == "nt":
919-
open_handler = <readstat_open_handler> handle_open
920-
readstat_set_open_handler(parser, open_handler)
921-
922975
if not metaonly:
923976
check_exit_status(readstat_set_value_handler(parser, value_handler))
924977

925-
# if the user set the encoding manually
978+
# Set encoding if specified
926979
if data.user_encoding:
927980
encoding_bytes = data.user_encoding.encode("utf-8")
928981
readstat_set_file_character_encoding(parser, <char *> encoding_bytes)
929982

983+
# Set row limits if specified
930984
if row_limit:
931985
check_exit_status(readstat_set_row_limit(parser, row_limit))
932986

933987
if row_offset:
934988
check_exit_status(readstat_set_row_offset(parser, row_offset))
935989

936-
# parse!
990+
# Parse with appropriate filename (empty string for file objects)
937991
if file_extension == FILE_EXT_SAV:
938-
error = readstat_parse_sav(parser, filename, ctx);
992+
if is_file_object:
993+
error = readstat_parse_sav(parser, "", ctx)
994+
else:
995+
error = readstat_parse_sav(parser, filename, ctx)
939996
elif file_extension == FILE_EXT_SAS7BDAT:
940-
error = readstat_parse_sas7bdat(parser, filename, ctx);
997+
if is_file_object:
998+
error = readstat_parse_sas7bdat(parser, "", ctx)
999+
else:
1000+
error = readstat_parse_sas7bdat(parser, filename, ctx)
9411001
elif file_extension == FILE_EXT_DTA:
942-
error = readstat_parse_dta(parser, filename, ctx);
1002+
if is_file_object:
1003+
error = readstat_parse_dta(parser, "", ctx)
1004+
else:
1005+
error = readstat_parse_dta(parser, filename, ctx)
9431006
elif file_extension == FILE_EXT_XPORT:
944-
error = readstat_parse_xport(parser, filename, ctx);
1007+
if is_file_object:
1008+
error = readstat_parse_xport(parser, "", ctx)
1009+
else:
1010+
error = readstat_parse_xport(parser, filename, ctx)
9451011
elif file_extension == FILE_EXT_POR:
946-
error = readstat_parse_por(parser, filename, ctx);
1012+
if is_file_object:
1013+
error = readstat_parse_por(parser, "", ctx)
1014+
else:
1015+
error = readstat_parse_por(parser, filename, ctx)
9471016
elif file_extension == FILE_EXT_SAS7BCAT:
948-
error = readstat_parse_sas7bcat(parser, filename, ctx);
949-
#error = parse_func(parser, filename, ctx);
1017+
if is_file_object:
1018+
error = readstat_parse_sas7bcat(parser, "", ctx)
1019+
else:
1020+
error = readstat_parse_sas7bcat(parser, filename, ctx)
1021+
9501022
readstat_parser_free(parser)
951-
# check if a python error ocurred, if yes, it will be printed by the interpreter,
952-
# if not, make sure that the return from parse_func is OK, if not print
1023+
1024+
# Check for errors
9531025
pyerr = PyErr_Occurred()
9541026
if <void *>pyerr == NULL:
9551027
check_exit_status(error)
956-
1028+
9571029

9581030
cdef object data_container_to_dict(data_container data):
9591031
"""
@@ -1172,27 +1244,34 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
11721244
cdef object data_dict
11731245
cdef object data_frame
11741246

1175-
if hasattr(os, 'fsencode'):
1176-
try:
1177-
filename_bytes = os.fsencode(filename_path)
1178-
except UnicodeError:
1179-
warnings.warn("file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
1180-
filename_bytes = os.fsdecode(filename_path).encode("utf-8", "surrogateescape")
1181-
else:
1182-
if type(filename_path) == str:
1183-
filename_bytes = filename_path.encode('utf-8')
1184-
elif type(filename_path) == bytes:
1185-
filename_bytes = filename_path
1247+
# Check if filename_path is a file-like object
1248+
cdef bint is_file_object = False
1249+
if hasattr(filename_path, 'read') and hasattr(filename_path, 'seek'):
1250+
is_file_object = True
1251+
# Skip path encoding for file objects - we'll handle them differently
1252+
1253+
# Path-based file handling
1254+
if not is_file_object:
1255+
if hasattr(os, 'fsencode'):
1256+
try:
1257+
filename_bytes = os.fsencode(filename_path)
1258+
except UnicodeError:
1259+
warnings.warn("file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
1260+
filename_bytes = os.fsdecode(filename_path).encode("utf-8", "surrogateescape")
11861261
else:
1187-
raise PyreadstatError("path must be either str or bytes")
1188-
if type(filename_path) not in (str, bytes, unicode):
1189-
raise PyreadstatError("path must be str, bytes or unicode")
1190-
filename_bytes = filename_path.encode('utf-8')
1191-
1262+
if type(filename_path) == str:
1263+
filename_bytes = filename_path.encode('utf-8')
1264+
elif type(filename_path) == bytes:
1265+
filename_bytes = filename_path
1266+
else:
1267+
raise PyreadstatError("path must be either str or bytes")
1268+
if type(filename_path) not in (str, bytes, unicode):
1269+
raise PyreadstatError("path must be str, bytes or unicode")
1270+
filename_bytes = filename_path.encode('utf-8')
11921271

1193-
filename_bytes = os.path.expanduser(filename_bytes)
1194-
if not os.path.isfile(filename_bytes):
1195-
raise PyreadstatError("File {0} does not exist!".format(filename_path))
1272+
filename_bytes = os.path.expanduser(filename_bytes)
1273+
if not os.path.isfile(filename_bytes):
1274+
raise PyreadstatError("File {0} does not exist!".format(filename_path))
11961275

11971276
if output_format is None:
11981277
output_format = 'pandas'
@@ -1243,7 +1322,9 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
12431322
stata_all_formats = stata_date_formats + stata_datetime_formats + stata_time_formats
12441323
spss_all_formats = spss_date_formats + spss_datetime_formats + spss_time_formats
12451324

1246-
filename = <char *> filename_bytes
1325+
# Only set filename for path-based reads
1326+
if not is_file_object:
1327+
filename = <char *> filename_bytes
12471328

12481329
data = data_container()
12491330
ctx = <void *>data
@@ -1278,8 +1359,12 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
12781359
data.usernan = usernan
12791360
data.no_datetime_conversion = no_datetime_conversion
12801361

1281-
# go!
1282-
run_readstat_parser(filename, data, file_extension, row_limit, row_offset)
1362+
# Parse file (handles both paths and file objects)
1363+
if is_file_object:
1364+
run_readstat_parser("", data, file_extension, row_limit, row_offset, file_obj=filename_path)
1365+
else:
1366+
run_readstat_parser(filename, data, file_extension, row_limit, row_offset)
1367+
12831368
data_dict = data_container_to_dict(data)
12841369
if output_format == 'dict':
12851370
data_frame = data_dict

pyreadstat/pyreadstat.pyx

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,32 @@ from pyfunctions import set_value_labels, set_catalog_to_sas
3434

3535
# Parsing functions
3636

37+
# Helper function for file object handling
38+
def _read_fileobj(file_obj, file_format, file_extension, encoding, metadataonly,
                  dates_as_pandas_datetime, usecols, user_missing,
                  disable_datetime_conversion, row_limit, row_offset,
                  output_format, extra_datetime_formats, extra_date_formats,
                  extra_time_formats):
    """
    Parse data from a file-like object rather than from a path on disk.

    The truthiness of the boolean options is turned into C ``bint`` flags and
    everything is forwarded to ``run_conversion``, with ``file_obj`` passed in
    the position normally taken by the file path. ``row_limit`` and
    ``row_offset`` are cast to C ``long``.

    Returns the ``(data_frame, metadata)`` pair produced by ``run_conversion``.
    """
    cdef bint c_metaonly = 0
    cdef bint c_dates_as_pandas = 0
    cdef bint c_usernan = 0
    cdef bint c_no_datetime_conversion = 0

    if metadataonly:
        c_metaonly = 1
    if dates_as_pandas_datetime:
        c_dates_as_pandas = 1
    if user_missing:
        c_usernan = 1
    if disable_datetime_conversion:
        c_no_datetime_conversion = 1

    frame, meta = run_conversion(
        file_obj, file_format, file_extension, encoding, c_metaonly,
        c_dates_as_pandas, usecols, c_usernan, c_no_datetime_conversion,
        <long>row_limit, <long>row_offset,
        output_format, extra_datetime_formats, extra_date_formats, extra_time_formats
    )
    return frame, meta
62+
3763
def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=False, catalog_file=None,
3864
formats_as_category=True, formats_as_ordered_category=False, str encoding=None, list usecols=None, user_missing=False,
3965
disable_datetime_conversion=False, int row_limit=0, int row_offset=0, str output_format=None,
@@ -370,6 +396,19 @@ def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False,
370396
object with metadata. Look at the documentation for more information.
371397
"""
372398

399+
# Check if filename_path is a file-like object
400+
if hasattr(filename_path, 'read') and hasattr(filename_path, 'seek'):
401+
# Route to file object handler
402+
return _read_fileobj(
403+
filename_path,
404+
_readstat_parser.FILE_FORMAT_SPSS,
405+
_readstat_parser.FILE_EXT_SAV,
406+
encoding, metadataonly, dates_as_pandas_datetime,
407+
usecols, user_missing, disable_datetime_conversion,
408+
row_limit, row_offset, output_format,
409+
extra_datetime_formats, extra_date_formats, extra_time_formats
410+
)
411+
373412
cdef bint metaonly = 0
374413
if metadataonly:
375414
metaonly = 1

pyreadstat/readstat_api.pxd

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,23 +138,32 @@ cdef extern from "readstat.h":
138138
READSTAT_COMPRESS_NONE,
139139
READSTAT_COMPRESS_ROWS,
140140
READSTAT_COMPRESS_BINARY
141-
141+
142+
ctypedef enum readstat_io_flags_t:
143+
READSTAT_SEEK_SET,
144+
READSTAT_SEEK_CUR,
145+
READSTAT_SEEK_END
142146

143147
ctypedef off_t readstat_off_t
144148

145149
cdef readstat_parser_t *readstat_parser_init()
146150
cdef void readstat_parser_free(readstat_parser_t *parser)
147151

148152
ctypedef int (*readstat_open_handler)(const char *path, void *io_ctx);
153+
ctypedef int (*readstat_close_handler)(void *io_ctx);
149154
ctypedef readstat_off_t (*readstat_seek_handler)(readstat_off_t offset, readstat_io_flags_t whence, void *io_ctx);
155+
ctypedef ssize_t (*readstat_read_handler)(void *buf, size_t nbyte, void *io_ctx);
150156
ctypedef int (*readstat_metadata_handler)(readstat_metadata_t *metadata, void *ctx);
151157
ctypedef int (*readstat_variable_handler)(int index, readstat_variable_t *variable, char *val_labels, void *ctx);
152158
ctypedef int (*readstat_value_handler)(int obs_index, readstat_variable_t *variable, readstat_value_t value, void *ctx);
153159
ctypedef int (*readstat_value_label_handler)(const char *val_labels, readstat_value_t value, const char *label, void *ctx);
154160
ctypedef int (*readstat_note_handler)(int note_index, const char *note, void *ctx);
155161

156162
cdef readstat_error_t readstat_set_open_handler(readstat_parser_t *parser, readstat_open_handler open_handler);
163+
cdef readstat_error_t readstat_set_close_handler(readstat_parser_t *parser, readstat_close_handler close_handler);
157164
cdef readstat_error_t readstat_set_seek_handler(readstat_parser_t *parser, readstat_seek_handler seek_handler);
165+
cdef readstat_error_t readstat_set_read_handler(readstat_parser_t *parser, readstat_read_handler read_handler);
166+
cdef readstat_error_t readstat_set_io_ctx(readstat_parser_t *parser, void *io_ctx);
158167
cdef readstat_error_t readstat_set_metadata_handler(readstat_parser_t *parser, readstat_metadata_handler metadata_handler);
159168
cdef readstat_error_t readstat_set_note_handler(readstat_parser_t *parser, readstat_note_handler note_handler);
160169
cdef readstat_error_t readstat_set_variable_handler(readstat_parser_t *parser, readstat_variable_handler variable_handler)

0 commit comments

Comments
 (0)