@@ -20,6 +20,7 @@ from cpython.datetime cimport import_datetime, timedelta_new, datetime_new, tota
2020from cpython.exc cimport PyErr_Occurred
2121from cpython.object cimport PyObject
2222from libc.math cimport floor # NAN,
23+ from libc.string cimport memcpy
2324
2425from collections import OrderedDict
2526import datetime
@@ -877,9 +878,54 @@ cdef void check_exit_status(readstat_error_t retcode) except *:
877878 raise ReadstatError(err_message)
878879
879880
880- cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset) except * :
881+ # File object I/O handlers - bridge Python file object methods to C callbacks
cdef int pyobject_open_handler(const char *path, void *io_ctx) noexcept:
    """
    Open callback used when parsing from a Python file object.

    Nothing has to be opened: the caller hands over an object that is already
    open, so both ``path`` and ``io_ctx`` are ignored here.

    Returns 0, which this file treats as success for I/O handlers.
    """
    # Intentionally a no-op; the stream is already usable.
    return 0
885+
cdef ssize_t pyobject_read_handler(void *buf, size_t nbyte, void *io_ctx) noexcept:
    """
    Read callback: copy up to ``nbyte`` bytes from a Python file object into ``buf``.

    Parameters
    ----------
    buf : destination C buffer; this function never writes more than
        ``nbyte`` bytes into it.
    nbyte : number of bytes requested.
    io_ctx : pointer to the Python file object, cast back to ``object`` here.

    Returns
    -------
    The number of bytes copied into ``buf`` (less than ``nbyte`` only when the
    stream is exhausted, 0 at end of stream), or -1 on any failure.

    Notes
    -----
    * ``read()`` is called repeatedly until the request is satisfied or the
      stream returns an empty chunk, so short reads from raw/unbuffered
      streams are not reported as truncated data.
    * Any bytes-like result (``bytes``, ``bytearray``, ...) is accepted. A
      ``str`` result (file opened in text mode) or ``None`` is an error
      rather than a silent end-of-file.
    * A chunk longer than requested is truncated so ``buf`` cannot overflow.
    * The function is ``noexcept``: no Python exception may leave it, so
      every exception is caught and reported as -1.
    """
    cdef object file_obj = <object>io_ctx
    cdef object chunk
    cdef const unsigned char[:] chunk_view
    cdef size_t total = 0
    cdef size_t remaining
    cdef size_t chunk_len

    try:
        while total < nbyte:
            remaining = nbyte - total
            chunk = file_obj.read(remaining)
            if chunk is None:
                # No data available (e.g. non-blocking stream): not a valid EOF.
                return -1
            # Raises for objects without a byte buffer (e.g. str); caught below.
            chunk_view = chunk
            chunk_len = chunk_view.shape[0]
            if chunk_len == 0:
                # Empty chunk: end of stream.
                break
            if chunk_len > remaining:
                # Never trust the file object to honour the requested size.
                chunk_len = remaining
            memcpy((<char *>buf) + total, &chunk_view[0], chunk_len)
            total += chunk_len
        return <ssize_t>total
    except BaseException:
        # Must catch everything: the callback cannot propagate exceptions.
        return -1
903+
cdef readstat_off_t pyobject_seek_handler(readstat_off_t offset, readstat_io_flags_t whence, void *io_ctx) noexcept:
    """
    Seek callback: reposition a Python file object.

    Parameters
    ----------
    offset : offset passed through unchanged to ``file_obj.seek``.
    whence : READSTAT_SEEK_CUR and READSTAT_SEEK_END map to Python whence
        values 1 and 2; any other value is treated as 0 (seek from start).
    io_ctx : pointer to the Python file object, cast back to ``object`` here.

    Returns
    -------
    The position reported by the file object after seeking, or -1 on failure.

    Notes
    -----
    * If ``seek()`` returns ``None`` the position is obtained from ``tell()``
      instead, so file-like objects that do not return the new offset are
      still usable.
    * The function is ``noexcept``: no Python exception may leave it, so
      every exception is caught and reported as -1.
    """
    cdef object file_obj = <object>io_ctx
    cdef object new_pos
    cdef int py_whence

    if whence == READSTAT_SEEK_CUR:
        py_whence = 1  # relative to current position
    elif whence == READSTAT_SEEK_END:
        py_whence = 2  # relative to end of stream
    else:
        py_whence = 0  # relative to start of stream

    try:
        new_pos = file_obj.seek(offset, py_whence)
        if new_pos is None:
            # seek() succeeded but did not report the new offset.
            new_pos = file_obj.tell()
        return new_pos
    except BaseException:
        # Must catch everything: the callback cannot propagate exceptions.
        return -1
916+
cdef int pyobject_close_handler(void *io_ctx) noexcept:
    """
    Close callback used when parsing from a Python file object.

    Deliberately leaves the file object open: its lifetime is managed by the
    user who supplied it. ``io_ctx`` is ignored.

    Returns 0, which this file treats as success for I/O handlers.
    """
    # Intentionally a no-op; closing is the caller's responsibility.
    return 0
920+
921+
922+ cdef void run_readstat_parser(char * filename, data_container data, py_file_extension file_extension, long row_limit, long row_offset, object file_obj = None ) except * :
881923 """
882- Runs the parsing of the file by readstat library
924+ Runs the parsing of the file by readstat library.
925+ Supports both file paths and file-like objects.
926+
927+ If file_obj is provided, custom I/O handlers are used for streaming.
928+ Otherwise, standard file path parsing is used.
883929 """
884930
885931 cdef readstat_parser_t * parser
@@ -889,71 +935,97 @@ cdef void run_readstat_parser(char * filename, data_container data, py_file_exte
889935 cdef readstat_value_handler value_handler
890936 cdef readstat_value_label_handler value_label_handler
891937 cdef readstat_note_handler note_handler
892-
893938 cdef void * ctx
894- cdef str err_message
895939 cdef PyObject * pyerr
896940 cdef bint metaonly
897- cdef char * err_readstat
898- cdef bytes encoding_byte
941+ cdef bytes encoding_bytes
942+ cdef bint is_file_object = file_obj is not None
899943
900944 metaonly = data.metaonly
901945 ctx = < void * > data
902946
903- # readstat_error_t error = READSTAT_OK;
947+ # Initialize parser
904948 parser = readstat_parser_init()
949+
950+ # Set up I/O handlers based on input type
951+ if is_file_object:
952+ # Custom I/O handlers for file objects
953+ readstat_set_open_handler(parser, pyobject_open_handler)
954+ readstat_set_read_handler(parser, pyobject_read_handler)
955+ readstat_set_seek_handler(parser, pyobject_seek_handler)
956+ readstat_set_close_handler(parser, pyobject_close_handler)
957+ readstat_set_io_ctx(parser, < void * > file_obj)
958+ elif os.name == " nt" :
959+ # On Windows, use custom open handler for international characters
960+ open_handler = < readstat_open_handler> handle_open
961+ readstat_set_open_handler(parser, open_handler)
962+
963+ # Set data handlers (common for both paths and file objects)
905964 metadata_handler = < readstat_metadata_handler> handle_metadata
906965 variable_handler = < readstat_variable_handler> handle_variable
907966 value_handler = < readstat_value_handler> handle_value
908967 value_label_handler = < readstat_value_label_handler> handle_value_label
909968 note_handler = < readstat_note_handler> handle_note
910969
911-
912970 check_exit_status(readstat_set_metadata_handler(parser, metadata_handler))
913971 check_exit_status(readstat_set_variable_handler(parser, variable_handler))
914972 check_exit_status(readstat_set_value_label_handler(parser, value_label_handler))
915973 check_exit_status(readstat_set_note_handler(parser, note_handler))
916974
917- # on windows we need a custom open handler in order to deal with internation characters in the path.
918- if os.name == " nt" :
919- open_handler = < readstat_open_handler> handle_open
920- readstat_set_open_handler(parser, open_handler)
921-
922975 if not metaonly:
923976 check_exit_status(readstat_set_value_handler(parser, value_handler))
924977
925- # if the user set the encoding manually
978+ # Set encoding if specified
926979 if data.user_encoding:
927980 encoding_bytes = data.user_encoding.encode(" utf-8" )
928981 readstat_set_file_character_encoding(parser, < char * > encoding_bytes)
929982
983+ # Set row limits if specified
930984 if row_limit:
931985 check_exit_status(readstat_set_row_limit(parser, row_limit))
932986
933987 if row_offset:
934988 check_exit_status(readstat_set_row_offset(parser, row_offset))
935989
936- # parse!
990+ # Parse with appropriate filename (empty string for file objects)
937991 if file_extension == FILE_EXT_SAV:
938- error = readstat_parse_sav(parser, filename, ctx);
992+ if is_file_object:
993+ error = readstat_parse_sav(parser, " " , ctx)
994+ else :
995+ error = readstat_parse_sav(parser, filename, ctx)
939996 elif file_extension == FILE_EXT_SAS7BDAT:
940- error = readstat_parse_sas7bdat(parser, filename, ctx);
997+ if is_file_object:
998+ error = readstat_parse_sas7bdat(parser, " " , ctx)
999+ else :
1000+ error = readstat_parse_sas7bdat(parser, filename, ctx)
9411001 elif file_extension == FILE_EXT_DTA:
942- error = readstat_parse_dta(parser, filename, ctx);
1002+ if is_file_object:
1003+ error = readstat_parse_dta(parser, " " , ctx)
1004+ else :
1005+ error = readstat_parse_dta(parser, filename, ctx)
9431006 elif file_extension == FILE_EXT_XPORT:
944- error = readstat_parse_xport(parser, filename, ctx);
1007+ if is_file_object:
1008+ error = readstat_parse_xport(parser, " " , ctx)
1009+ else :
1010+ error = readstat_parse_xport(parser, filename, ctx)
9451011 elif file_extension == FILE_EXT_POR:
946- error = readstat_parse_por(parser, filename, ctx);
1012+ if is_file_object:
1013+ error = readstat_parse_por(parser, " " , ctx)
1014+ else :
1015+ error = readstat_parse_por(parser, filename, ctx)
9471016 elif file_extension == FILE_EXT_SAS7BCAT:
948- error = readstat_parse_sas7bcat(parser, filename, ctx);
949- # error = parse_func(parser, filename, ctx);
1017+ if is_file_object:
1018+ error = readstat_parse_sas7bcat(parser, " " , ctx)
1019+ else :
1020+ error = readstat_parse_sas7bcat(parser, filename, ctx)
1021+
9501022 readstat_parser_free(parser)
951- # check if a python error ocurred, if yes, it will be printed by the interpreter,
952- # if not, make sure that the return from parse_func is OK, if not print
1023+
1024+ # Check for errors
9531025 pyerr = PyErr_Occurred()
9541026 if < void * > pyerr == NULL :
9551027 check_exit_status(error)
956-
1028+
9571029
9581030cdef object data_container_to_dict(data_container data):
9591031 """
@@ -1172,27 +1244,34 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
11721244 cdef object data_dict
11731245 cdef object data_frame
11741246
1175- if hasattr (os, ' fsencode' ):
1176- try :
1177- filename_bytes = os.fsencode(filename_path)
1178- except UnicodeError :
1179- warnings.warn(" file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
1180- filename_bytes = os.fsdecode(filename_path).encode(" utf-8" , " surrogateescape" )
1181- else :
1182- if type (filename_path) == str :
1183- filename_bytes = filename_path.encode(' utf-8' )
1184- elif type (filename_path) == bytes:
1185- filename_bytes = filename_path
1247+ # Check if filename_path is a file-like object
1248+ cdef bint is_file_object = False
1249+ if hasattr (filename_path, ' read' ) and hasattr (filename_path, ' seek' ):
1250+ is_file_object = True
1251+ # Skip path encoding for file objects - we'll handle them differently
1252+
1253+ # Path-based file handling
1254+ if not is_file_object:
1255+ if hasattr (os, ' fsencode' ):
1256+ try :
1257+ filename_bytes = os.fsencode(filename_path)
1258+ except UnicodeError :
1259+ warnings.warn(" file path could not be encoded with %s which is set as your system encoding, trying to encode it as utf-8. Please set your system encoding correctly." % sys.getfilesystemencoding())
1260+ filename_bytes = os.fsdecode(filename_path).encode(" utf-8" , " surrogateescape" )
11861261 else :
1187- raise PyreadstatError(" path must be either str or bytes" )
1188- if type (filename_path) not in (str , bytes, unicode ):
1189- raise PyreadstatError(" path must be str, bytes or unicode" )
1190- filename_bytes = filename_path.encode(' utf-8' )
1191-
1262+ if type (filename_path) == str :
1263+ filename_bytes = filename_path.encode(' utf-8' )
1264+ elif type (filename_path) == bytes:
1265+ filename_bytes = filename_path
1266+ else :
1267+ raise PyreadstatError(" path must be either str or bytes" )
1268+ if type (filename_path) not in (str , bytes, unicode ):
1269+ raise PyreadstatError(" path must be str, bytes or unicode" )
1270+ filename_bytes = filename_path.encode(' utf-8' )
11921271
1193- filename_bytes = os.path.expanduser(filename_bytes)
1194- if not os.path.isfile(filename_bytes):
1195- raise PyreadstatError(" File {0} does not exist!" .format(filename_path))
1272+ filename_bytes = os.path.expanduser(filename_bytes)
1273+ if not os.path.isfile(filename_bytes):
1274+ raise PyreadstatError(" File {0} does not exist!" .format(filename_path))
11961275
11971276 if output_format is None :
11981277 output_format = ' pandas'
@@ -1243,7 +1322,9 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
12431322 stata_all_formats = stata_date_formats + stata_datetime_formats + stata_time_formats
12441323 spss_all_formats = spss_date_formats + spss_datetime_formats + spss_time_formats
12451324
1246- filename = < char * > filename_bytes
1325+ # Only set filename for path-based reads
1326+ if not is_file_object:
1327+ filename = < char * > filename_bytes
12471328
12481329 data = data_container()
12491330 ctx = < void * > data
@@ -1278,8 +1359,12 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_
12781359 data.usernan = usernan
12791360 data.no_datetime_conversion = no_datetime_conversion
12801361
1281- # go!
1282- run_readstat_parser(filename, data, file_extension, row_limit, row_offset)
1362+ # Parse file (handles both paths and file objects)
1363+ if is_file_object:
1364+ run_readstat_parser(" " , data, file_extension, row_limit, row_offset, file_obj = filename_path)
1365+ else :
1366+ run_readstat_parser(filename, data, file_extension, row_limit, row_offset)
1367+
12831368 data_dict = data_container_to_dict(data)
12841369 if output_format == ' dict' :
12851370 data_frame = data_dict
0 commit comments