From 36c1c0c75a712496f5daa2b141994afc71557ed9 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Thu, 23 Oct 2025 12:39:33 -0600 Subject: [PATCH 1/7] Add support for scalar string and binary functions --- CHANGELOG.md | 11 + docs/source/snowpark/functions.rst | 10 + .../snowpark/_functions/scalar_functions.py | 773 ++++++++++++++++++ 3 files changed, 794 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43254fdc7b..c9fa1a8c0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,17 @@ - `st_geometryfromwkt` - `try_to_geography` - `try_to_geometry` + - String and binary functions: + - `hex_decode_string` + - `jarowinkler_similarity` + - `parse_url` + - `regexp_instr` + - `regexp_like` + - `regexp_substr` + - `regexp_substr_all` + - `rtrimmed_length` + - `space` + - `split_part` - Added a parameter to enable and disable automatic column name aliasing for `interval_day_time_from_parts` and `interval_year_month_from_parts` functions. #### Bug Fixes diff --git a/docs/source/snowpark/functions.rst b/docs/source/snowpark/functions.rst index 7f54ad61a4..195b4f58f3 100644 --- a/docs/source/snowpark/functions.rst +++ b/docs/source/snowpark/functions.rst @@ -241,6 +241,7 @@ Functions haversine hex hex_decode_binary + hex_decode_string hex_encode hour h3_cell_to_boundary @@ -306,6 +307,7 @@ Functions is_timestamp_ntz is_timestamp_tz is_varchar + jarowinkler_similarity json_extract_path_text kurtosis lag @@ -375,6 +377,7 @@ Functions pandas_udf pandas_udtf parse_json + parse_url parse_xml percent_rank percentile_approx @@ -395,8 +398,12 @@ Functions regr_avgy regr_count regr_intercept + regexp_instr + regexp_like regr_r2 regr_slope + regexp_substr + regexp_substr_all regr_sxx regr_sxy regr_syy @@ -408,6 +415,7 @@ Functions row_number rpad rtrim + rtrimmed_length second seq1 seq2 @@ -425,7 +433,9 @@ Functions snowflake_cortex_summarize sort_array soundex + space split + split_part sproc sql_expr sqrt diff --git 
a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index 8ff4eb8cc9..da551ec82f 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -3971,3 +3971,776 @@ def try_to_geometry( return builtin("try_to_geometry", _emit_ast=_emit_ast)(c, allow_invalid_col) else: return builtin("try_to_geometry", _emit_ast=_emit_ast)(c) + + +@publicapi +def hex_decode_string(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column: + """ + Decodes a hexadecimal-encoded string into its original string representation. + + Args: + input_expr (ColumnOrName): The column or string containing the hexadecimal-encoded string to decode. + + Returns: + Column: The decoded string. + + Examples:: + >>> from snowflake.snowpark.functions import col + >>> df = session.create_dataframe([["536E6F77666C616B65"], ["48454C4C4F"]], schema=["hex_string"]) + >>> df.select(hex_decode_string(col("hex_string")).alias("decoded")).collect() + [Row(DECODED='Snowflake'), Row(DECODED='HELLO')] + """ + c = _to_col_if_str(input_expr, "hex_decode_string") + return builtin("hex_decode_string", _emit_ast=_emit_ast)(c) + + +@publicapi +def jarowinkler_similarity( + string_expr1: ColumnOrName, string_expr2: ColumnOrName, _emit_ast: bool = True +) -> Column: + """ + Computes the Jaro-Winkler similarity between two strings. The Jaro-Winkler similarity + is a string metric measuring an edit distance between two sequences. It is a variant + of the Jaro distance metric designed to give more favorable ratings to strings with + common prefixes. + + Args: + string_expr1 (ColumnOrName): The first string expression to compare. + string_expr2 (ColumnOrName): The second string expression to compare. + + Returns: + Column: The Jaro-Winkler similarity score as an integer between 0 and 100. + + Examples:: + >>> df = session.create_dataframe([ + ... ("Snowflake", "Oracle"), + ... 
("Ich weiß nicht", "Ich wei? nicht"), + ... ("Gute nacht", "Ich weis nicht"), + ... ("święta", "swieta"), + ... ("", ""), + ... ("test", "test") + ... ], schema=["s", "t"]) + >>> df.select(jarowinkler_similarity(df["s"], df["t"]).alias("similarity")).collect() + [Row(SIMILARITY=61), Row(SIMILARITY=97), Row(SIMILARITY=56), Row(SIMILARITY=77), Row(SIMILARITY=0), Row(SIMILARITY=100)] + """ + c1 = _to_col_if_str(string_expr1, "jarowinkler_similarity") + c2 = _to_col_if_str(string_expr2, "jarowinkler_similarity") + return builtin("jarowinkler_similarity", _emit_ast=_emit_ast)(c1, c2) + + +@publicapi +def parse_url( + string_expr: ColumnOrName, permissive: ColumnOrName = None, _emit_ast: bool = True +) -> Column: + """ + Parses a URL string and returns a JSON object containing the URL components. + + Args: + string_expr (ColumnOrName): The URL string to parse. + permissive (ColumnOrName, optional): If 1, parsing errors are ignored and None is returned. If 0 or omitted, parsing errors raise an exception. + + Returns: + Column: A JSON object containing the parsed URL components. + + Examples:: + >>> from snowflake.snowpark.functions import col, lit + >>> df = session.create_dataframe([ + ... ['https://www.snowflake.com/'], + ... ['http://USER:PASS@EXAMPLE.INT:4345/HELLO.PHP?USER=1'], + ... ['mailto:abc@xyz.com'], + ... [None] + ... 
], schema=["url"]) + >>> df.select(parse_url(col("url"))).collect() + [Row(PARSE_URL("URL")='{\\n "fragment": null,\\n "host": "www.snowflake.com",\\n "parameters": null,\\n "path": "",\\n "port": null,\\n "query": null,\\n "scheme": "https"\\n}'), Row(PARSE_URL("URL")='{\\n "fragment": null,\\n "host": "USER:PASS@EXAMPLE.INT",\\n "parameters": {\\n "USER": "1"\\n },\\n "path": "HELLO.PHP",\\n "port": "4345",\\n "query": "USER=1",\\n "scheme": "http"\\n}'), Row(PARSE_URL("URL")='{\\n "fragment": null,\\n "host": null,\\n "parameters": null,\\n "path": "abc@xyz.com",\\n "port": null,\\n "query": null,\\n "scheme": "mailto"\\n}'), Row(PARSE_URL("URL")=None)] + + >>> df2 = session.create_dataframe([ + ... ['example.int/hello.php?user=12#nofragment'] + ... ], schema=["invalid_url"]) + >>> df2.select(parse_url(col("invalid_url"), lit(1))).collect() + [Row(PARSE_URL("INVALID_URL", 1)='{\\n "error": "scheme not specified"\\n}')] + """ + c = _to_col_if_str(string_expr, "parse_url") + if permissive is not None: + p = _to_col_if_str(permissive, "parse_url") + return builtin("parse_url", _emit_ast=_emit_ast)(c, p) + else: + return builtin("parse_url", _emit_ast=_emit_ast)(c) + + +@publicapi +def regexp_instr( + subject: ColumnOrName, + pattern: ColumnOrName, + position: ColumnOrName = None, + occurrence: ColumnOrName = None, + option: ColumnOrName = None, + regexp_parameters: ColumnOrName = None, + group_num: ColumnOrName = None, + _emit_ast: bool = True, +) -> Column: + """ + Returns the position of the specified occurrence of the regular expression pattern in the string subject. If no match is found, returns 0. + + Args: + subject (ColumnOrName): The string to search in. + pattern (ColumnOrName): The regular expression pattern to search for. + position (ColumnOrName, optional): The position in the string to start the search. Default is 1. + occurrence (ColumnOrName, optional): The occurrence of the pattern to find. Default is 1. 
+        option (ColumnOrName, optional): Specifies whether to return the position of the first character of the match (0) or the position of the first character following the match (1). Default is 0.
+        regexp_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. Default is 'c' (case-sensitive).
+            Supported values:
+
+            +-----------+-----------------------------------------------+
+            | Parameter | Description                                   |
+            +===========+===============================================+
+            | c         | Case-sensitive matching                       |
+            +-----------+-----------------------------------------------+
+            | i         | Case-insensitive matching                     |
+            +-----------+-----------------------------------------------+
+            | m         | Multi-line mode                               |
+            +-----------+-----------------------------------------------+
+            | e         | Extract submatches                            |
+            +-----------+-----------------------------------------------+
+            | s         | Single-line mode POSIX wildcard character     |
+            |           | . matches \\n                                 |
+            +-----------+-----------------------------------------------+
+
+        group_num (ColumnOrName, optional): Specifies which capture group to return the position for. Default is None, which returns the position of the entire match.
+ + Returns: + Column: The position of the match, or 0 if no match is found + + Examples:: + # Basic usage - only subject and pattern + >>> from snowflake.snowpark.functions import col, lit + >>> df = session.create_dataframe([["nevermore1, nevermore2, nevermore3.", "nevermore\\d"]], schema=["subject", "pattern"]) + >>> df.select(regexp_instr(col("subject"), col("pattern")).alias("basic_match")).collect() + [Row(BASIC_MATCH=1)] + + # With position parameter + >>> df2 = session.create_dataframe([["Hello world", "world", 7]], schema=["subject", "pattern", "position"]) + >>> df2.select(regexp_instr(col("subject"), col("pattern"), col("position")).alias("position_match")).collect() + [Row(POSITION_MATCH=7)] + + # With position and occurrence parameters + >>> df3 = session.create_dataframe([["nevermore1, nevermore2, nevermore3.", "nevermore\\d", 1, 2]], schema=["subject", "pattern", "position", "occurrence"]) + >>> df3.select(regexp_instr(col("subject"), col("pattern"), col("position"), col("occurrence")).alias("second_occurrence")).collect() + [Row(SECOND_OCCURRENCE=13)] + + # With position, occurrence, and option parameters + >>> df4 = session.create_dataframe([["Hello world", "world", 1, 1, 1]], schema=["subject", "pattern", "position", "occurrence", "option"]) + >>> df4.select(regexp_instr(col("subject"), col("pattern"), col("position"), col("occurrence"), col("option")).alias("after_match")).collect() + [Row(AFTER_MATCH=12)] + + # With position, occurrence, option, and regexp_parameters + >>> df5 = session.create_dataframe([["Hello world", "hello", 1, 1, 0, "i"]], schema=["subject", "pattern", "position", "occurrence", "option", "regexp_parameters"]) + >>> df5.select(regexp_instr(col("subject"), col("pattern"), col("position"), col("occurrence"), col("option"), col("regexp_parameters")).alias("case_insensitive")).collect() + [Row(CASE_INSENSITIVE=1)] + + # With all parameters including group_num + >>> df6 = session.create_dataframe([["Hello (World) (Test)", 
"(\\w+)", 1, 1, 0, "c", 1]], schema=["subject", "pattern", "position", "occurrence", "option", "regexp_parameters", "group_num"]) + >>> df6.select(regexp_instr(col("subject"), col("pattern"), col("position"), col("occurrence"), col("option"), col("regexp_parameters"), col("group_num")).alias("first_group")).collect() + [Row(FIRST_GROUP=1)] + + # Skipping position - with occurrence only + >>> df7 = session.create_dataframe([["nevermore1, nevermore2, nevermore3.", "nevermore\\d", "2"]], schema=["subject", "pattern", "occurrence"]) + >>> df7.select(regexp_instr(col("subject"), col("pattern"), occurrence=col("occurrence")).alias("skip_position")).collect() + [Row(SKIP_POSITION=13)] + + # Skipping position and occurrence - with option only + >>> df8 = session.create_dataframe([["Hello world", "world", 1]], schema=["subject", "pattern", "option"]) + >>> df8.select(regexp_instr(col("subject"), col("pattern"), option=col("option")).alias("skip_position_occurrence")).collect() + [Row(SKIP_POSITION_OCCURRENCE=12)] + + # Skipping position, occurrence, and option - with regexp_parameters only + >>> df9 = session.create_dataframe([["Hello World", "hello", "i"]], schema=["subject", "pattern", "regexp_parameters"]) + >>> df9.select(regexp_instr(col("subject"), col("pattern"), regexp_parameters=col("regexp_parameters")).alias("skip_to_regexp_params")).collect() + [Row(SKIP_TO_REGEXP_PARAMS=1)] + + # Skipping position, occurrence, option, and regexp_parameters - with group_num only + >>> df10 = session.create_dataframe([["Hello (world) (Test)", "(\\w+)", 1]], schema=["subject", "pattern", "group_num"]) + >>> df10.select(regexp_instr(col("subject"), col("pattern"), group_num=col("group_num")).alias("skip_to_group_num")).collect() + [Row(SKIP_TO_GROUP_NUM=1)] + + # Skipping position and occurrence - with option and regexp_parameters + >>> df11 = session.create_dataframe([["Hello World", "Hello", 1, "i"]], schema=["subject", "pattern", "option", "regexp_parameters"]) + >>> 
df11.select(regexp_instr(col("subject"), col("pattern"), option=col("option"), regexp_parameters=col("regexp_parameters")).alias("skip_position_occurrence_with_params")).collect() + [Row(SKIP_POSITION_OCCURRENCE_WITH_PARAMS=6)] + + # Skipping position, occurrence, and option - with regexp_parameters and group_num + >>> df12 = session.create_dataframe([["Hello (World) (Test)", "(\\w+)", "c", 1]], schema=["subject", "pattern", "regexp_parameters", "group_num"]) + >>> df12.select(regexp_instr(col("subject"), col("pattern"), regexp_parameters=col("regexp_parameters"), group_num=col("group_num")).alias("skip_to_params_and_group")).collect() + [Row(SKIP_TO_PARAMS_AND_GROUP=1)] + """ + if position is None: + position = lit(1) + if occurrence is None: + occurrence = lit(1) + if option is None: + option = lit(0) + if regexp_parameters is None: + regexp_parameters = lit("c") + args = [ + _to_col_if_str(subject, "regexp_instr"), + _to_col_if_str(pattern, "regexp_instr"), + _to_col_if_str(position, "regexp_instr"), + _to_col_if_str(occurrence, "regexp_instr"), + _to_col_if_str(option, "regexp_instr"), + _to_col_if_str(regexp_parameters, "regexp_instr"), + ] + + if group_num is not None: + args.append(_to_col_if_str(group_num, "regexp_instr")) + + return builtin("regexp_instr", _emit_ast=_emit_ast)(*args) + + +@publicapi +def regexp_like( + subject: ColumnOrName, + pattern: ColumnOrName, + parameters: ColumnOrName = None, + _emit_ast: bool = True, +) -> Column: + """ + Returns true if the subject matches the specified pattern. Both inputs must be text expressions. + + Args: + subject (ColumnOrName): A string expression to be matched against the pattern. + pattern (ColumnOrName): A string literal that will be used as a regular expression pattern. + parameters (ColumnOrName, optional): A string literal that specifies the parameters for the regular expression, defaults: c. 
+ Supported Parameters: + c: Case-sensitive matching + i: Case-insensitive matching + m: Multi-line mode + e: Extract submatches + s: Single-line mode POSIX wildcard character . matches \ + + Returns: + Column: A boolean value indicating whether the subject matches the pattern. + + Examples:: + >>> from snowflake.snowpark.functions import col, lit + >>> df = session.create_dataframe([ + ... ('Sacramento',), + ... ('San Francisco',), + ... ('San Jose',), + ... ('New York',), + ... (None,) + ... ], schema=['city']) + >>> df.where(regexp_like(col('city'), lit('San.*'))).collect() + [Row(CITY='San Francisco'), Row(CITY='San Jose')] + + >>> df.where(regexp_like(col('city'), lit('SAN.*'), lit('i'))).collect() + [Row(CITY='San Francisco'), Row(CITY='San Jose')] + """ + subject_col = _to_col_if_str(subject, "regexp_like") + pattern_col = _to_col_if_str(pattern, "regexp_like") + + if parameters is None: + return builtin("regexp_like", _emit_ast=_emit_ast)(subject_col, pattern_col) + else: + parameters_col = _to_col_if_str(parameters, "regexp_like") + return builtin("regexp_like", _emit_ast=_emit_ast)( + subject_col, pattern_col, parameters_col + ) + + +@publicapi +def regexp_substr( + subject: ColumnOrName, + pattern: ColumnOrName, + position: ColumnOrName = None, + occurrence: ColumnOrName = None, + regex_parameters: ColumnOrName = None, + group_num: ColumnOrName = None, + _emit_ast: bool = True, +) -> Column: + """ + Returns the portion of the subject that matches the regular expression pattern. + + Args: + subject (ColumnOrName): The string to search for matches. + pattern (ColumnOrName): The regular expression pattern to match. + position (ColumnOrName, optional): The position in the string to start searching from (1-based). Defaults to 1. + occurrence (ColumnOrName, optional): Which occurrence of the pattern to return. Defaults to 1. + regex_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. 
Default is 'c' (case-sensitive). + Supported values: + + +-----+-----------------------------------------------+ + | Parameter | Description | + +=====+===============================================+ + | c | Case-sensitive matching | + +-----+-----------------------------------------------+ + | i | Case-insensitive matching | + +-----+-----------------------------------------------+ + | m | Multi-line mode | + +-----+-----------------------------------------------+ + | e | Extract submatches | + +-----+-----------------------------------------------+ + | s | Single-line mode POSIX wildcard character | + | | . matches \\n | + +-----+-----------------------------------------------+ + + group_num (ColumnOrName, optional): The group number in the regular expression to extract. Defaults to None, which extracts the entire match. + + Returns: + Column: The substring that matches the pattern, or None if no match is found. + + Examples:: + # Basic usage - only subject and pattern + >>> from snowflake.snowpark.functions import col, lit + >>> df = session.create_dataframe([["nevermore1, nevermore2, nevermore3.", "nevermore\\d"]], schema=["subject", "pattern"]) + >>> df.select(regexp_substr(col("subject"), col("pattern")).alias("basic_match")).collect() + [Row(BASIC_MATCH='nevermore1')] + + # With position parameter + >>> df2 = session.create_dataframe([["Hello world", "world", 7]], schema=["subject", "pattern", "position"]) + >>> df2.select(regexp_substr(col("subject"), col("pattern"), col("position")).alias("position_match")).collect() + [Row(POSITION_MATCH='world')] + + # With position and occurrence parameters + >>> df3 = session.create_dataframe([["nevermore1, nevermore2, nevermore3.", "nevermore\\d", 1, 2]], schema=["subject", "pattern", "position", "occurrence"]) + >>> df3.select(regexp_substr(col("subject"), col("pattern"), col("position"), col("occurrence")).alias("second_occurrence")).collect() + [Row(SECOND_OCCURRENCE='nevermore2')] + + # With position, occurrence, 
and regex_parameters + >>> df5 = session.create_dataframe([["Hello world", "hello", 1, 1, "i"]], schema=["subject", "pattern", "position", "occurrence", "regex_parameters"]) + >>> df5.select(regexp_substr(col("subject"), col("pattern"), col("position"), col("occurrence"), col("regex_parameters")).alias("case_insensitive")).collect() + [Row(CASE_INSENSITIVE='Hello')] + + # With all parameters including group_num + >>> df6 = session.create_dataframe([["Hello (World) (Test)", "(\\w+)", 1, 1, "c", 1]], schema=["subject", "pattern", "position", "occurrence", "regex_parameters", "group_num"]) + >>> df6.select(regexp_substr(col("subject"), col("pattern"), col("position"), col("occurrence"), col("regex_parameters"), col("group_num")).alias("first_group")).collect() + [Row(FIRST_GROUP='Hello')] + + # Skipping position - with occurrence only + >>> df7 = session.create_dataframe([["nevermore1, nevermore2, nevermore3.", "nevermore\\d", "2"]], schema=["subject", "pattern", "occurrence"]) + >>> df7.select(regexp_substr(col("subject"), col("pattern"), occurrence=col("occurrence")).alias("skip_position")).collect() + [Row(SKIP_POSITION='nevermore2')] + + # Skipping position, occurrence - with regex_parameters only + >>> df9 = session.create_dataframe([["Hello World", "hello", "i"]], schema=["subject", "pattern", "regex_parameters"]) + >>> df9.select(regexp_substr(col("subject"), col("pattern"), regex_parameters=col("regex_parameters")).alias("skip_to_regexp_params")).collect() + [Row(SKIP_TO_REGEXP_PARAMS='Hello')] + + # Skipping position, occurrence, and regex_parameters - with group_num only + >>> df10 = session.create_dataframe([["Hello (world) (Test)", "(\\w+)", 1]], schema=["subject", "pattern", "group_num"]) + >>> df10.select(regexp_substr(col("subject"), col("pattern"), group_num=col("group_num")).alias("skip_to_group_num")).collect() + [Row(SKIP_TO_GROUP_NUM='Hello')] + + # Skipping position, occurrence - with regex_parameters and group_num + >>> df12 = 
session.create_dataframe([["Hello (World) (Test)", "(\\w+)", "c", 1]], schema=["subject", "pattern", "regex_parameters", "group_num"]) + >>> df12.select(regexp_substr(col("subject"), col("pattern"), regex_parameters=col("regex_parameters"), group_num=col("group_num")).alias("skip_to_params_and_group")).collect() + [Row(SKIP_TO_PARAMS_AND_GROUP='Hello')] + """ + + if position is None: + position = lit(1) + if occurrence is None: + occurrence = lit(1) + if regex_parameters is None: + regex_parameters = lit("c") + + args = [ + _to_col_if_str(subject, "regexp_substr"), + _to_col_if_str(pattern, "regexp_substr"), + _to_col_if_str(position, "regexp_substr"), + _to_col_if_str(occurrence, "regexp_substr"), + _to_col_if_str(regex_parameters, "regexp_substr"), + ] + + if group_num is not None: + args.append(_to_col_if_str(group_num, "regexp_substr")) + + return builtin("regexp_substr", _emit_ast=_emit_ast)(*args) + + +@publicapi +def regexp_substr_all( + subject: ColumnOrName, + pattern: ColumnOrName, + position: ColumnOrName = None, + occurrence: ColumnOrName = None, + regex_parameters: ColumnOrName = None, + group_num: ColumnOrName = None, + _emit_ast: bool = True, +) -> Column: + """ + Returns all substrings that match a regular expression within a string. + + Args: + subject (ColumnOrName): The string to search for matches. + pattern (ColumnOrName): The regular expression pattern to match. + position (ColumnOrName, optional): The position in the string to start searching from (1-based). Defaults to 1. + occurrence (ColumnOrName, optional): Which occurrence of the pattern to return. + regex_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. Default is 'c' (case-sensitive). 
+ Supported values: + + +-----+-----------------------------------------------+ + | Parameter | Description | + +=====+===============================================+ + | c | Case-sensitive matching | + +-----+-----------------------------------------------+ + | i | Case-insensitive matching | + +-----+-----------------------------------------------+ + | m | Multi-line mode | + +-----+-----------------------------------------------+ + | e | Extract submatches | + +-----+-----------------------------------------------+ + | s | Single-line mode POSIX wildcard character | + | | . matches \\n | + +-----+-----------------------------------------------+ + group_num (ColumnOrName, optional): The group number in the regular expression to extract. Defaults to None, which extracts the entire match. + + Returns: + Column: An array containing all matching substrings. + + Examples:: + >>> from snowflake.snowpark.functions import col, lit + >>> df = session.create_dataframe([['a1_a2a3_a4A5a6']], schema=["subject"]) + >>> df.select(regexp_substr_all(col("subject"), lit('a[[:digit:]]')).alias("result")).collect() + [Row(RESULT='[\\n "a1",\\n "a2",\\n "a3",\\n "a4",\\n "a6"\\n]')] + + >>> df.select(regexp_substr_all(col("subject"), lit('a[[:digit:]]'), lit(2)).alias("result")).collect() + [Row(RESULT='[\\n "a2",\\n "a3",\\n "a4",\\n "a6"\\n]')] + + >>> df.select(regexp_substr_all(col("subject"), lit('a[[:digit:]]'), lit(1), lit(3)).alias("result")).collect() + [Row(RESULT='[\\n "a3",\\n "a4",\\n "a6"\\n]')] + + >>> df.select(regexp_substr_all(col("subject"), lit('a[[:digit:]]'), lit(1), lit(1), lit('i')).alias("result")).collect() + [Row(RESULT='[\\n "a1",\\n "a2",\\n "a3",\\n "a4",\\n "A5",\\n "a6"\\n]')] + + >>> df.select(regexp_substr_all(col("subject"), lit('(a)([[:digit:]])'), lit(1), lit(1), lit('ie'), lit(1)).alias("result")).collect() + [Row(RESULT='[\\n "a",\\n "a",\\n "a",\\n "a",\\n "A",\\n "a"\\n]')] + + >>> df.select(regexp_substr_all(col("subject"), 
lit('b')).alias("result")).collect() + [Row(RESULT='[]')] + """ + if position is None: + position = lit(1) + if occurrence is None: + occurrence = lit(1) + if regex_parameters is None: + regex_parameters = lit("c") + + args = [ + _to_col_if_str(subject, "regexp_substr_all"), + _to_col_if_str(pattern, "regexp_substr_all"), + _to_col_if_str(position, "regexp_substr_all"), + _to_col_if_str(occurrence, "regexp_substr_all"), + _to_col_if_str(regex_parameters, "regexp_substr_all"), + ] + + if group_num is not None: + args.append(_to_col_if_str(group_num, "regexp_substr_all")) + + return builtin("regexp_substr_all", _emit_ast=_emit_ast)(*args) + + +@publicapi +def rtrimmed_length(string_expr: ColumnOrName, _emit_ast: bool = True) -> Column: + """ + Returns the length of the input string after removing trailing whitespace characters. + + Args: + string_expr (ColumnOrName): The string expression to calculate the right-trimmed length for. + + Returns: + Column: The length of the string after removing trailing whitespace. + + Examples:: + >>> df = session.create_dataframe([" ABCD ", "hello world ", " test", "no_spaces", ""], schema=["a"]) + >>> df.select(rtrimmed_length(df["a"]).alias("result")).collect() + [Row(RESULT=5), Row(RESULT=11), Row(RESULT=7), Row(RESULT=9), Row(RESULT=0)] + """ + c = _to_col_if_str(string_expr, "rtrimmed_length") + return builtin("rtrimmed_length", _emit_ast=_emit_ast)(c) + + +@publicapi +def space(n: ColumnOrName, _emit_ast: bool = True) -> Column: + """ + Returns a string consisting of n space characters. + + Args: + n (ColumnOrName): The number of space characters to return. + + Returns: + Column: A string consisting of the specified number of space characters. 
+ + Examples:: + >>> from snowflake.snowpark.functions import col + >>> df = session.create_dataframe([[3], [5], [0]], schema=["n"]) + >>> df.select(space(col("n")).alias("result")).collect() + [Row(RESULT=' '), Row(RESULT=' '), Row(RESULT='')] + """ + c = _to_col_if_str(n, "space") + return builtin("space", _emit_ast=_emit_ast)(c) + + +@publicapi +def split_part( + string: ColumnOrName, + delimiter: ColumnOrName, + part_number: ColumnOrName, + _emit_ast: bool = True, +) -> Column: + """ + Splits a given string at a specified character and returns the requested part. + + Args: + string (ColumnOrName): The string to be split. + delimiter (ColumnOrName): The delimiter to split the string on. + part_number (ColumnOrName): The part number to return (1-based indexing). Negative numbers count from the end. + + Returns: + Column: The specified part of the split string. + + Examples:: + >>> from snowflake.snowpark.functions import col + >>> df = session.create_dataframe([ + ... ("11.22.33", ".", 1), + ... ("11.22.33", ".", 2), + ... ("11.22.33", ".", 3), + ... ("11.22.33", ".", -1), + ... ("127.0.0.1", ".", 1), + ... ("127.0.0.1", ".", -1), + ... ("|a|b|c|", "|", 1), + ... ("|a|b|c|", "|", 2), + ... ("aaa--bbb-BBB--ccc", "--", 2) + ... 
], schema=["string_col", "delimiter_col", "part_number_col"]) + >>> result = df.select(split_part(col("string_col"), col("delimiter_col"), col("part_number_col")).alias("result")) + >>> result.collect() + [Row(RESULT='11'), Row(RESULT='22'), Row(RESULT='33'), Row(RESULT='33'), Row(RESULT='127'), Row(RESULT='1'), Row(RESULT=''), Row(RESULT='a'), Row(RESULT='bbb-BBB')] + """ + string_col = _to_col_if_str(string, "split_part") + delimiter_col = _to_col_if_str(delimiter, "split_part") + part_number_col = _to_col_if_str(part_number, "split_part") + return builtin("split_part", _emit_ast=_emit_ast)( + string_col, delimiter_col, part_number_col + ) + + +@publicapi +def strtok( + string: ColumnOrName, + delimiter: ColumnOrName = None, + part_nr: ColumnOrName = None, + _emit_ast: bool = True, +) -> Column: + """ + Tokenizes a string with the given set of delimiters and returns the requested part. + + Args: + string (ColumnOrName): The string to be tokenized. + delimiter (ColumnOrName, optional): A set of delimiters. Each character in the delimiter string is treated as a delimiter. If not specified, defaults to a single space character. + part_nr (ColumnOrName, optional): The requested part number (1-based). If not specified, returns the entire string. + + Returns: + Column: The requested part of the tokenized string. 
+ + Examples:: + >>> from snowflake.snowpark.functions import col, lit + >>> df = session.create_dataframe([["a.b.c"]], schema=["string_col"]) + >>> df.select(strtok(col("string_col")).alias("result")).collect() + [Row(RESULT='a.b.c')] + >>> df.select(strtok(col("string_col"), lit(".")).alias("result")).collect() + [Row(RESULT='a')] + >>> df.select(strtok(col("string_col"), lit("."), lit(2)).alias("result")).collect() + [Row(RESULT='b')] + >>> df2 = session.create_dataframe([["user@snowflake.com"]], schema=["string_col"]) + >>> df2.select(strtok(col("string_col"), lit("@."), lit(1)).alias("result")).collect() + [Row(RESULT='user')] + >>> df2.select(strtok(col("string_col"), lit("@."), lit(3)).alias("result")).collect() + [Row(RESULT='com')] + """ + string_col = _to_col_if_str(string, "strtok") + + if delimiter is None and part_nr is None: + return builtin("strtok", _emit_ast=_emit_ast)(string_col) + elif part_nr is None: + delimiter_col = _to_col_if_str(delimiter, "strtok") + return builtin("strtok", _emit_ast=_emit_ast)(string_col, delimiter_col) + else: + delimiter_col = ( + _to_col_if_str(delimiter, "strtok") if delimiter is not None else lit(" ") + ) + part_nr_col = _to_col_if_str(part_nr, "strtok") + return builtin("strtok", _emit_ast=_emit_ast)( + string_col, delimiter_col, part_nr_col + ) + + +@publicapi +def try_base64_decode_binary( + input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True +) -> Column: + """ + Decodes a base64-encoded string to binary data. Returns NULL if the input is not valid base64. + + Args: + input_expr (ColumnOrName): The base64-encoded string to decode. + alphabet (ColumnOrName, optional): The base64 alphabet to use for decoding. If not specified, uses the standard base64 alphabet. + + Returns: + Column: A column containing the decoded binary data, or None if the input is invalid. 
+ + Examples:: + >>> from snowflake.snowpark.functions import base64_encode + >>> df = session.create_dataframe(["HELP", "TEST"], schema=["input"]) + >>> df.select(try_base64_decode_binary(base64_encode(df["input"]))).collect() + [Row(TRY_BASE64_DECODE_BINARY(BASE64_ENCODE("INPUT"))=bytearray(b'HELP')), Row(TRY_BASE64_DECODE_BINARY(BASE64_ENCODE("INPUT"))=bytearray(b'TEST'))] + + >>> df2 = session.create_dataframe(["SEVMUA==", "VEVTVA=="], schema=["encoded"]) + >>> df2.select(try_base64_decode_binary(df2["encoded"])).collect() + [Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'HELP')), Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'TEST'))] + + >>> df3 = session.create_dataframe(["invalid_base64!"], schema=["bad_input"]) + >>> df3.select(try_base64_decode_binary(df3["bad_input"])).collect() + [Row(TRY_BASE64_DECODE_BINARY("BAD_INPUT")=None)] + """ + input_col = _to_col_if_str(input_expr, "try_base64_decode_binary") + + if alphabet is not None: + alphabet_col = _to_col_if_str(alphabet, "try_base64_decode_binary") + return builtin("try_base64_decode_binary", _emit_ast=_emit_ast)( + input_col, alphabet_col + ) + else: + return builtin("try_base64_decode_binary", _emit_ast=_emit_ast)(input_col) + + +@publicapi +def try_base64_decode_string( + input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True +) -> Column: + """ + Decodes a base64-encoded string and returns the result. If the input is not a valid base64-encoded string, returns NULL instead of raising an error. + + Args: + input_expr (ColumnOrName): A base64-encoded string to decode. + alphabet (ColumnOrName, optional): The base64 alphabet to use for decoding. If not specified, uses the standard base64 alphabet. + + Returns: + Column: The decoded string, or NULL if the input is not valid base64. 
+ + Examples:: + >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"]) + >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect() + [Row(RESULT='HELLO')] + + >>> df = session.create_dataframe([["invalid_base64"]], schema=["encoded"]) + >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect() + [Row(RESULT=None)] + + >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"]) + >>> df.select(try_base64_decode_string(df["encoded"], lit('$')).alias('result')).collect() + [Row(RESULT='HELLO')] + """ + c = _to_col_if_str(input_expr, "try_base64_decode_string") + if alphabet is not None: + alphabet_col = _to_col_if_str(alphabet, "try_base64_decode_string") + return builtin("try_base64_decode_string", _emit_ast=_emit_ast)(c, alphabet_col) + else: + return builtin("try_base64_decode_string", _emit_ast=_emit_ast)(c) + + +@publicapi +def try_hex_decode_binary(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column: + """ + Decodes a hex-encoded string to binary data. Returns None if the input is not a valid hex string. + + Args: + input_expr (ColumnOrName): A hex-encoded string to decode to binary data. + + Returns: + Column: The decoded binary data as bytearray, or None if input is invalid. + + Examples:: + >>> from snowflake.snowpark.functions import col + >>> df = session.create_dataframe([["41426162"], ["48656C6C6F"], ["576F726C64"]], schema=["hex_string"]) + >>> df.select(try_hex_decode_binary(col("hex_string")).alias("decoded_binary")).collect() + [Row(DECODED_BINARY=bytearray(b'ABab')), Row(DECODED_BINARY=bytearray(b'Hello')), Row(DECODED_BINARY=bytearray(b'World'))] + """ + c = _to_col_if_str(input_expr, "try_hex_decode_binary") + return builtin("try_hex_decode_binary", _emit_ast=_emit_ast)(c) + + +@publicapi +def try_hex_decode_string(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column: + """ + Decodes a hex-encoded string to its original string value. 
Returns None if the input is not a valid hex string. + + Args: + input_expr (ColumnOrName): The hex-encoded string to decode. + + Returns: + Column: The decoded string, or None if the input is not valid hex. + + Examples:: + >>> df = session.create_dataframe([["41614262"], ["127"], ["invalid_hex"]], schema=["hex_input"]) + >>> df.select(try_hex_decode_string(df["hex_input"]).alias("decoded")).collect() + [Row(DECODED='AaBb'), Row(DECODED=None), Row(DECODED=None)] + """ + c = _to_col_if_str(input_expr, "try_hex_decode_string") + return builtin("try_hex_decode_string", _emit_ast=_emit_ast)(c) + + +@publicapi +def unicode(input_str: ColumnOrName, _emit_ast: bool = True) -> Column: + """ + Returns the Unicode code point of the first character in a string. + + Args: + input_str (ColumnOrName): The input string column or string value to get the Unicode code point from. + + Returns: + Column: The Unicode code point of the first character. Returns 0 for empty strings. + + Examples:: + >>> from snowflake.snowpark.functions import col + >>> df = session.create_dataframe([['a'], ['❄'], ['cde'], ['']], schema=["input_str"]) + >>> df.select(unicode(col("input_str")).alias("unicode_result")).collect() + [Row(UNICODE_RESULT=97), Row(UNICODE_RESULT=10052), Row(UNICODE_RESULT=99), Row(UNICODE_RESULT=0)] + """ + c = _to_col_if_str(input_str, "unicode") + return builtin("unicode", _emit_ast=_emit_ast)(c) + + +@publicapi +def uuid_string( + uuid: ColumnOrName = None, name: ColumnOrName = None, _emit_ast: bool = True +) -> Column: + """ + Returns a universally unique identifier (UUID) as a string. + + Args: + uuid (ColumnOrName, optional): The namespace UUID as a string. If provided, generates a UUID based on this namespace. + name (ColumnOrName, optional): The name to use for UUID generation. Used in combination with uuid parameter. + + Returns: + Column: A Column object representing a UUID string. 
+ + Examples:: + >>> df = session.create_dataframe([["test"]], schema=["a"]) + >>> df.select(uuid_string().alias("random_uuid")).collect() # doctest: +SKIP + [Row(RANDOM_UUID='...')] + + >>> df.select(uuid_string("fe971b24-9572-4005-b22f-351e9c09274d", "foo").alias("named_uuid")).collect() # doctest: +SKIP + [Row(NAMED_UUID='...')] + + >>> df.select(uuid_string("fe971b24-9572-4005-b22f-351e9c09274d").alias("uuid_with_namespace")).collect() # doctest: +SKIP + [Row(UUID_WITH_NAMESPACE='...')] + + >>> df.select(uuid_string(name="foo").alias("uuid_with_name")).collect() # doctest: +SKIP + [Row(UUID_WITH_NAME='...')] + """ + if uuid is None and name is None: + return builtin("uuid_string", _emit_ast=_emit_ast)() + elif uuid is not None and name is not None: + return builtin("uuid_string", _emit_ast=_emit_ast)(uuid, name) + elif uuid is not None: + return builtin("uuid_string", _emit_ast=_emit_ast)(uuid) + else: + builtin("uuid_string", _emit_ast=_emit_ast)(name) From e24b5ac69b3902e3a565669c219e15f6187a68bd Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Fri, 24 Oct 2025 08:55:56 -0600 Subject: [PATCH 2/7] Add missing return --- src/snowflake/snowpark/_functions/scalar_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index da551ec82f..5d00ba6a41 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -4743,4 +4743,4 @@ def uuid_string( elif uuid is not None: return builtin("uuid_string", _emit_ast=_emit_ast)(uuid) else: - builtin("uuid_string", _emit_ast=_emit_ast)(name) + return builtin("uuid_string", _emit_ast=_emit_ast)(name) From 3c1df1758a30aa2978e72e289ef3865059163a9b Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Tue, 28 Oct 2025 10:59:39 -0600 Subject: [PATCH 3/7] Update docstrings --- 
.../snowpark/_functions/scalar_functions.py | 60 +++++-------------- 1 file changed, 15 insertions(+), 45 deletions(-) diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index 5d00ba6a41..01e8c24faf 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -4089,21 +4089,11 @@ def regexp_instr( option (ColumnOrName, optional): Specifies whether to return the position of the first character of the match (0) or the position of the first character following the match (1). Default is 0. regexp_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. Default is 'c' (case-sensitive). Supported values: - - +-----+-----------------------------------------------+ - | Parameter | Description | - +=====+===============================================+ - | c | Case-sensitive matching | - +-----+-----------------------------------------------+ - | i | Case-insensitive matching | - +-----+-----------------------------------------------+ - | m | Multi-line mode | - +-----+-----------------------------------------------+ - | e | Extract submatches | - +-----+-----------------------------------------------+ - | s | Single-line mode POSIX wildcard character | - | | . matches \\n | - +-----+-----------------------------------------------+ + - `c`: Case-sensitive matching + - `i`: Case-insensitive matching + - `m`: Multi-line mode + - `e`: Extract submatches + - `s`: Single-line mode (POSIX wildcard character `.` matches `\n`) group_num (ColumnOrName, optional): Specifies which capture group to return the position for. Default is None, which returns the position of the entire match. @@ -4266,21 +4256,11 @@ def regexp_substr( occurrence (ColumnOrName, optional): Which occurrence of the pattern to return. Defaults to 1. 
regex_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. Default is 'c' (case-sensitive). Supported values: - - +-----+-----------------------------------------------+ - | Parameter | Description | - +=====+===============================================+ - | c | Case-sensitive matching | - +-----+-----------------------------------------------+ - | i | Case-insensitive matching | - +-----+-----------------------------------------------+ - | m | Multi-line mode | - +-----+-----------------------------------------------+ - | e | Extract submatches | - +-----+-----------------------------------------------+ - | s | Single-line mode POSIX wildcard character | - | | . matches \\n | - +-----+-----------------------------------------------+ + - `c`: Case-sensitive matching + - `i`: Case-insensitive matching + - `m`: Multi-line mode + - `e`: Extract submatches + - `s`: Single-line mode (POSIX wildcard character `.` matches `\n`) group_num (ColumnOrName, optional): The group number in the regular expression to extract. Defaults to None, which extracts the entire match. @@ -4376,21 +4356,11 @@ def regexp_substr_all( occurrence (ColumnOrName, optional): Which occurrence of the pattern to return. regex_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. Default is 'c' (case-sensitive). 
Supported values: - - +-----+-----------------------------------------------+ - | Parameter | Description | - +=====+===============================================+ - | c | Case-sensitive matching | - +-----+-----------------------------------------------+ - | i | Case-insensitive matching | - +-----+-----------------------------------------------+ - | m | Multi-line mode | - +-----+-----------------------------------------------+ - | e | Extract submatches | - +-----+-----------------------------------------------+ - | s | Single-line mode POSIX wildcard character | - | | . matches \\n | - +-----+-----------------------------------------------+ + - `c`: Case-sensitive matching + - `i`: Case-insensitive matching + - `m`: Multi-line mode + - `e`: Extract submatches + - `s`: Single-line mode (POSIX wildcard character `.` matches `\n`) group_num (ColumnOrName, optional): The group number in the regular expression to extract. Defaults to None, which extracts the entire match. Returns: From e8ee8274a0ba5106a9f9525dc160031b27af9108 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Tue, 28 Oct 2025 13:30:41 -0600 Subject: [PATCH 4/7] Update docstrings --- .../snowpark/_functions/scalar_functions.py | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index 01e8c24faf..66dc92eafc 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -4088,13 +4088,12 @@ def regexp_instr( occurrence (ColumnOrName, optional): The occurrence of the pattern to find. Default is 1. option (ColumnOrName, optional): Specifies whether to return the position of the first character of the match (0) or the position of the first character following the match (1). Default is 0. 
regexp_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. Default is 'c' (case-sensitive). - Supported values: - - `c`: Case-sensitive matching - - `i`: Case-insensitive matching - - `m`: Multi-line mode - - `e`: Extract submatches - - `s`: Single-line mode (POSIX wildcard character `.` matches `\n`) - + Supported values: + - `c`: Case-sensitive matching + - `i`: Case-insensitive matching + - `m`: Multi-line mode + - `e`: Extract submatches + - `s`: Single-line mode (POSIX wildcard character `.` matches `\\n`) group_num (ColumnOrName, optional): Specifies which capture group to return the position for. Default is None, which returns the position of the entire match. Returns: @@ -4199,13 +4198,12 @@ def regexp_like( subject (ColumnOrName): A string expression to be matched against the pattern. pattern (ColumnOrName): A string literal that will be used as a regular expression pattern. parameters (ColumnOrName, optional): A string literal that specifies the parameters for the regular expression, defaults: c. - Supported Parameters: - c: Case-sensitive matching - i: Case-insensitive matching - m: Multi-line mode - e: Extract submatches - s: Single-line mode POSIX wildcard character . matches \ - + Supported values: + - `c`: Case-sensitive matching + - `i`: Case-insensitive matching + - `m`: Multi-line mode + - `e`: Extract submatches + - `s`: Single-line mode (POSIX wildcard character `.` matches `\\n`) Returns: Column: A boolean value indicating whether the subject matches the pattern. @@ -4255,13 +4253,12 @@ def regexp_substr( position (ColumnOrName, optional): The position in the string to start searching from (1-based). Defaults to 1. occurrence (ColumnOrName, optional): Which occurrence of the pattern to return. Defaults to 1. regex_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. 
Default is 'c' (case-sensitive). - Supported values: - - `c`: Case-sensitive matching - - `i`: Case-insensitive matching - - `m`: Multi-line mode - - `e`: Extract submatches - - `s`: Single-line mode (POSIX wildcard character `.` matches `\n`) - + Supported values: + - `c`: Case-sensitive matching + - `i`: Case-insensitive matching + - `m`: Multi-line mode + - `e`: Extract submatches + - `s`: Single-line mode (POSIX wildcard character `.` matches `\\n`) group_num (ColumnOrName, optional): The group number in the regular expression to extract. Defaults to None, which extracts the entire match. Returns: @@ -4355,12 +4352,12 @@ def regexp_substr_all( position (ColumnOrName, optional): The position in the string to start searching from (1-based). Defaults to 1. occurrence (ColumnOrName, optional): Which occurrence of the pattern to return. regex_parameters (ColumnOrName, optional): String of one or more characters that specifies the parameters for the regular expression. Default is 'c' (case-sensitive). - Supported values: - - `c`: Case-sensitive matching - - `i`: Case-insensitive matching - - `m`: Multi-line mode - - `e`: Extract submatches - - `s`: Single-line mode (POSIX wildcard character `.` matches `\n`) + Supported values: + - `c`: Case-sensitive matching + - `i`: Case-insensitive matching + - `m`: Multi-line mode + - `e`: Extract submatches + - `s`: Single-line mode (POSIX wildcard character `.` matches `\\n`) group_num (ColumnOrName, optional): The group number in the regular expression to extract. Defaults to None, which extracts the entire match. Returns: From 0f008aa82b91a034b3a2b647df9022e9695e00c0 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Thu, 30 Oct 2025 11:54:39 -0600 Subject: [PATCH 5/7] Update docstring test. 
--- src/snowflake/snowpark/_functions/scalar_functions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index 66dc92eafc..d66fc9301b 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -4551,7 +4551,7 @@ def try_base64_decode_binary( alphabet (ColumnOrName, optional): The base64 alphabet to use for decoding. If not specified, uses the standard base64 alphabet. Returns: - Column: A column containing the decoded binary data, or None if the input is invalid. + Column: The decoded binary data, or None if the input is invalid. Examples:: >>> from snowflake.snowpark.functions import base64_encode @@ -4566,6 +4566,10 @@ def try_base64_decode_binary( >>> df3 = session.create_dataframe(["invalid_base64!"], schema=["bad_input"]) >>> df3.select(try_base64_decode_binary(df3["bad_input"])).collect() [Row(TRY_BASE64_DECODE_BINARY("BAD_INPUT")=None)] + + >>> df4 = session.create_dataframe(["SEVMTE8="], schema=["encoded"]) + >>> df4.select(try_base64_decode_binary(df4["encoded"]), lit("+/=")).collect() + [Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'HELLO'), '+/='='+/=')] """ input_col = _to_col_if_str(input_expr, "try_base64_decode_binary") From cbdea7d2ed927ba24436479b318e1aec0cdab282 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Thu, 30 Oct 2025 12:26:30 -0600 Subject: [PATCH 6/7] Update docstring test. 
--- src/snowflake/snowpark/_functions/scalar_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index d66fc9301b..503b72ac0b 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -4568,8 +4568,8 @@ def try_base64_decode_binary( [Row(TRY_BASE64_DECODE_BINARY("BAD_INPUT")=None)] >>> df4 = session.create_dataframe(["SEVMTE8="], schema=["encoded"]) - >>> df4.select(try_base64_decode_binary(df4["encoded"]), lit("+/=")).collect() - [Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'HELLO'), '+/='='+/=')] + >>> df4.select(try_base64_decode_binary(df4["encoded"], lit("+/="))).collect() + [Row(TRY_BASE64_DECODE_BINARY("ENCODED", '+/=')=bytearray(b'HELLO'))] """ input_col = _to_col_if_str(input_expr, "try_base64_decode_binary") From b07283fa67cd5698593c4467736c9b804ff32fe9 Mon Sep 17 00:00:00 2001 From: Arturo Herrera Aguilar Date: Fri, 31 Oct 2025 16:22:02 -0600 Subject: [PATCH 7/7] Remove functions from other PR --- .../snowpark/_functions/scalar_functions.py | 178 ------------------ 1 file changed, 178 deletions(-) diff --git a/src/snowflake/snowpark/_functions/scalar_functions.py b/src/snowflake/snowpark/_functions/scalar_functions.py index 503b72ac0b..9619e092cb 100644 --- a/src/snowflake/snowpark/_functions/scalar_functions.py +++ b/src/snowflake/snowpark/_functions/scalar_functions.py @@ -4537,181 +4537,3 @@ def strtok( return builtin("strtok", _emit_ast=_emit_ast)( string_col, delimiter_col, part_nr_col ) - - -@publicapi -def try_base64_decode_binary( - input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True -) -> Column: - """ - Decodes a base64-encoded string to binary data. Returns NULL if the input is not valid base64. - - Args: - input_expr (ColumnOrName): The base64-encoded string to decode. 
- alphabet (ColumnOrName, optional): The base64 alphabet to use for decoding. If not specified, uses the standard base64 alphabet. - - Returns: - Column: The decoded binary data, or None if the input is invalid. - - Examples:: - >>> from snowflake.snowpark.functions import base64_encode - >>> df = session.create_dataframe(["HELP", "TEST"], schema=["input"]) - >>> df.select(try_base64_decode_binary(base64_encode(df["input"]))).collect() - [Row(TRY_BASE64_DECODE_BINARY(BASE64_ENCODE("INPUT"))=bytearray(b'HELP')), Row(TRY_BASE64_DECODE_BINARY(BASE64_ENCODE("INPUT"))=bytearray(b'TEST'))] - - >>> df2 = session.create_dataframe(["SEVMUA==", "VEVTVA=="], schema=["encoded"]) - >>> df2.select(try_base64_decode_binary(df2["encoded"])).collect() - [Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'HELP')), Row(TRY_BASE64_DECODE_BINARY("ENCODED")=bytearray(b'TEST'))] - - >>> df3 = session.create_dataframe(["invalid_base64!"], schema=["bad_input"]) - >>> df3.select(try_base64_decode_binary(df3["bad_input"])).collect() - [Row(TRY_BASE64_DECODE_BINARY("BAD_INPUT")=None)] - - >>> df4 = session.create_dataframe(["SEVMTE8="], schema=["encoded"]) - >>> df4.select(try_base64_decode_binary(df4["encoded"], lit("+/="))).collect() - [Row(TRY_BASE64_DECODE_BINARY("ENCODED", '+/=')=bytearray(b'HELLO'))] - """ - input_col = _to_col_if_str(input_expr, "try_base64_decode_binary") - - if alphabet is not None: - alphabet_col = _to_col_if_str(alphabet, "try_base64_decode_binary") - return builtin("try_base64_decode_binary", _emit_ast=_emit_ast)( - input_col, alphabet_col - ) - else: - return builtin("try_base64_decode_binary", _emit_ast=_emit_ast)(input_col) - - -@publicapi -def try_base64_decode_string( - input_expr: ColumnOrName, alphabet: ColumnOrName = None, _emit_ast: bool = True -) -> Column: - """ - Decodes a base64-encoded string and returns the result. If the input is not a valid base64-encoded string, returns NULL instead of raising an error. 
- - Args: - input_expr (ColumnOrName): A base64-encoded string to decode. - alphabet (ColumnOrName, optional): The base64 alphabet to use for decoding. If not specified, uses the standard base64 alphabet. - - Returns: - Column: The decoded string, or NULL if the input is not valid base64. - - Examples:: - >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"]) - >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect() - [Row(RESULT='HELLO')] - - >>> df = session.create_dataframe([["invalid_base64"]], schema=["encoded"]) - >>> df.select(try_base64_decode_string(df["encoded"]).alias('result')).collect() - [Row(RESULT=None)] - - >>> df = session.create_dataframe([["SEVMTE8="]], schema=["encoded"]) - >>> df.select(try_base64_decode_string(df["encoded"], lit('$')).alias('result')).collect() - [Row(RESULT='HELLO')] - """ - c = _to_col_if_str(input_expr, "try_base64_decode_string") - if alphabet is not None: - alphabet_col = _to_col_if_str(alphabet, "try_base64_decode_string") - return builtin("try_base64_decode_string", _emit_ast=_emit_ast)(c, alphabet_col) - else: - return builtin("try_base64_decode_string", _emit_ast=_emit_ast)(c) - - -@publicapi -def try_hex_decode_binary(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column: - """ - Decodes a hex-encoded string to binary data. Returns None if the input is not a valid hex string. - - Args: - input_expr (ColumnOrName): A hex-encoded string to decode to binary data. - - Returns: - Column: The decoded binary data as bytearray, or None if input is invalid. 
- - Examples:: - >>> from snowflake.snowpark.functions import col - >>> df = session.create_dataframe([["41426162"], ["48656C6C6F"], ["576F726C64"]], schema=["hex_string"]) - >>> df.select(try_hex_decode_binary(col("hex_string")).alias("decoded_binary")).collect() - [Row(DECODED_BINARY=bytearray(b'ABab')), Row(DECODED_BINARY=bytearray(b'Hello')), Row(DECODED_BINARY=bytearray(b'World'))] - """ - c = _to_col_if_str(input_expr, "try_hex_decode_binary") - return builtin("try_hex_decode_binary", _emit_ast=_emit_ast)(c) - - -@publicapi -def try_hex_decode_string(input_expr: ColumnOrName, _emit_ast: bool = True) -> Column: - """ - Decodes a hex-encoded string to its original string value. Returns None if the input is not a valid hex string. - - Args: - input_expr (ColumnOrName): The hex-encoded string to decode. - - Returns: - Column: The decoded string, or None if the input is not valid hex. - - Examples:: - >>> df = session.create_dataframe([["41614262"], ["127"], ["invalid_hex"]], schema=["hex_input"]) - >>> df.select(try_hex_decode_string(df["hex_input"]).alias("decoded")).collect() - [Row(DECODED='AaBb'), Row(DECODED=None), Row(DECODED=None)] - """ - c = _to_col_if_str(input_expr, "try_hex_decode_string") - return builtin("try_hex_decode_string", _emit_ast=_emit_ast)(c) - - -@publicapi -def unicode(input_str: ColumnOrName, _emit_ast: bool = True) -> Column: - """ - Returns the Unicode code point of the first character in a string. - - Args: - input_str (ColumnOrName): The input string column or string value to get the Unicode code point from. - - Returns: - Column: The Unicode code point of the first character. Returns 0 for empty strings. 
- - Examples:: - >>> from snowflake.snowpark.functions import col - >>> df = session.create_dataframe([['a'], ['❄'], ['cde'], ['']], schema=["input_str"]) - >>> df.select(unicode(col("input_str")).alias("unicode_result")).collect() - [Row(UNICODE_RESULT=97), Row(UNICODE_RESULT=10052), Row(UNICODE_RESULT=99), Row(UNICODE_RESULT=0)] - """ - c = _to_col_if_str(input_str, "unicode") - return builtin("unicode", _emit_ast=_emit_ast)(c) - - -@publicapi -def uuid_string( - uuid: ColumnOrName = None, name: ColumnOrName = None, _emit_ast: bool = True -) -> Column: - """ - Returns a universally unique identifier (UUID) as a string. - - Args: - uuid (ColumnOrName, optional): The namespace UUID as a string. If provided, generates a UUID based on this namespace. - name (ColumnOrName, optional): The name to use for UUID generation. Used in combination with uuid parameter. - - Returns: - Column: A Column object representing a UUID string. - - Examples:: - >>> df = session.create_dataframe([["test"]], schema=["a"]) - >>> df.select(uuid_string().alias("random_uuid")).collect() # doctest: +SKIP - [Row(RANDOM_UUID='...')] - - >>> df.select(uuid_string("fe971b24-9572-4005-b22f-351e9c09274d", "foo").alias("named_uuid")).collect() # doctest: +SKIP - [Row(NAMED_UUID='...')] - - >>> df.select(uuid_string("fe971b24-9572-4005-b22f-351e9c09274d").alias("uuid_with_namespace")).collect() # doctest: +SKIP - [Row(UUID_WITH_NAMESPACE='...')] - - >>> df.select(uuid_string(name="foo").alias("uuid_with_name")).collect() # doctest: +SKIP - [Row(UUID_WITH_NAME='...')] - """ - if uuid is None and name is None: - return builtin("uuid_string", _emit_ast=_emit_ast)() - elif uuid is not None and name is not None: - return builtin("uuid_string", _emit_ast=_emit_ast)(uuid, name) - elif uuid is not None: - return builtin("uuid_string", _emit_ast=_emit_ast)(uuid) - else: - return builtin("uuid_string", _emit_ast=_emit_ast)(name)