@@ -4313,18 +4313,13 @@ def test_initcap(self):
43134313 "spark" : Spark2 .INITCAP_DEFAULT_DELIMITER_CHARS ,
43144314 }
43154315
4316- REGEX_LITERAL_ESCAPES = {
4317- "\\ " : "\\ \\ " ,
4318- "-" : "\\ -" ,
4319- "^" : "\\ ^" ,
4320- "[" : "\\ [" ,
4321- "]" : "\\ ]" ,
4316+ duckdb_default_delimiter_sql = {
4317+ "" : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || (' \t \n \r \x0c ' || CHR(11) || '!\" #$%&''()*+,\\ \\ \\ -./:;<=>?@\\ \\ \\ [\\ \\ \\ ]\\ ^_`{|}~') || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || (' \t \n \r \x0c ' || CHR(11) || '!\" #$%&''()*+,\\ \\ \\ -./:;<=>?@\\ \\ \\ [\\ \\ \\ ]\\ ^_`{|}~') || ']+|[^' || (' \t \n \r \x0c ' || CHR(11) || '!\" #$%&''()*+,\\ \\ \\ -./:;<=>?@\\ \\ \\ [\\ \\ \\ ]\\ ^_`{|}~') || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || (' \t \n \r \x0c ' || CHR(11) || '!\" #$%&''()*+,\\ \\ \\ -./:;<=>?@\\ \\ \\ [\\ \\ \\ ]\\ ^_`{|}~') || ']+|[^' || (' \t \n \r \x0c ' || CHR(11) || '!\" #$%&''()*+,\\ \\ \\ -./:;<=>?@\\ \\ \\ [\\ \\ \\ ]\\ ^_`{|}~') || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4318+ "bigquery" : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || (' \t \n \r \x0c ' || CHR(11) || '\\ \\ \\ [\\ \\ \\ ](){}/|<>!?@\" \\ ^#$&~_,.:;*%+\\ \\ \\ -') || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || (' \t \n \r \x0c ' || CHR(11) || '\\ \\ \\ [\\ \\ \\ ](){}/|<>!?@\" \\ ^#$&~_,.:;*%+\\ \\ \\ -') || ']+|[^' || (' \t \n \r \x0c ' || CHR(11) || '\\ \\ \\ [\\ \\ \\ ](){}/|<>!?@\" \\ ^#$&~_,.:;*%+\\ \\ \\ -') || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || (' \t \n \r \x0c ' || CHR(11) || '\\ \\ \\ [\\ \\ \\ ](){}/|<>!?@\" \\ ^#$&~_,.:;*%+\\ \\ \\ -') || ']+|[^' || (' \t \n \r \x0c ' || CHR(11) || '\\ \\ \\ [\\ \\ \\ ](){}/|<>!?@\" \\ ^#$&~_,.:;*%+\\ \\ \\ -') || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4319+ "snowflake" : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || (' \t \n \r \x0c ' || CHR(11) || '!?@\" \\ ^#$&~_,.:;+\\ \\ \\ -*%/|\\ \\ \\ [\\ \\ \\ ](){}<>') || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || (' \t \n \r \x0c ' || CHR(11) || '!?@\" \\ ^#$&~_,.:;+\\ \\ \\ -*%/|\\ \\ \\ [\\ \\ \\ ](){}<>') || ']+|[^' || (' \t \n \r \x0c ' || CHR(11) || '!?@\" \\ ^#$&~_,.:;+\\ \\ \\ -*%/|\\ \\ \\ [\\ \\ \\ ](){}<>') || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || (' \t \n \r \x0c ' || CHR(11) || '!?@\" \\ ^#$&~_,.:;+\\ \\ \\ -*%/|\\ \\ \\ [\\ \\ \\ ](){}<>') || ']+|[^' || (' \t \n \r \x0c ' || CHR(11) || '!?@\" \\ ^#$&~_,.:;+\\ \\ \\ -*%/|\\ \\ \\ [\\ \\ \\ ](){}<>') || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4320+ "spark" : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || ' ' || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || ' ' || ']+|[^' || ' ' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || ' ' || ']+|[^' || ' ' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
43224321 }
43234322
4324- def duckdb_regex_literal_sql (delimiters : str ) -> str :
4325- escaped_literal = "" .join (REGEX_LITERAL_ESCAPES .get (ch , ch ) for ch in delimiters )
4326- return exp .Literal .string (escaped_literal ).sql ("duckdb" )
4327-
43284323 # None delimiters arg doesn't error
43294324 with self .subTest ("Testing INITCAP with None delimiters arg" ):
43304325 self .assertEqual (exp .Initcap (this = exp .Literal .string ("col" )).sql (), "INITCAP('col')" )
@@ -4355,17 +4350,10 @@ def duckdb_regex_literal_sql(delimiters: str) -> str:
43554350
43564351 for dialect , default_delimiters in delimiter_chars .items ():
43574352 with self .subTest (f"DuckDB rewrite for { dialect or 'default' } default delimiters" ):
4358- escaped_literal = duckdb_regex_literal_sql (default_delimiters )
4359- expected = (
4360- "ARRAY_TO_STRING("
4361- f"CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || { escaped_literal } || ']') "
4362- f"THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || { escaped_literal } || ']+|[^' || { escaped_literal } || ']+)'), "
4363- f"(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4364- f"ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || { escaped_literal } || ']+|[^' || { escaped_literal } || ']+)'), "
4365- f"(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4366- "END, '')"
4353+ self .assertEqual (
4354+ parse_one ("INITCAP(col)" , read = dialect ).sql ("duckdb" ),
4355+ duckdb_default_delimiter_sql [dialect ],
43674356 )
4368- self .assertEqual (parse_one ("INITCAP(col)" , read = dialect ).sql ("duckdb" ), expected )
43694357
43704358 # DuckDB generation for BQ/Snowflake calls with custom delimiters arg
43714359 for dialect in ("bigquery" , "snowflake" ):
@@ -4378,49 +4366,41 @@ def duckdb_regex_literal_sql(delimiters: str) -> str:
43784366
43794367 query = "INITCAP(col, NULL)"
43804368 with self .subTest (f"DuckDB generation for { query } from { dialect } " ):
4369+ # NULL delimiters generate verbose REPLACE calls but still evaluate to NULL correctly
4370+ escaped_null = r"REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(NULL, '\', '\\'), '-', '\-'), '^', '\^'), '[', '\['), ']', '\]')"
43814371 self .assertEqual (
43824372 parse_one (query , read = dialect ).sql ("duckdb" ),
4383- "ARRAY_TO_STRING("
4384- "CASE WHEN REGEXP_MATCHES(LEFT(col, 1), NULL ) "
4385- "THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, NULL ), "
4386- "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4387- "ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, NULL ), "
4388- "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4389- "END, '')" ,
4373+ f "ARRAY_TO_STRING("
4374+ f "CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || { escaped_null } || ']' ) "
4375+ f "THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || { escaped_null } || ']+|[^' || { escaped_null } || ']+)' ), "
4376+ f "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4377+ f "ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || { escaped_null } || ']+|[^' || { escaped_null } || ']+)' ), "
4378+ f "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4379+ f "END, '')" ,
43904380 )
43914381
4392- for custom_delimiter in (" " , "@" , " _@" , r"\\" ):
4382+ custom_delimiter_expectations = {
4383+ " " : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || ' ' || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || ' ' || ']+|[^' || ' ' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || ' ' || ']+|[^' || ' ' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4384+ "@" : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || '@' || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || '@' || ']+|[^' || '@' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || '@' || ']+|[^' || '@' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4385+ " _@" : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || ' _@' || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || ' _@' || ']+|[^' || ' _@' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || ' _@' || ']+|[^' || ' _@' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4386+ r"\\" : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || '\\ \\ \\ \\ ' || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || '\\ \\ \\ \\ ' || ']+|[^' || '\\ \\ \\ \\ ' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || '\\ \\ \\ \\ ' || ']+|[^' || '\\ \\ \\ \\ ' || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4387+ "\u000b " : "ARRAY_TO_STRING(CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || CHR(11) || ']') THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || CHR(11) || ']+|[^' || CHR(11) || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || CHR(11) || ']+|[^' || CHR(11) || ']+)'), (seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) END, '')" ,
4388+ }
4389+ for custom_delimiter , expected_duckdb_sql in custom_delimiter_expectations .items ():
43934390 with self .subTest (
43944391 f"DuckDB generation for INITCAP(col, { custom_delimiter } ) from { dialect } "
43954392 ):
43964393 literal_sql = exp .Literal .string (custom_delimiter ).sql (dialect )
43974394 expression = parse_one (f"INITCAP(col, { literal_sql } )" , read = dialect )
4398- duckdb_sql = expression .sql ("duckdb" )
4399- escaped_custom_delimiter = duckdb_regex_literal_sql (custom_delimiter )
44004395 self .assertEqual (
4401- duckdb_sql ,
4402- "ARRAY_TO_STRING("
4403- f"CASE WHEN REGEXP_MATCHES(LEFT(col, 1), '[' || { escaped_custom_delimiter } || ']') "
4404- f"THEN LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || { escaped_custom_delimiter } || ']+|[^' || { escaped_custom_delimiter } || ']+)'), "
4405- f"(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4406- f"ELSE LIST_TRANSFORM(REGEXP_EXTRACT_ALL(col, '([' || { escaped_custom_delimiter } || ']+|[^' || { escaped_custom_delimiter } || ']+)'), "
4407- f"(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END) "
4408- "END, '')" ,
4396+ expression .sql ("duckdb" ),
4397+ expected_duckdb_sql ,
44094398 )
44104399
4411- def escape_expression_sql (sql : str ) -> str :
4412- escaped_sql = sql
4413- for raw , escaped in REGEX_LITERAL_ESCAPES .items ():
4414- raw_sql = exp .Literal .string (raw ).sql ()
4415- escaped_literal_sql = exp .Literal .string (escaped ).sql ()
4416- escaped_sql = f"REPLACE({ escaped_sql } , { raw_sql } , { escaped_literal_sql } )"
4417-
4418- return escaped_sql
4419-
44204400 with self .subTest (
44214401 f"DuckDB generation for INITCAP subquery as custom delimiter arg from { dialect } "
44224402 ):
4423- escaped_subquery = escape_expression_sql ( "( SELECT delimiter FROM settings LIMIT 1)" )
4403+ escaped_subquery = "REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(( SELECT delimiter FROM settings LIMIT 1), ' \\ ', ' \\ \\ '), '-', ' \\ -'), '^', ' \\ ^'), '[', ' \\ ['), ']', ' \\ ]')"
44244404 self .assertEqual (
44254405 parse_one (
44264406 "INITCAP(col, (SELECT delimiter FROM settings LIMIT 1))" , read = dialect
0 commit comments