Skip to content

Instantly share code, notes, and snippets.

@Taytay
Created August 2, 2024 12:37
Show Gist options
  • Save Taytay/5b516f7caf22483d12c6617f621d7e87 to your computer and use it in GitHub Desktop.
Save Taytay/5b516f7caf22483d12c6617f621d7e87 to your computer and use it in GitHub Desktop.
Snowflake enhanced regex_substr. This allows you to use python style regexes in Snowflake.
-- Originally found here: https://github.com/sfc-gh-gpavlik/SnowflakeUDFs/blob/main/RegularExpressions/regexp2.sql
-- Edited to use python and group_num by taytay: https://github.com/taytay/
/********************************************************************************************************
* *
* Snowflake Regular Expression Extensions *
* *
* Copyright (c) 2020 Snowflake Computing Inc. All rights reserved. *
* *
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in *
* compliance with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software distributed under the License *
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or *
* implied. See the License for the specific language governing permissions and limitations under the *
* License. *
* *
* Copyright (c) 2020, 2021, 2022, 2023 Snowflake Computing Inc. All rights reserved. *
* *
********************************************************************************************************/
CREATE OR REPLACE FUNCTION REGEXP_SUBSTR2(
SUBJECT STRING,
PATTERN STRING,
POSITION FLOAT,
OCCURRENCE FLOAT,
PARAMETERS STRING,
GROUP_NUM FLOAT
)
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.10'
PACKAGES = ('pandas', 'nltk')
HANDLER = 'regexp_substr2'
AS $$
import re
from re import Match, Pattern
from typing import Optional
import pandas as pd
from _snowflake import vectorized
def compile_pattern(pattern: str, parameters: str) -> Pattern[str]:
regex_flags = 0
if parameters is not None and parameters != '':
if 'i' in parameters:
regex_flags |= re.IGNORECASE
if 'm' in parameters:
regex_flags |= re.MULTILINE
if 's' in parameters:
regex_flags |= re.DOTALL
if 'u' in parameters:
regex_flags |= re.UNICODE
if 'x' in parameters:
regex_flags |= re.VERBOSE
return re.compile(pattern, regex_flags)
def process_row(subject: str, pattern_compiled: Pattern[str], position: float, occurrence: float, parameters: str, group_num: float) -> Optional[str]:
# Slice the subject string from the given position
str_part = subject[position:]
match : Match[str] | None = None
# Find the nth occurrence
for _ in range(occurrence):
match = pattern_compiled.search(str_part)
if not match:
return ""
str_part = str_part[match.start() + 1:]
if match is not None:
if 'e' in parameters:
group_count = len(match.groups())
if group_count > 0:
group_index = int(group_num) if group_num is not None else 1
if 1 <= group_index <= group_count:
return match.group(group_index)
return match.group(0)
else:
return ""
@vectorized(input=pd.DataFrame)
def regexp_substr2(params: pd.DataFrame) -> pd.Series:
subject_series = params[0]
pattern = params[1][0]
position = params[2][0]
occurrence = params[3][0]
parameters = params[4][0]
group_num = params[5][0]
# Adjust parameters if necessary
occurrence = int(occurrence)
if occurrence < 1:
occurrence = 1
position = int(position) - 1
if position < 0:
position = 0
if group_num is None or group_num < 1:
group_num = 1
else:
group_num = int(group_num)
if group_num < 1:
group_num = 1
pattern_compiled = compile_pattern(pattern, parameters)
return subject_series.apply(
lambda row: process_row(
row, pattern_compiled, position, occurrence, parameters, group_num
)
)
$$;
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float, PARAMETERS string)
returns string
language SQL
as
$$
regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, PARAMETERS, 0)
$$;
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float)
returns string
language SQL
as
$$
regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, '', 0)
$$;
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float)
returns string
language SQL
as
$$
regexp_substr2(SUBJECT, PATTERN, POSITION, 1, '', 0)
$$;
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string)
returns string
language SQL
as
$$
regexp_substr2(SUBJECT, PATTERN, 1, 1, '', 0)
$$;
-- Returns the second group, which is ABCDEF
SELECT REGEXP_SUBSTR2('1234567890ABCDEF', $$
(\d+)
(\D+).* # Look, I've got comments in my regex!
$$, 1, 1, 'eix', 2);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment