Skip to content

Instantly share code, notes, and snippets.

@allanbatista
Created March 9, 2020 17:34
Show Gist options
  • Save allanbatista/c16e4ccf2e9d4398a6b2ac6f305f6ef3 to your computer and use it in GitHub Desktop.
Save allanbatista/c16e4ccf2e9d4398a6b2ac6f305f6ef3 to your computer and use it in GitHub Desktop.
Normalização de caracteres especiais direto no bigquery
#standardSQL
# https://en.it1352.com/article/e546eb6a693c4f8db4c58aa717684d8d.html
CREATE TEMP FUNCTION accent2latin(word STRING) AS
((
WITH lookups AS (
SELECT
'ç,æ,œ,á,é,í,ó,ú,à,è,ì,ò,ù,ä,ë,ï,ö,ü,ÿ,â,ê,î,ô,û,å,ø,Ø,Å,Á,À,Â,Ä,È,É,Ê,Ë,Í,Î,Ï,Ì,Ò,Ó,Ô,Ö,Ú,Ù,Û,Ü,Ÿ,Ç,Æ,Œ,ñ' AS accents,
'c,ae,oe,a,e,i,o,u,a,e,i,o,u,a,e,i,o,u,y,a,e,i,o,u,a,o,O,A,A,A,A,A,E,E,E,E,I,I,I,I,O,O,O,O,U,U,U,U,Y,C,AE,OE,n' AS latins
),
pairs AS (
SELECT accent, latin FROM lookups,
UNNEST(SPLIT(accents)) AS accent WITH OFFSET AS p1,
UNNEST(SPLIT(latins)) AS latin WITH OFFSET AS p2
WHERE p1 = p2
)
SELECT STRING_AGG(IFNULL(latin, char), '')
FROM UNNEST(SPLIT(word, '')) char
LEFT JOIN pairs
ON char = accent
));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment