Created
July 1, 2017 03:21
-
-
Save taybin/1e9b037f6b0e83a5ab2f97959da13ee6 to your computer and use it in GitHub Desktop.
Technique for looking up stopwards from a table, instead of a static file in postgres
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE IF NOT EXISTS stopwords ( | |
word VARCHAR(20) NOT NULL; | |
); | |
CREATE OR REPLACE FUNCTION remove_stopwords(string TEXT, OUT tsv TSVECTOR) AS | |
$$ | |
BEGIN | |
SELECT string_agg(words.*, ' ')::tsvector FROM | |
-- split tsvector into row for each word | |
regexp_split_to_table( | |
( | |
SELECT to_tsvector('simple'::regconfig, string)::text), E'\\s+') words(word) | |
WHERE NOT EXISTS ( | |
SELECT 1 FROM stopwords sw WHERE sw.word = ( | |
-- break word out of 'hello':323 pattern | |
SELECT w[1] FROM regexp_matches(words.word, E'''(\\w+)'':') AS w | |
) | |
) INTO tsv; | |
END; | |
$$ LANGUAGE plpgsql; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment