Skip to content

Instantly share code, notes, and snippets.

@wolever
Last active October 3, 2022 09:47
Show Gist options
  • Save wolever/1a5ccf6396f00229b2dc to your computer and use it in GitHub Desktop.
Save wolever/1a5ccf6396f00229b2dc to your computer and use it in GitHub Desktop.
Parse and escape a query string so it's safe to use with Postgres' `to_tsquery(…)`
import re
from nose.tools import assert_equal
from nose_parameterized import parameterized
from tsquery_escape import tsquery_escape
@parameterized([
("1 OR 2", "1 | 2"),
("(1) 2", "( 1 ) & 2"),
("&", "'&':*"),
('"hello world"', "'hello world'"),
("not 1", "! 1"),
("1 and not (2 or 3)", "1 & ! ( 2 | 3 )"),
("not and and 1) or ( 2 not", "! 1 | ( 2 )"),
("() 1", "1"),
("1 2 3", "1 & 2 & 3"),
("'&' |", "'&':* & '|':*"),
(") and 1 (2 or", "1 & ( 2 )"),
("it's '", "'its':*"),
("(1)", "( 1 )"),
("1((", "1"),
])
def test_tsquery_escape(input, expected):
expected = re.sub("([0-9])", r"'\1':*", expected)
actual = tsquery_escape(input)
assert_equal(actual, expected)
import re
def tsquery_escape(term):
""" Escape a query string so it's safe to use with Postgres'
``to_tsquery(...)``. Single quotes are ignored, double quoted strings
are used as literals, and the logical operators 'and', 'or', 'not',
'(', and ')' can be used:
>>> tsquery_escape("Hello")
"'hello':*"
>>> tsquery_escape('"Quoted string"')
"'quoted string'"
>>> tsquery_escape("multiple terms OR another")
"'multiple':* & 'terms':* | 'another':*"
>>> tsquery_escape("'\"*|")
"'\"*|':*"
>>> tsquery_escape('not foo and (bar or "baz")')
"! 'foo':* & ( 'bar':* | 'baz' )"
"""
magic_terms = {
"and": "&",
"or": "|",
"not": "!",
"OR": "|",
"AND": "&",
"NOT": "!",
"(": "(",
")": ")",
}
magic_values = set(magic_terms.values())
paren_count = 0
res = []
bits = re.split(r'((?:".*?")|[()])', term)
for bit in bits:
if not bit:
continue
split_bits = (
[bit] if bit.startswith('"') and bit.endswith('"') else
bit.strip().split()
)
for bit in split_bits:
if not bit:
continue
if bit in magic_terms:
bit = magic_terms[bit]
last = res and res[-1] or ""
if bit == ")":
if last == "(":
paren_count -= 1
res.pop()
continue
if paren_count == 0:
continue
if last in magic_values and last != "(":
res.pop()
elif bit == "|" and last == "&":
res.pop()
elif bit == "!":
pass
elif bit == "(":
pass
elif last in magic_values or not last:
continue
if bit == ")":
paren_count -= 1
elif bit == "(":
paren_count += 1
res.append(bit)
if bit == ")":
res.append("&")
continue
bit = bit.replace("'", "")
if not bit:
continue
if bit.startswith('"') and bit.endswith('"'):
res.append(bit.replace('"', "'"))
else:
res.append("'%s':*" %(bit.replace("'", ""), ))
res.append("&")
while res and res[-1] in magic_values:
last = res[-1]
if last == ")":
break
if last == "(":
paren_count -= 1
res.pop()
while paren_count > 0:
res.append(")")
paren_count -= 1
return " ".join(res)
@reinoldus
Copy link

Hi,
thank you for sharing this snippet! I think I found two bugs:

The following queries:
" and ""
get converted to:
' and ''

which throws a syntax error

A quick fix would be to replace line 78 with this one: if not bit or bit in ['""', '"']:
There is probably a better solution

@Mariownyou
Copy link

it breaks if string contains :

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment