Last active
February 16, 2024 20:52
-
-
Save mikesamuel/19170cb0bcea6e475bc88838bcbfe6d6 to your computer and use it in GitHub Desktop.
Exploring how regexes work differently on engines with different code-unit sizes / assumed encodings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Matching code-units that have all the same size OK. | |
applying /a/ to '"aό𝄞"' | |
. py UTF-8 | |
. . 00000000 61 |a| | |
. . 00000001 | |
. js UTF-16 | |
. . 00000000 61 |a| | |
. . 00000001 | |
. py UTF-32 | |
. . 00000000 61 |a| | |
. . 00000001 | |
Matching all, excluding only units that have the same size OK. | |
applying /[^b]*/ to '"aό𝄞"' | |
. py UTF-8 | |
. . 00000000 61 cf 8c f0 9d 84 9e |a......| | |
. . 00000007 | |
. js UTF-16 | |
. . 00000000 61 cf 8c f0 9d 84 9e |a......| | |
. . 00000007 | |
. py UTF-32 | |
. . 00000000 61 cf 8c f0 9d 84 9e |a......| | |
. . 00000007 | |
Matching a limited number that includes code-units of different sizes, NOT OK. | |
applying /../ to '"aό𝄞"' | |
. py UTF-8 | |
. . 00000000 61 cf |a.| | |
. . 00000002 | |
. js UTF-16 | |
. . 00000000 61 cf 8c |a..| | |
. . 00000003 | |
. py UTF-32 | |
. . 00000000 61 cf 8c |a..| | |
. . 00000003 | |
Same. Matching different sizes, non-exhaustively, NOT OK. | |
applying /^[^\n]{3}/ to '"aό𝄞"' | |
. py UTF-8 | |
. . 00000000 61 cf 8c |a..| | |
. . 00000003 | |
. js UTF-16 | |
. . 00000000 61 cf 8c ef bf bd |a.....| | |
. . 00000006 | |
. py UTF-32 | |
. . 00000000 61 cf 8c f0 9d 84 9e |a......| | |
. . 00000007 | |
Range has different interpretations for different encodings. NOT OK. | |
applying /[^\x00-\x7f]{3}/ to '"aό𝄞"' | |
. py UTF-8 | |
. . 00000000 cf 8c f0 |...| | |
. . 00000003 | |
. js UTF-16 | |
. . 00000000 cf 8c f0 9d 84 9e |......| | |
. . 00000006 | |
. py UTF-32 | |
. . 00000000 3c 4e 4f 20 4d 41 54 43 48 3e |<NO MATCH>| | |
. . 0000000a | |
Same. Range has different interpretation for UTF-16 than UTF-32. NOT OK. | |
applying /[\ud800-\udfff]/ to '"aό𝄞"' | |
. py UTF-8 | |
. . 00000000 3c 4e 4f 20 4d 41 54 43 48 3e |<NO MATCH>| | |
. . 0000000a | |
. js UTF-16 | |
. . 00000000 ef bf bd |...| | |
. . 00000003 | |
. py UTF-32 | |
. . 00000000 3c 4e 4f 20 4d 41 54 43 48 3e |<NO MATCH>| | |
. . 0000000a | |
Exhaustive, but different interpretation bites us again. NOT OK. | |
applying /[\x80-\xff]+/ to '"aό𝄞"' | |
. py UTF-8 | |
. . 00000000 cf 8c f0 9d 84 9e |......| | |
. . 00000006 | |
. js UTF-16 | |
. . 00000000 3c 4e 4f 20 4d 41 54 43 48 3e |<NO MATCH>| | |
. . 0000000a | |
. py UTF-32 | |
. . 00000000 3c 4e 4f 20 4d 41 54 43 48 3e |<NO MATCH>| | |
. . 0000000a |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#######################################
# Pipe helper: hexdump stdin and prefix every resulting line with '. . ' so
# it nests under the per-engine heading printed by applyRegex.
#######################################
_dumpMatchBytes() {
  hexdump -C | sed 's/^/. . /'
}

#######################################
# Run one regex against one input on three engines and hexdump what each
# engine matched:
#   - Python `regex` with bytes pattern and UTF-8 encoded bytes input
#   - JavaScript RegExp on a JS string (labelled UTF-16)
#   - Python `regex` with str pattern and str input (labelled UTF-32)
# Arguments:
#   $1 - regex source text (backslash escapes are left for the regex engine)
#   $2 - input text as a JSON string literal, surrounding quotes included
# Outputs:
#   A heading line, then for each engine a label line followed by the
#   hexdump of the first match (or of the text <NO MATCH>), then a blank line.
# Requires on PATH: python3 (with the third-party `regex` module), node,
#   hexdump, sed.
#######################################
applyRegex() {
  local regex="$1"
  local input="$2"

  printf "applying /%s/ to '%s'\n" "$regex" "$input"

  # The pattern and input are handed to each interpreter as arguments rather
  # than spliced into its source text, so quotes, slashes or backticks in
  # either value cannot break or alter the embedded program.
  #
  # Each program writes the match without a trailing newline, so the hexdump
  # shows exactly the matched bytes (including any newlines inside the match).

  printf '%s\n' '. py UTF-8'
  python3 -c '
import json
import sys

import regex

pattern, text = sys.argv[1:3]
found = regex.search(pattern.encode("UTF-8"), json.loads(text).encode("UTF-8"))
sys.stdout.buffer.write(b"<NO MATCH>" if found is None else found.group(0))
' "$regex" "$input" | _dumpMatchBytes

  printf '%s\n' '. js UTF-16'
  # slice(-2) picks the two trailing arguments regardless of how many leading
  # entries node puts in process.argv.
  node -e '
const [pattern, text] = process.argv.slice(-2);
const found = new RegExp(pattern).exec(JSON.parse(text));
process.stdout.write(found?.[0] ?? "<NO MATCH>");
' -- "$regex" "$input" | _dumpMatchBytes

  printf '%s\n' '. py UTF-32'
  python3 -c '
import json
import sys

import regex

pattern, text = sys.argv[1:3]
found = regex.search(pattern, json.loads(text))
sys.stdout.write("<NO MATCH>" if found is None else found.group(0))
' "$regex" "$input" | _dumpMatchBytes

  echo
}
# Sample input, written as a JSON string literal (quotes included) because
# applyRegex JSON-decodes its second argument. Its three characters take a
# different number of code units depending on the encoding:
#   a  U+0061   1 UTF-8 byte,  1 UTF-16 unit
#   ό  U+03CC   2 UTF-8 bytes, 1 UTF-16 unit
#   𝄞  U+1D11E  4 UTF-8 bytes, 2 UTF-16 units (a surrogate pair)
# The file must be saved as UTF-8 for the literal below to carry these bytes.
THREE_SIZES='"aό𝄞"'

# Each case prints a one-line verdict, then the per-engine match hexdumps.

printf '%s\n' 'Matching code-units that have all the same size OK.'
applyRegex 'a' "$THREE_SIZES"

printf '%s\n' 'Matching all, excluding only units that have the same size OK.'
applyRegex '[^b]*' "$THREE_SIZES"

printf '%s\n' 'Matching a limited number that includes code-units of different sizes, NOT OK.'
applyRegex '..' "$THREE_SIZES"

printf '%s\n' 'Same. Matching different sizes, non-exhaustively, NOT OK.'
applyRegex '^[^\n]{3}' "$THREE_SIZES"

printf '%s\n' 'Range has different interpretations for different encodings. NOT OK.'
applyRegex '[^\x00-\x7f]{3}' "$THREE_SIZES"

printf '%s\n' 'Same. Range has different interpretation for UTF-16 than UTF-32. NOT OK.'
applyRegex '[\ud800-\udfff]' "$THREE_SIZES"

printf '%s\n' 'Exhaustive, but different interpretation bites us again. NOT OK.'
applyRegex '[\x80-\xff]+' "$THREE_SIZES"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment