Created
August 21, 2023 04:59
-
-
Save opparco/d86ef144604b38e594de3a1bb3730e4c to your computer and use it in GitHub Desktop.
debug tokenizer of lmsys/vicuna-13b-v1.3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# debug tokenizer of lmsys/vicuna-13b-v1.3
#
from transformers import AutoTokenizer

# Shared tokenizer instance used by the debug helpers below. It is resolved by
# name through AutoTokenizer.from_pretrained (NOTE(review): presumably this
# fetches from the Hugging Face Hub on first use -- confirm for offline runs).
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-13b-v1.3")
def encode_decode(string: str) -> None:
    """Round-trip *string* through the tokenizer and print the result.

    The text is encoded to token ids without special tokens, the whole id
    sequence is decoded back to text in a single call, and then the ids, the
    decoded text, and the UTF-8 bytes of the decoded text are printed.
    """
    ids = tokenizer.encode(string, add_special_tokens=False)
    decoded = tokenizer.decode(ids)
    # The raw bytes make it visible when the decoded text differs from the
    # input even though it may render similarly.
    u8 = decoded.encode('utf-8')
    print(ids, decoded, u8)
def encode_decode_each_id(string: str) -> None:
    """Encode *string* and print each token id decoded on its own.

    The text is encoded to token ids without special tokens. Each id is then
    decoded individually (as a one-element sequence) and printed on its own
    line together with the decoded text and that text's UTF-8 bytes.
    """
    ids = tokenizer.encode(string, add_special_tokens=False)
    for i in ids:
        # Decode one id at a time so the contribution of every single token
        # can be inspected separately from its neighbours.
        decoded = tokenizer.decode([i])
        u8 = decoded.encode('utf-8')
        print(i, decoded, u8)
# Sample input: first the whole-sequence round trip, then the per-token view.
string = '神々の黄昏'
encode_decode(string)
encode_decode_each_id(string)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment