Skip to content

Instantly share code, notes, and snippets.

@shreyansh26
Created February 21, 2024 19:00
Show Gist options
  • Save shreyansh26/1f7d491d6bacc83692f51048ddf16c2b to your computer and use it in GitHub Desktop.
Save shreyansh26/1f7d491d6bacc83692f51048ddf16c2b to your computer and use it in GitHub Desktop.
import json
import sentencepiece as spm
import sentencepiece.sentencepiece_model_pb2 as sp_pb2
from google.protobuf.json_format import MessageToDict
# Location of the serialized SentencePiece model to inspect.
PATH = "tokenizer.model"
s = spm.SentencePieceProcessor(model_file=PATH)

# Get vocab: map each token id to the text obtained by detokenizing that id
# on its own.
# NOTE(review): if the raw piece strings are wanted instead of decoded text,
# `id_to_piece` may be the better call — confirm intended output.
dc = {i: s.detokenize(i) for i in range(s.vocab_size())}

# json.dump serializes the integer ids as string keys.
with open('vocab.json', 'w', encoding='utf-8') as f:
    json.dump(dc, f, indent=4)
# Dump the model protobuf: parse the raw model bytes into a ModelProto and
# write its dictionary form to config.json.
mp = sp_pb2.ModelProto()
# Read through a context manager so the file handle is closed promptly
# instead of being left for the garbage collector.
with open(PATH, "rb") as model_file:
    mp.ParseFromString(model_file.read())
dict_obj = MessageToDict(mp)
with open('config.json', 'w', encoding='utf-8') as f:
    json.dump(dict_obj, f, indent=4)

# Directly print values
print(mp.trainer_spec)
print(mp.normalizer_spec)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment