@almugabo · Created February 23, 2024
Compare the tokenizers of Gemma and (Tiny)Llama: count how many tokens each produces for the same sample of text files.
import os
import random

import pandas as pd
from transformers import AutoTokenizer

xFld = '....' # folder with a bunch of text files

# load the two tokenizers (4-bit quantised checkpoints from unsloth)
xtok_tl = AutoTokenizer.from_pretrained('unsloth/tinyllama-bnb-4bit')
xtok_ge = AutoTokenizer.from_pretrained('unsloth/gemma-2b-bnb-4bit')

print('vocab_size llama:', xtok_tl.vocab_size)
print('vocab_size gemma:', xtok_ge.vocab_size)

def len_tl(xtext):
    '''number of tokens the TinyLlama tokenizer produces for xtext'''
    return len(xtok_tl.tokenize(xtext))

def len_ge(xtext):
    '''number of tokens the Gemma tokenizer produces for xtext'''
    return len(xtok_ge.tokenize(xtext))

# quick sanity check on a short string
xstr = 'everybody is happier when they see the loved ones'
print('llama:', len_tl(xstr), 'gemma:', len_ge(xstr))

# count tokens for a random sample of 1000 files with both tokenizers
xfiles = [os.path.join(xFld, x) for x in random.sample(os.listdir(xFld), 1000)]
xlst_res = []
for xfile in xfiles:
    with open(xfile, 'r') as xff:
        xtext = xff.read()
    xdata = {}
    xdata['tllama'] = len_tl(xtext)
    xdata['gemma'] = len_ge(xtext)
    xlst_res.append(xdata)

t1 = pd.DataFrame(xlst_res)
t1.tllama.hist()
t1.gemma.hist()
t1.describe()
'''
            tllama        gemma
count   1000.00000  1000.000000
mean     571.06400   496.238000
std      223.80022   192.722971
min       76.00000    64.000000
25%      417.00000   363.750000
50%      555.00000   487.500000
75%      690.00000   604.000000
max     1947.00000  1595.000000
'''
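On this sample the Gemma tokenizer needs roughly 13% fewer tokens on average than TinyLlama's (mean 496 vs. 571), plausibly a consequence of its much larger vocabulary (compare the vocab_size prints above). A minimal follow-up sketch, assuming the t1 DataFrame from above is still in scope, that summarises the per-document ratio directly rather than only the two marginal distributions:

# ratio < 1 means gemma used fewer tokens than tinyllama for that file
t1['ratio'] = t1['gemma'] / t1['tllama']
print(t1['ratio'].describe())
print('share of files where gemma is shorter:', (t1['ratio'] < 1).mean())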