Skip to content

Instantly share code, notes, and snippets.

@philschmid
Created March 20, 2024 08:04
Show Gist options
  • Save philschmid/c9d1dbfbd64de3a88f6d1b7fcdbf034b to your computer and use it in GitHub Desktop.
import requests as r
from huggingface_hub import HfFolder
from tqdm import tqdm
from datasets import Dataset
# Shared HTTP session authenticated with the locally stored Hugging Face token,
# so every Hub API request below carries the Authorization header automatically.
headers = {"Authorization": f"Bearer {HfFolder.get_token()}"}
sess = r.Session()
sess.headers.update(headers)
# Model architectures (Hub `model_type` values) supported by optimum-neuron / Inferentia2.
# BUG FIX: the original had `"opt" "albert",` — implicit string concatenation
# produced the bogus entry "optalbert" and silently dropped both real names.
SUPPORTED_ARCHITECTURES = [
    "llama",
    "mistral",
    "gpt2",
    "clip",
    "bloom",
    "opt",
    "albert",
    "bert",
    "camembert",
    "convbert",
    "deberta",
    "deberta-v2",
    "distilbert",
    "electra",
    "esm",
    "flaubert",
    "mobilebert",
    "mpnet",
    "phi",
    "roberta",
    "roformer",
    "xlm",
    "xlm-roberta",
    "t5",
    "stable-diffusion",
    "stable-diffusion-xl",
    "latent-consistency",
]
def get_architecture(model_id):
    """Return the architecture (``model_type``) of a Hub model.

    Queries the Hub model-info API; if the payload has no
    ``config.model_type`` (e.g. diffusion pipelines), falls back to
    classifying by tags. Returns "N/A" when nothing matches.
    """
    url = f"https://huggingface.co/api/models/{model_id}"
    response = sess.get(url).json()
    try:
        return response["config"]["model_type"]
    except KeyError:  # was a bare `except:` — narrowed to the expected failure
        # `tags` may itself be absent; .get avoids a second uncaught KeyError.
        tags = response.get("tags", [])
        # NOTE(review): "stable-diffusion" is checked first, so a model tagged
        # with both would classify as plain SD — preserved from the original;
        # confirm this ordering is intended for SDXL repos.
        if "stable-diffusion" in tags:
            return "stable-diffusion"
        elif "stable-diffusion-xl" in tags:
            return "stable-diffusion-xl"
        else:
            return "N/A"
def is_model_cached(model_id):
    """Return True if the model has at least one optimum-neuron cached config.

    Best-effort: any lookup failure (network error, non-JSON response,
    missing key) is treated as "not cached" rather than raised.
    """
    url = f"https://optimum-neuron.huggingface.tech/lookup/{model_id}"
    try:
        response = sess.get(url).json()
        # bool() replaces the redundant `True if len(...) > 0 else False`.
        return bool(response["cached_configs"])
    except Exception:  # was a bare `except:` — no longer swallows SystemExit/KeyboardInterrupt
        return False
def get_top_100_models(limit=100, type="likes30d", filter="text-generation-inference"):
    """Fetch the top Hub models and annotate them for Inferentia2 suitability.

    Args:
        limit: maximum number of models to request from the Hub API.
        type: Hub sort key (e.g. "likes30d", "downloads").
        filter: Hub tag filter applied server-side.
            BUG FIX: the original accepted this parameter but never put it in
            the request URL, so the advertised filtering silently never happened.

    Returns:
        list[dict]: one record per kept model with id, url, architecture,
        supported/cached flags, license, and popularity counters.
    """
    url = (
        f"https://huggingface.co/api/models"
        f"?sort={type}&direction=-1&limit={limit}&filter={filter}"
    )
    response = sess.get(url).json()
    filtered_models = []
    for model in tqdm(response, desc="Filtering models", total=len(response)):
        try:
            # Resolve the architecture and whether optimum-neuron supports it.
            arch = get_architecture(model["id"])
            supported = arch in SUPPORTED_ARCHITECTURES
            # Skip GGUF-only repos (quantized weights, not TGI-servable).
            if "gguf" in model["tags"] and "text-generation-inference" not in model["tags"]:
                continue
            # License is encoded as a "license:<value>" tag when present.
            license_value = next(
                (
                    tag.split(":", 1)[1]
                    for tag in model["tags"]
                    if tag.startswith("license:")
                ),
                "N/A",
            )
            _cached = is_model_cached(model["id"])
            filtered_models.append(
                {
                    "model_id": model["id"],
                    "url": f"https://huggingface.co/{model['id']}",
                    "architecture": arch,
                    "supported": supported,
                    "cached": _cached,
                    "license": license_value,
                    "likes30d": model["likes30d"],
                    "likes": model["likes"],
                    "downloads": model["downloads"],
                }
            )
        except Exception as e:
            # Best-effort scrape: log and move on when a record is malformed.
            print(e)
            print(f"Error parsing model {model['id']}")
            continue
    return filtered_models
# Script entry point: collect the annotated top-model records and export them
# to a CSV via the `datasets` library.
top_models = get_top_100_models()
Dataset.from_list(top_models).to_csv("inf2_top_100.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment