Spaces:
Runtime error
Runtime error
import re | |
import json | |
import requests | |
import pandas as pd | |
from tqdm import tqdm | |
from bs4 import BeautifulSoup | |
from huggingface_hub import HfApi, list_models, list_datasets, list_spaces | |
import gradio as gr | |
# Shared Hugging Face Hub client; get_models uses it to list repositories.
api = HfApi()
def get_models(org_name, which_one):
    """Collect id, download count and like count for an organization's repos.

    Uses the module-level ``api`` client to list the repositories.

    Args:
        org_name: Hub author/organization whose repositories are listed.
        which_one: Repository kind: "models", "datasets" or "spaces".

    Returns:
        A pandas DataFrame with the columns "id", "downloads" and "likes",
        one row per repository. Spaces always report 0 downloads. The frame
        has zero rows (but still these columns) when nothing is found.

    Raises:
        ValueError: If ``which_one`` is not one of the supported kinds.
    """
    # Validate first: an unknown kind used to surface as an UnboundLocalError.
    if which_one not in ("models", "datasets", "spaces"):
        raise ValueError(
            f"which_one must be 'models', 'datasets' or 'spaces', got {which_one!r}"
        )

    # Resolves to api.list_models / api.list_datasets / api.list_spaces.
    list_repos = getattr(api, f"list_{which_one}")
    has_downloads = which_one != "spaces"

    rows = []
    for info in list_repos(author=org_name):
        # A missing or None count is treated as 0 so later sums stay numeric.
        downloads = (getattr(info, "downloads", None) or 0) if has_downloads else 0
        likes = getattr(info, "likes", None) or 0
        rows.append({"id": info.id, "downloads": downloads, "likes": likes})

    return pd.DataFrame(rows, columns=["id", "downloads", "likes"])
def get_most(df_for_most_function):
    """Return the most-downloaded and the most-liked row of a repo table.

    Args:
        df_for_most_function: DataFrame with "id", "downloads" and "likes"
            columns and at least one row.

    Returns:
        A dict with the keys "Most Download" and "Most Likes"; each value is
        a dict holding the "id", "downloads" and "likes" of the winning row.
    """
    def top_row(column):
        # Sort descending on the column and take the first row.
        ranked = df_for_most_function.sort_values(by=[column], ascending=False)
        return ranked.iloc[0]

    summary = {}
    for label, column in (("Most Download", "downloads"), ("Most Likes", "likes")):
        best = top_row(column)
        summary[label] = {field: best[field] for field in ("id", "downloads", "likes")}
    return summary
def get_sum(df_for_sum_function):
    """Total the "downloads" and "likes" columns of a repo table.

    Args:
        df_for_sum_function: DataFrame with "downloads" and "likes" columns.

    Returns:
        A dict of the form ``{"Downloads": <total>, "Likes": <total>}``.
    """
    totals = {}
    for label, column in (("Downloads", "downloads"), ("Likes", "likes")):
        values = df_for_sum_function[column].tolist()
        totals[label] = sum(values)
    return totals
def get_openllm_leaderboard():
    """Scrape the Open LLM Leaderboard Space and list its models in row order.

    The page's second <script> element is parsed as JSON, and the link in
    column 1 of each table row is reduced to its last two path segments
    ("owner/model").

    Returns:
        A list of "owner/model" strings in the order the rows appear. Reading
        stops at the first row that is too short, is not a string, or
        contains no ``href``; the list is empty when the expected component
        is missing.
    """
    url = 'https://huggingfaceh4-open-llm-leaderboard.hf.space/'
    # Without a timeout a stalled Space would block app start-up indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    script_elements = soup.find_all('script')
    # NOTE(review): the [31:-10] slice presumably strips the <script> wrapper
    # and a JS assignment around the JSON payload -- confirm against the page.
    data = json.loads(str(script_elements[1])[31:-10])

    # NOTE(review): component 11 is assumed to be the leaderboard table and
    # column 1 the model link; both depend on the Space's current layout.
    component_index = 11
    name_column = 1
    href_pattern = re.compile(r'href="([^"]*)"')

    try:
        rows = data['components'][component_index]['props']['value']['data']
    except IndexError:
        # Fewer components than expected: nothing to report.
        return []

    result_list = []
    for row in rows:
        try:
            cell = row[name_column].rstrip("\n")
            link = href_pattern.search(cell).group(1)
        except (IndexError, AttributeError):
            # Short row, non-string cell, or no link: stop at the first
            # malformed row.
            break
        result_list.append("/".join(link.split("/")[-2:]))
    return result_list
def get_ranking(model_list, target_org):
    """Find the first model in a ranked list that belongs to an organization.

    The owner is the part of the model id before the first "/", compared
    case-insensitively with ``target_org``.

    Args:
        model_list: Model ids ordered best-first.
        target_org: Organization name to look for.

    Returns:
        ``[rank, model_id]`` with a 1-based rank for the first match, or the
        string "Not Found" when no model matches.
    """
    matches = (
        [position, model_id]
        for position, model_id in enumerate(model_list, start=1)
        if model_id.split("/")[0].lower() == target_org.lower()
    )
    return next(matches, "Not Found")
def make_leaderboard(orgs, which_one):
    """Build the organization ranking table for one repository kind.

    Args:
        orgs: Iterable of organization names to include.
        which_one: Repository kind: "models", "datasets" or "spaces".

    Returns:
        A pandas DataFrame with one row per organization that owns at least
        one repository of the requested kind, sorted descending by
        "Total Downloads" (by "Total Likes" for spaces), with a 1-based
        "Serial Number" as the first column. When no organization qualifies
        the frame has zero rows and only the "Serial Number" column.

    Raises:
        ValueError: If ``which_one`` is not one of the supported kinds.
    """
    nouns = {"models": "Model", "datasets": "Dataset", "spaces": "Space"}
    if which_one not in nouns:
        raise ValueError(
            f"which_one must be 'models', 'datasets' or 'spaces', got {which_one!r}"
        )
    noun = nouns[which_one]
    # Spaces have no download statistics, so those columns are left out.
    has_downloads = which_one != "spaces"

    # Fetched once up front; the Open LLM Leaderboard only applies to models.
    open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
    trend = get_trending_list(1, which_one)

    data_rows = []
    for org in tqdm(orgs, desc=f"Scraping Organizations ({which_one})", position=0, leave=True):
        df = get_models(org, which_one)
        if len(df) == 0:
            # Organizations without any repository of this kind are skipped.
            continue
        num_things = len(df)
        sum_info = get_sum(df)
        most_info = get_most(df)
        rank = get_ranking_trend(trend, org)

        # Insertion order defines the column order of the resulting table,
        # which the UI pairs positionally with its header list.
        row = {"Organization Name": org}
        if has_downloads:
            row["Total Downloads"] = sum_info["Downloads"]
        row["Total Likes"] = sum_info["Likes"]
        row[f"Number of {noun}s"] = num_things
        if which_one == "models":
            llm_rank = get_ranking(open_llm_leaderboard, org)
            found = llm_rank != "Not Found"
            row["Best Model On Open LLM Leaderboard"] = llm_rank[1] if found else llm_rank
            row["Best Rank On Open LLM Leaderboard"] = llm_rank[0] if found else llm_rank
        if has_downloads:
            row[f"Average Downloads per {noun}"] = int(sum_info["Downloads"] / num_things)
        row[f"Average Likes per {noun}"] = int(sum_info["Likes"] / num_things)
        if has_downloads:
            row[f"Most Downloaded {noun}"] = most_info["Most Download"]["id"]
            row["Most Download Count"] = most_info["Most Download"]["downloads"]
        row[f"Most Liked {noun}"] = most_info["Most Likes"]["id"]
        row["Most Like Count"] = most_info["Most Likes"]["likes"]
        row[f"Trending {noun}"] = rank['id']
        row[f"Best Rank at Trending {noun}s"] = rank['rank']
        data_rows.append(row)

    leaderboard = pd.DataFrame(data_rows)
    # An empty frame has no columns, so sorting it by name would raise KeyError.
    if not leaderboard.empty:
        sort_column = "Total Downloads" if has_downloads else "Total Likes"
        leaderboard = leaderboard.sort_values(by=[sort_column], ascending=False)
    leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
    return leaderboard
"""# Start Gradio
"""
# Organization names to rank, read one per line from org_names.txt.
with open("org_names.txt", "r", encoding="utf-8") as f:
    # Strip surrounding whitespace (including "\r" from CRLF files) and skip
    # blank lines so an empty organization name never reaches the Hub queries.
    org_names_in_list = [line.strip() for line in f if line.strip()]
# Markdown introduction rendered at the top of the page via gr.Markdown.
# NOTE(review): the emoji in this text look mis-encoded in this copy of the
# file -- confirm against the deployed Space before shipping.
INTRODUCTION_TEXT = """
π― The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
## Available Dataframes:
- ποΈ Models
- π Datasets
- π Spaces
## Backend
π οΈ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
π οΈ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).
**π Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
**π Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.
"""
def clickable(x, which_one):
    """Wrap a Hub id in an HTML link to its huggingface.co page.

    Args:
        x: Repository or organization id, or the placeholder "Not Found".
        which_one: "models" links to ``huggingface.co/<x>``; any other value
            is used as a path prefix (``huggingface.co/<which_one>/<x>``).

    Returns:
        The anchor markup as a string, or "Not Found" unchanged.
    """
    if x == "Not Found":
        return "Not Found"
    # Model (and organization) pages live at the site root; other kinds are
    # addressed under a prefix named after the kind.
    target = x if which_one == "models" else f"{which_one}/{x}"
    return f'<a target="_blank" href="https://huggingface.co/{target}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{x}</a>'
def models_df_to_clickable(df, columns, which_one):
    """Replace the ids in the given columns with HTML links, in place.

    Args:
        df: Leaderboard DataFrame; it is modified and also returned.
        columns: Names of the columns whose values are turned into links.
        which_one: Repository kind passed to ``clickable`` for every column
            except "Organization Name", which always uses "models".

    Returns:
        The same DataFrame object, with the listed columns rewritten.
    """
    for column in columns:
        # Organization pages use the root-level URL form, like models.
        kind = "models" if column == "Organization Name" else which_one
        linked = df[column].apply(lambda value, kind=kind: clickable(value, kind))
        df[column] = linked
    return df
def get_trending_list(pages, which_one):
    """Download the first ``pages`` pages of the Hub's JSON repo listing.

    Requests ``https://huggingface.co/<which_one>-json?p=<page>`` for each
    page, starting at 0.
    NOTE(review): the listing is presumably in the Hub's default (trending)
    order -- confirm, since the ranks built from it rely on that.

    Args:
        pages: Number of pages to fetch.
        which_one: Repository kind: "models", "datasets" or "spaces".

    Returns:
        A list of dicts in listing order, each with "id" and "likes", plus
        "downloads" for every kind except spaces.
    """
    trending_list = []
    for page in range(pages):
        # A timeout keeps a stalled request from blocking app start-up forever.
        response = requests.get(
            f"https://huggingface.co/{which_one}-json?p={page}",
            timeout=30,
        )
        for thing in response.json()[which_one]:
            entry = {"id": thing["id"]}
            if which_one != "spaces":
                entry["downloads"] = thing["downloads"]
            entry["likes"] = thing["likes"]
            trending_list.append(entry)
    return trending_list
def get_ranking_trend(json_data, org_name):
    """Locate an organization's first entry in a ranked repo listing.

    The owner is the part of each "id" before the first "/", compared with
    ``org_name`` exactly (case-sensitive).

    Args:
        json_data: List of dicts with an "id" key, ordered best-first.
        org_name: Organization name to look for.

    Returns:
        ``{"id": <repo id>, "rank": <1-based position>}`` for the first
        match, or the same keys both set to "Not Found".
    """
    repo_ids = [entry['id'] for entry in json_data]
    owners = [repo_id.split("/")[0] for repo_id in repo_ids]
    for position, (owner, repo_id) in enumerate(zip(owners, repo_ids), start=1):
        if owner == org_name:
            return {"id": repo_id, "rank": position}
    return {"id": "Not Found", "rank": "Not Found"}
# Build the page: a title, the introduction, and one tab per repository kind.
# Each leaderboard is computed here, while the layout is being assembled.
# (The earlier standalone `demo = gr.Blocks()` was dropped: the `with`
# statement below rebinds `demo` immediately, so that instance was never used.)
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">π€ Organization Leaderboard</h1>""")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # In every tab, `headers` and `datatype` are positional: entry i describes
    # column i of the DataFrame, and "markdown" marks the link columns.
    with gr.TabItem("ποΈ Models", id=1):
        columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
        models_df = make_leaderboard(org_names_in_list, "models")
        models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
        headers = ["π’ Serial Number", "π’ Organization Name", "π₯ Total Downloads", "π Total Likes", "π€ Number of Models", "π Best Model On Open LLM Leaderboard", "π₯ Best Rank On Open LLM Leaderboard", "π Average Downloads per Model", "π Average Likes per Model", "π Most Downloaded Model", "π Most Download Count", "β€οΈ Most Liked Model", "π Most Like Count", "π₯ Trending Model", "π Best Rank at Trending Models"]
        gr.Dataframe(models_df.head(400), headers=headers, interactive=True, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str", "markdown", "str"])

    with gr.TabItem("π Datasets", id=2):
        columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
        dataset_df = make_leaderboard(org_names_in_list, "datasets")
        dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
        headers = ["π’ Serial Number", "π’ Organization Name", "π₯ Total Downloads", "π Total Likes", "π Number of Datasets", "π Average Downloads per Dataset", "π Average Likes per Dataset", "π Most Downloaded Dataset", "π Most Download Count", "β€οΈ Most Liked Dataset", "π Most Like Count", "π₯ Trending Dataset", "π Best Rank at Trending Datasets"]
        gr.Dataframe(dataset_df.head(250), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "str", "str", "markdown", "str", "markdown", "str", "markdown", "str"])

    with gr.TabItem("π Spaces", id=3):
        columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
        spaces_df = make_leaderboard(org_names_in_list, "spaces")
        spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
        headers = ["π’ Serial Number", "π’ Organization Name", "π Total Likes", "π Number of Spaces", "π Average Likes per Space", "β€οΈ Most Liked Space", "π Most Like Count", "π₯ Trending Space", "π Best Rank at Trending Spaces"]
        gr.Dataframe(spaces_df.head(150), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])

demo.launch()