Spaces:

jsulz
/

repo-info

Running

File size: 5,103 Bytes

eea405a
 
 
 
2b02896
eea405a
 
cc6d57f
eea405a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add58d5
eea405a
 
add58d5
 
 
eea405a
 
 
 
 
 
 
2b02896
 
 
 
 
 
 
 
 
 
 
b4cf22f
2b02896
 
eea405a
 
 
 
 
add58d5
 
 
b4cf22f
add58d5
 
 
 
 
 
 
 
 
b4cf22f
add58d5
eea405a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4cf22f
add58d5
eea405a
 
 
b4cf22f
eea405a
 
 
 
 
cc6d57f
b4cf22f
 
 
 
 
eea405a
 
 
 
 
 
 
b4cf22f
eea405a
 
 
 
 
b4cf22f
 
 
 
eea405a
 
 
 
b4cf22f
eea405a

# pylint: disable=no-member
import gradio as gr
import requests
from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError
import pandas as pd
import plotly.express as px
from gradio_huggingfacehub_search import HuggingfaceHubSearch

# Hub API client created once at import time with no explicit arguments;
# repo_files() and repo_size() both issue their Hub queries through it.
HF_API = HfApi()


def format_repo_size(r_size: int) -> str:
    """Render a byte count as a human-readable string with two decimals.

    The value is scaled by powers of 1024 through B, KB, MB, GB and TB.
    Anything that is still 1024 or more after the TB step is reported in PB,
    so very large inputs can yield a PB figure above 1024.

    Args:
        r_size: Size in bytes.

    Returns:
        A string such as ``"1.50 KB"``.
    """
    suffixes = ("B", "KB", "MB", "GB", "TB", "PB")
    scaled = r_size
    # Walk every unit except the largest; stop at the first one that fits.
    for suffix in suffixes[:-1]:
        if scaled >= 1024:
            scaled /= 1024
            continue
        return f"{scaled:.2f} {suffix}"
    # Fell through every smaller unit: report in the largest one available.
    return f"{scaled:.2f} {suffixes[-1]}"


def repo_files(r_type: str, r_id: str) -> dict:
    """Aggregate the files of a Hub repository by file extension.

    Args:
        r_type: Repository type, forwarded to ``HF_API.repo_info`` as
            ``repo_type``.
        r_id: Repository ID, forwarded as ``repo_id``.

    Returns:
        A dict mapping each extension to ``{"size": total_bytes,
        "count": number_of_files}``. The extension is the text after the last
        dot of the file's base name; a file whose name has no dot is keyed by
        the file name itself. The dict is empty when the repository lists no
        files.

    Raises:
        Whatever ``HF_API.repo_info`` raises; the caller in this file handles
        ``RepositoryNotFoundError``.
    """
    r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
    files = {}
    # Defensive: tolerate a missing sibling list instead of failing on iteration.
    for sibling in r_info.siblings or []:
        # Drop directory components first so a dot in a folder name
        # (e.g. "v1.0/model") cannot leak path fragments into the key.
        filename = sibling.rfilename.rsplit("/", 1)[-1]
        ext = filename.rsplit(".", 1)[-1]
        # Key order ("size" then "count") is kept because the caller renames
        # the resulting DataFrame columns positionally.
        entry = files.setdefault(ext, {"size": 0, "count": 0})
        # Defensive: a file reported without a size counts as 0 bytes rather
        # than raising TypeError on the addition.
        entry["size"] += sibling.size or 0
        entry["count"] += 1
    return files


def repo_size(r_type, r_id):
    """Look up the total tree size of every branch of a Hub repository.

    Each branch returned by ``HF_API.list_repo_refs`` is queried through the
    ``/api/{r_type}s/{r_id}/treesize/{branch}`` endpoint on huggingface.co.

    Args:
        r_type: Repository type; pluralised with an ``s`` to form the URL.
        r_id: Repository ID.

    Returns:
        A dict mapping branch name to the ``"size"`` value of the response.
        Branches whose request fails or whose response has no ``"size"`` are
        left out. If any response carries an ``"error"`` mentioning
        "restricted", a Gradio warning is shown and an empty dict is returned
        immediately.
    """
    sizes_by_branch = {}
    for ref in HF_API.list_repo_refs(repo_id=r_id, repo_type=r_type).branches:
        try:
            payload = requests.get(
                f"https://huggingface.co/api/{r_type}s/{r_id}/treesize/{ref.name}",
                timeout=1000,
            ).json()
        except Exception:
            # Best effort: any request or decoding failure is treated as an
            # empty response, so this branch is simply skipped below.
            payload = {}

        error = payload.get("error")
        if error and "restricted" in error:
            gr.Warning(f"Branch information for {r_id} not available.")
            return {}

        branch_bytes = payload.get("size")
        if branch_bytes is None:
            continue
        sizes_by_branch[ref.name] = branch_bytes
    return sizes_by_branch


def _hidden_results():
    """Return updates that hide all five result components."""
    return (
        gr.Row(visible=False),
        gr.Dataframe(visible=False),
        gr.Plot(visible=False),
        gr.Row(visible=False),
        gr.Dataframe(visible=False),
    )


def get_repo_info(r_type, r_id):
    """Gather file and branch statistics for a repository as Gradio updates.

    Args:
        r_type: Repository type selected in the UI.
        r_id: Repository ID selected in the UI.

    Returns:
        A 5-tuple ``(gr.Row, gr.Dataframe, gr.Plot, gr.Row, gr.Dataframe)``:
        the results row, the per-extension table, the per-extension pie chart,
        the branch row and the branch-size table. All five are hidden when the
        repository is not found or has no files; the last two are hidden when
        no branch sizes are available.
    """
    try:
        repo_sizes = repo_size(r_type, r_id)
        repo_files_info = repo_files(r_type, r_id)
    except RepositoryNotFoundError:
        gr.Warning(
            "Repository not found. Make sure you've entered a valid repo ID and type that corresponds to the repository."
        )
        return _hidden_results()

    if not repo_files_info:
        # An empty mapping yields a frame without a "size" column, which would
        # make sort_values(by="size") below raise KeyError.
        gr.Warning(f"No files found in {r_id}.")
        return _hidden_results()

    # One row per extension, largest total size first.
    rf_sizes_df = (
        pd.DataFrame(repo_files_info)
        .T.reset_index(names="ext")
        .sort_values(by="size", ascending=False)
    )
    rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
    # Positional rename: ext, size, count, formatted_size.
    rf_sizes_df.columns = ["Extension", "bytes", "Count", "Size"]

    if not repo_sizes:
        # No branch sizes (restricted repo or every lookup failed): keep the
        # branch section hidden.
        r_sizes_component = gr.Dataframe(visible=False)
        b_block = gr.Row(visible=False)
    else:
        r_sizes_df = pd.DataFrame(repo_sizes, index=["size"]).T.reset_index(
            names="branch"
        )
        r_sizes_df["formatted_size"] = r_sizes_df["size"].apply(format_repo_size)
        # Positional rename: branch, size, formatted_size.
        r_sizes_df.columns = ["Branch", "bytes", "Size"]
        r_sizes_component = gr.Dataframe(
            value=r_sizes_df[["Branch", "Size"]], visible=True
        )
        b_block = gr.Row(visible=True)

    # Slices are weighted by raw bytes; the formatted size appears on hover.
    rf_sizes_plot = px.pie(
        rf_sizes_df,
        values="bytes",
        names="Extension",
        hover_data=["Size"],
        title=f"File Distribution in {r_id}",
        hole=0.3,
    )
    return (
        gr.Row(visible=True),
        gr.Dataframe(
            value=rf_sizes_df[["Extension", "Count", "Size"]],
            visible=True,
        ),
        gr.Plot(rf_sizes_plot, visible=True),
        b_block,
        r_sizes_component,
    )


# Build the UI: a search form on top and an initially hidden results area below.
with gr.Blocks(theme="ocean") as demo:
    gr.Markdown("# Repository Information")
    gr.Markdown(
        "Enter a repository ID and repository type and get back information about the repository's files and branches."
    )
    # Input section: repository search box, repository type selector, submit button.
    with gr.Blocks():
        # NOTE(review): the radio below offers "space" but search_type lists only
        # "model" and "dataset" — confirm whether spaces are meant to be searchable.
        repo_id = HuggingfaceHubSearch(
            label="Hub Model ID",
            placeholder="Search for model id on Huggingface",
            search_type=["model", "dataset"],
        )
        repo_type = gr.Radio(
            choices=["model", "dataset", "space"],
            label="Repository Type",
            value="model",
        )
        search_button = gr.Button(value="Search")
    # Results section: every component starts hidden; get_repo_info returns
    # replacement components that toggle their visibility.
    with gr.Blocks():
        with gr.Row(visible=False) as results_block:
            with gr.Column():
                gr.Markdown("## File Information")
                with gr.Row():
                    file_info = gr.Dataframe(visible=False)
                    file_info_plot = gr.Plot(visible=False)
                # Branch sizes sit in their own row so they can stay hidden
                # independently of the file information.
                with gr.Row(visible=False) as branch_block:
                    with gr.Column():
                        gr.Markdown("## Branch Sizes")
                        branch_sizes = gr.Dataframe(visible=False)

    # The outputs order must match the 5-tuple returned by get_repo_info.
    search_button.click(
        get_repo_info,
        inputs=[repo_type, repo_id],
        outputs=[results_block, file_info, file_info_plot, branch_block, branch_sizes],
    )

# Starts the app as a module-level side effect when this file is executed.
demo.launch()