import pandas as pd
import plotly.express as px
import reflex as rx
from datasets import load_dataset
from reflex_ag_grid import ag_grid
from sklearn.metrics import accuracy_score
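# Assumed dependencies (not pinned in the original): pandas, plotly, reflex,
# reflex-ag-grid, datasets, and scikit-learn, e.g.
#   pip install pandas plotly reflex reflex-ag-grid datasets scikit-learn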

chart_md = """
Make sure you explore what happeened between:
- Base Model -> Final Answer
- Base Model -> Reasoning (Both models) Final Answer 
- Base Model -> Final Answer Reasoning (Both models)
"""

# Load the HF dataset
dataset = load_dataset("derek-thomas/labeled-multiple-choice-explained-falcon-results")

# Convert the dataset to a Pandas DataFrame
df = dataset['train'].to_pandas()
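# The analysis below assumes the train split carries an "answer_key" column
# plus one prediction column per experiment (see cols_to_analyze).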

# Columns to analyze
cols_to_analyze = [
    "predictions_base",
    "predictions_FA",
    "predictions_RFA_falcon",
    "predictions_FAR_falcon",
    "predictions_RFA_gpt3_5",
    "predictions_FAR_gpt3_5",
    ]

# Mapping for renaming models
model_names = {
    "predictions_base": "Base Model",
    "predictions_FA": "Final Answer",
    "predictions_RFA_falcon": "Reasoning (Falcon) -> Final Answer)",
    "predictions_FAR_falcon": "Final Answer -> Reasoning (Falcon)",
    "predictions_RFA_gpt3_5": "Reasoning (GPT-3.5 ) -> Final Answer",
    "predictions_FAR_gpt3_5": "Final Answer -> Reasoning(GPT-3.5)",
    }

# Compute metrics for each model
metrics_data = []
for col in cols_to_analyze:
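    # accuracy_score compares each predicted letter against answer_key and
    # returns the fraction that match; multiply by 100 for a percentage.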
    accuracy = round(accuracy_score(df["answer_key"], df[col]) * 100, 2)
    metrics_data.append({"Prediction Type": model_names[col], "Accuracy (%)": accuracy})

# Create a DataFrame for metrics
metrics_df = pd.DataFrame(metrics_data)

# Column definitions for the metrics table
metrics_column_defs = [
    ag_grid.column_def(field="Prediction Type", header_name="Prediction Type", width=250),
    ag_grid.column_def(field="Accuracy (%)", header_name="Accuracy (%)"),
    ]


# Function to generate the topic performance star chart
def topic_star_chart():
    # Calculate per-topic accuracy
    topic_accuracy = []
    for topic in df["topic"].unique():
        topic_data = df[df["topic"] == topic]
        for col in cols_to_analyze:
            accuracy = round((topic_data[col] == topic_data["answer_key"]).mean() * 100, 2)
            topic_accuracy.append({"Topic": topic, "Prediction Type": model_names[col], "Accuracy (%)": accuracy})

    # Create DataFrame for visualization
    topic_df = pd.DataFrame(topic_accuracy)
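    # topic_df is long-form (one row per Topic x Prediction Type pair),
    # which is the shape px.line_polar expects.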

    # Keep only the 10 most common topics by row count
    top_topics = df["topic"].value_counts().head(10).index
    filtered_topic_df = topic_df[topic_df["Topic"].isin(top_topics)]

    # Create star chart (radar chart)
    fig = px.line_polar(
            filtered_topic_df,
            r="Accuracy (%)",
            theta="Topic",
            color="Prediction Type",
            title="Top 10 Topics: Per-Topic Performance Star Chart",
            line_close=True,  # Close the lines to form a star shape
            )
    fig.update_layout(width=900, height=900)

    return fig


def page():
    return rx.vstack(
            rx.heading("Results", size="9", margin="20px 0"),
            rx.markdown("Here we have a sortable table of our experiments and the results"),
            ag_grid(
                    id="ag_grid_metrics",
                    row_data=metrics_df.to_dict("records"),
                    column_defs=metrics_column_defs,
                    width="60%",
                    margin="20px auto",  # Center the table
                    size_columns_to_fit=True
                    ),
            rx.markdown("\n---\n"),
            rx.divider(),
            rx.heading("Performance Star Chart", size="8", margin="20px 0"),
            rx.text(
                    "The chart below shows how each model performed across the most popular top 10 topics by row count. "
                    "Each line represents a model, and the radial axis represents accuracy.",
                    font_size="md",
                    padding="10px",
                    ),
            rx.markdown(chart_md),
            rx.plotly(data=topic_star_chart()),  # Render the radar chart
            padding="20px",
            )
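

# Minimal app wiring (an assumption, not part of the original snippet) so the
# module runs as a standalone Reflex app; route and title are placeholders.
app = rx.App()
app.add_page(page, route="/", title="Results")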