# derek-thomas
# Updating for falcon
# e4449d4
import pandas as pd
import plotly.express as px
import reflex as rx
from datasets import load_dataset
from reflex_ag_grid import ag_grid
from sklearn.metrics import accuracy_score
# Guidance markdown rendered above the star chart, pointing readers at the
# model-to-model transitions worth inspecting.
chart_md = """
Make sure you explore what happened between:
- Base Model -> Final Answer
- Base Model -> Reasoning (Both models) Final Answer
- Base Model -> Final Answer Reasoning (Both models)
"""
# Fetch the labeled multiple-choice results from the Hugging Face Hub.
dataset = load_dataset(
    "derek-thomas/labeled-multiple-choice-explained-falcon-results"
)
# All analysis below works on the train split as a pandas DataFrame.
df = dataset["train"].to_pandas()
# Prediction columns to evaluate against the ground-truth `answer_key`.
cols_to_analyze = [
    "predictions_base",
    "predictions_FA",
    "predictions_RFA_falcon",
    "predictions_FAR_falcon",
    "predictions_RFA_gpt3_5",
    "predictions_FAR_gpt3_5",
]

# Human-readable labels for each prediction column (shown in tables/charts).
# RFA = Reasoning then Final Answer; FAR = Final Answer then Reasoning.
model_names = {
    "predictions_base": "Base Model",
    "predictions_FA": "Final Answer",
    "predictions_RFA_falcon": "Reasoning (Falcon) -> Final Answer",
    "predictions_FAR_falcon": "Final Answer -> Reasoning (Falcon)",
    "predictions_RFA_gpt3_5": "Reasoning (GPT-3.5) -> Final Answer",
    "predictions_FAR_gpt3_5": "Final Answer -> Reasoning (GPT-3.5)",
}
# Overall accuracy (percent, 2 decimals) of every prediction column versus
# the ground-truth answer key; rendered later in the ag-grid table.
metrics_data = [
    {
        "Prediction Type": model_names[col],
        "Accuracy (%)": round(accuracy_score(df["answer_key"], df[col]) * 100, 2),
    }
    for col in cols_to_analyze
]
metrics_df = pd.DataFrame(metrics_data)

# ag-grid column configuration for the metrics table.
metrics_column_defs = [
    ag_grid.column_def(field="Prediction Type", header_name="Prediction Type", width=250),
    ag_grid.column_def(field="Accuracy (%)", header_name="Accuracy (%)"),
]
# Function to generate the topic performance star chart
def topic_star_chart():
# Calculate per-topic accuracy
topic_accuracy = []
for topic in df["topic"].unique():
topic_data = df[df["topic"] == topic]
for col in cols_to_analyze:
accuracy = round((topic_data[col] == topic_data["answer_key"]).mean() * 100, 2)
topic_accuracy.append({"Topic": topic, "Prediction Type": model_names[col], "Accuracy (%)": accuracy})
# Create DataFrame for visualization
topic_df = pd.DataFrame(topic_accuracy)
# Find the top 10 topics by number of rows
topic_counts = df["topic"].value_counts().head(10).index
filtered_topic_df = topic_df[topic_df["Topic"].isin(topic_counts)]
# Create star chart (radar chart)
fig = px.line_polar(
filtered_topic_df,
r="Accuracy (%)",
theta="Topic",
color="Prediction Type",
title="Top 10 Topics: Per-Topic Performance Star Chart",
line_close=True, # Close the lines to form a star shape
)
fig.update_layout(width=900, height=900)
return fig
def page():
    """Top-level Reflex page: experiment metrics table plus per-topic radar chart."""
    # Sortable ag-grid table of overall per-model accuracy.
    metrics_table = ag_grid(
        id="ag_grid_metrics",
        row_data=metrics_df.to_dict("records"),
        column_defs=metrics_column_defs,
        width="60%",
        margin="20px auto",  # center the table horizontally
        size_columns_to_fit=True,
    )
    # Explanatory text shown above the star chart.
    chart_intro = rx.text(
        "The chart below shows how each model performed across the most popular top 10 topics by row count. "
        "Each line represents a model, and the radial axis represents accuracy.",
        font_size="md",
        padding="10px",
    )
    return rx.vstack(
        rx.heading("Results", size="9", margin="20px 0"),
        rx.markdown("Here we have a sortable table of our experiments and the results"),
        metrics_table,
        rx.markdown("\n---\n"),
        rx.divider(),
        rx.heading("Performance Star Chart", size="8", margin="20px 0"),
        chart_intro,
        rx.markdown(chart_md),
        rx.plotly(data=topic_star_chart()),  # embedded radar chart
        padding="20px",
    )