import pandas as pd
import plotly.express as px
import reflex as rx
from datasets import load_dataset
from reflex_ag_grid import ag_grid
from sklearn.metrics import accuracy_score
chart_md = """ |
|
Make sure you explore what happeened between: |
|
- Base Model -> Final Answer |
|
- Base Model -> Reasoning (Both models) Final Answer |
|
- Base Model -> Final Answer Reasoning (Both models) |
|
""" |
|
|
|
|
|

# Load the experiment results from the Hugging Face Hub and convert to pandas
dataset = load_dataset("derek-thomas/labeled-multiple-choice-explained-falcon-results")
df = dataset["train"].to_pandas()
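
# Optional sanity check (an addition, not in the original snippet): the code below
# assumes "answer_key", "topic", and the predictions_* columns are present.
# print(df.shape); print(df.columns.tolist())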

# Prediction columns to score against the answer key
cols_to_analyze = [
    "predictions_base",
    "predictions_FA",
    "predictions_RFA_falcon",
    "predictions_FAR_falcon",
    "predictions_RFA_gpt3_5",
    "predictions_FAR_gpt3_5",
]

# Human-readable labels for each prediction column
model_names = {
    "predictions_base": "Base Model",
    "predictions_FA": "Final Answer",
    "predictions_RFA_falcon": "Reasoning (Falcon) -> Final Answer",
    "predictions_FAR_falcon": "Final Answer -> Reasoning (Falcon)",
    "predictions_RFA_gpt3_5": "Reasoning (GPT-3.5) -> Final Answer",
    "predictions_FAR_gpt3_5": "Final Answer -> Reasoning (GPT-3.5)",
}

# Overall accuracy for each prediction type, as a percentage
metrics_data = []
for col in cols_to_analyze:
    accuracy = round(accuracy_score(df["answer_key"], df[col]) * 100, 2)
    metrics_data.append({"Prediction Type": model_names[col], "Accuracy (%)": accuracy})

metrics_df = pd.DataFrame(metrics_data)
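
# Note (an addition): for discrete answer labels, accuracy_score reduces to a plain
# match rate, so each row above is equivalent to
#     round((df[col] == df["answer_key"]).mean() * 100, 2)
# which is exactly the per-topic formula used in topic_star_chart() below.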

metrics_column_defs = [
    ag_grid.column_def(field="Prediction Type", header_name="Prediction Type", width=250),
    ag_grid.column_def(field="Accuracy (%)", header_name="Accuracy (%)"),
]

def topic_star_chart():
    """Build a polar (star) chart of per-topic accuracy for each prediction type."""
    topic_accuracy = []
    for topic in df["topic"].unique():
        topic_data = df[df["topic"] == topic]
        for col in cols_to_analyze:
            accuracy = round((topic_data[col] == topic_data["answer_key"]).mean() * 100, 2)
            topic_accuracy.append(
                {"Topic": topic, "Prediction Type": model_names[col], "Accuracy (%)": accuracy}
            )

    topic_df = pd.DataFrame(topic_accuracy)

    # Keep only the 10 most common topics by row count
    top_topics = df["topic"].value_counts().head(10).index
    filtered_topic_df = topic_df[topic_df["Topic"].isin(top_topics)]

    fig = px.line_polar(
        filtered_topic_df,
        r="Accuracy (%)",
        theta="Topic",
        color="Prediction Type",
        title="Top 10 Topics: Per-Topic Performance Star Chart",
        line_close=True,
    )
    fig.update_layout(width=900, height=900)
    return fig

def page():
    return rx.vstack(
        rx.heading("Results", size="9", margin="20px 0"),
        rx.markdown("Here is a sortable table of our experiments and their results."),
        ag_grid(
            id="ag_grid_metrics",
            row_data=metrics_df.to_dict("records"),
            column_defs=metrics_column_defs,
            width="60%",
            margin="20px auto",
            size_columns_to_fit=True,
        ),
        rx.divider(),
        rx.heading("Performance Star Chart", size="8", margin="20px 0"),
        rx.text(
            "The chart below shows how each model performed across the top 10 topics by row count. "
            "Each line represents a model, and the radial axis represents accuracy.",
            font_size="md",
            padding="10px",
        ),
        rx.markdown(chart_md),
        rx.plotly(data=topic_star_chart()),
        padding="20px",
    )
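
# Minimal wiring sketch (an addition, not from the original snippet): register the
# page with a standard Reflex entry point so the dashboard is served. The route
# here is a hypothetical choice.
app = rx.App()
app.add_page(page, route="/")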