"""Reflex results page: experiment accuracy table and per-topic radar chart.

Loads the labeled multiple-choice Falcon results dataset from the Hugging
Face Hub, computes overall accuracy for each prediction strategy, and
renders a sortable metrics grid plus a per-topic performance star chart.
"""

import pandas as pd
import plotly.express as px
import reflex as rx
from datasets import load_dataset
from reflex_ag_grid import ag_grid
from sklearn.metrics import accuracy_score

# Guidance text shown above the radar chart.
# (Typo fixed: "happeened" -> "happened".)
chart_md = """
Make sure you explore what happened between:
- Base Model -> Final Answer
- Base Model -> Reasoning (Both models) Final Answer
- Base Model -> Final Answer Reasoning (Both models)
"""

# Load the HF dataset (network call at import time) and convert the
# train split to a Pandas DataFrame for analysis.
dataset = load_dataset("derek-thomas/labeled-multiple-choice-explained-falcon-results")
df = dataset["train"].to_pandas()

# Prediction columns to analyze.
cols_to_analyze = [
    "predictions_base",
    "predictions_FA",
    "predictions_RFA_falcon",
    "predictions_FAR_falcon",
    "predictions_RFA_gpt3_5",
    "predictions_FAR_gpt3_5",
]

# Human-readable label for each prediction column.
# (Label typos fixed: stray ')' after "Final Answer", stray space inside
# "(GPT-3.5 )", and missing space in "Reasoning(GPT-3.5)".)
model_names = {
    "predictions_base": "Base Model",
    "predictions_FA": "Final Answer",
    "predictions_RFA_falcon": "Reasoning (Falcon) -> Final Answer",
    "predictions_FAR_falcon": "Final Answer -> Reasoning (Falcon)",
    "predictions_RFA_gpt3_5": "Reasoning (GPT-3.5) -> Final Answer",
    "predictions_FAR_gpt3_5": "Final Answer -> Reasoning (GPT-3.5)",
}

# Overall accuracy (percent, 2 decimal places) for each prediction strategy.
metrics_data = [
    {
        "Prediction Type": model_names[col],
        "Accuracy (%)": round(accuracy_score(df["answer_key"], df[col]) * 100, 2),
    }
    for col in cols_to_analyze
]

# DataFrame backing the sortable metrics table.
metrics_df = pd.DataFrame(metrics_data)

# Column definitions for the metrics table.
metrics_column_defs = [
    ag_grid.column_def(field="Prediction Type", header_name="Prediction Type", width=250),
    ag_grid.column_def(field="Accuracy (%)", header_name="Accuracy (%)"),
]


def topic_star_chart():
    """Build a radar ("star") chart of per-topic accuracy.

    Returns:
        A Plotly figure comparing every prediction strategy across the ten
        topics with the most rows in the dataset.
    """
    # Determine the top 10 topics by row count up front so we never compute
    # accuracy for topics that would be filtered out of the chart anyway.
    top_topics = set(df["topic"].value_counts().head(10).index)

    # Per-topic, per-model accuracy rows. Iterating df["topic"].unique()
    # (rather than the value_counts index) preserves the original topic
    # ordering in the resulting DataFrame.
    topic_accuracy = []
    for topic in df["topic"].unique():
        if topic not in top_topics:
            continue
        topic_data = df[df["topic"] == topic]
        for col in cols_to_analyze:
            accuracy = round((topic_data[col] == topic_data["answer_key"]).mean() * 100, 2)
            topic_accuracy.append(
                {
                    "Topic": topic,
                    "Prediction Type": model_names[col],
                    "Accuracy (%)": accuracy,
                }
            )

    filtered_topic_df = pd.DataFrame(topic_accuracy)

    # Create the star chart (radar chart); line_close connects the last
    # point back to the first so each trace forms a closed star shape.
    fig = px.line_polar(
        filtered_topic_df,
        r="Accuracy (%)",
        theta="Topic",
        color="Prediction Type",
        title="Top 10 Topics: Per-Topic Performance Star Chart",
        line_close=True,
    )
    fig.update_layout(width=900, height=900)
    return fig


def page():
    """Assemble the results page: heading, metrics grid, and star chart."""
    return rx.vstack(
        rx.heading("Results", size="9", margin="20px 0"),
        rx.markdown("Here we have a sortable table of our experiments and the results"),
        ag_grid(
            id="ag_grid_metrics",
            row_data=metrics_df.to_dict("records"),
            column_defs=metrics_column_defs,
            width="60%",
            margin="20px auto",  # Center the table
            size_columns_to_fit=True,
        ),
        rx.markdown("\n---\n"),
        rx.divider(),
        rx.heading("Performance Star Chart", size="8", margin="20px 0"),
        rx.text(
            "The chart below shows how each model performed across the most popular top 10 topics by row count. "
            "Each line represents a model, and the radial axis represents accuracy.",
            font_size="md",
            padding="10px",
        ),
        rx.markdown(chart_md),
        rx.plotly(data=topic_star_chart()),  # Render the radar chart
        padding="20px",
    )