import pandas as pd
import plotly.express as px
import reflex as rx
from datasets import load_dataset
from reflex_ag_grid import ag_grid
from sklearn.metrics import accuracy_score
chart_md = """ |
|
Make sure you explore what happeened between: |
|
- Base Model -> Final Answer |
|
- Base Model -> Reasoning (Both models) Final Answer |
|
- Base Model -> Final Answer Reasoning (Both models) |
|
""" |
|
|
|
|
|

# Load the experiment results from the Hugging Face Hub and convert to pandas
dataset = load_dataset("derek-thomas/labeled-multiple-choice-explained-falcon-results")
df = dataset["train"].to_pandas()
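
# Optional sanity check (an addition, not in the original snippet): the code below
# assumes "answer_key", "topic", and the predictions_* columns are present.
# print(df.shape); print(df.columns.tolist())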

# Prediction columns to score against the answer key
cols_to_analyze = [
    "predictions_base",
    "predictions_FA",
    "predictions_RFA_falcon",
    "predictions_FAR_falcon",
    "predictions_RFA_gpt3_5",
    "predictions_FAR_gpt3_5",
]

# Human-readable labels for each prediction column
model_names = {
    "predictions_base": "Base Model",
    "predictions_FA": "Final Answer",
    "predictions_RFA_falcon": "Reasoning (Falcon) -> Final Answer",
    "predictions_FAR_falcon": "Final Answer -> Reasoning (Falcon)",
    "predictions_RFA_gpt3_5": "Reasoning (GPT-3.5) -> Final Answer",
    "predictions_FAR_gpt3_5": "Final Answer -> Reasoning (GPT-3.5)",
}

# Overall accuracy for each prediction type, as a percentage
metrics_data = []
for col in cols_to_analyze:
    accuracy = round(accuracy_score(df["answer_key"], df[col]) * 100, 2)
    metrics_data.append({"Prediction Type": model_names[col], "Accuracy (%)": accuracy})

metrics_df = pd.DataFrame(metrics_data)
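
# Note (an addition): for discrete answer labels, accuracy_score reduces to a plain
# match rate, so each row above is equivalent to
#     round((df[col] == df["answer_key"]).mean() * 100, 2)
# which is exactly the per-topic formula used in topic_star_chart() below.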

metrics_column_defs = [
    ag_grid.column_def(field="Prediction Type", header_name="Prediction Type", width=250),
    ag_grid.column_def(field="Accuracy (%)", header_name="Accuracy (%)"),
]

def topic_star_chart():
    """Build a polar (star) chart of per-topic accuracy for each prediction type."""
    topic_accuracy = []
    for topic in df["topic"].unique():
        topic_data = df[df["topic"] == topic]
        for col in cols_to_analyze:
            accuracy = round((topic_data[col] == topic_data["answer_key"]).mean() * 100, 2)
            topic_accuracy.append(
                {"Topic": topic, "Prediction Type": model_names[col], "Accuracy (%)": accuracy}
            )

    topic_df = pd.DataFrame(topic_accuracy)

    # Keep only the 10 most common topics by row count
    top_topics = df["topic"].value_counts().head(10).index
    filtered_topic_df = topic_df[topic_df["Topic"].isin(top_topics)]

    fig = px.line_polar(
        filtered_topic_df,
        r="Accuracy (%)",
        theta="Topic",
        color="Prediction Type",
        title="Top 10 Topics: Per-Topic Performance Star Chart",
        line_close=True,
    )
    fig.update_layout(width=900, height=900)
    return fig

def page():
    return rx.vstack(
        rx.heading("Results", size="9", margin="20px 0"),
        rx.markdown("Here is a sortable table of our experiments and their results."),
        ag_grid(
            id="ag_grid_metrics",
            row_data=metrics_df.to_dict("records"),
            column_defs=metrics_column_defs,
            width="60%",
            margin="20px auto",
            size_columns_to_fit=True,
        ),
        rx.divider(),
        rx.heading("Performance Star Chart", size="8", margin="20px 0"),
        rx.text(
            "The chart below shows how each model performed across the top 10 topics by row count. "
            "Each line represents a model, and the radial axis represents accuracy.",
            font_size="md",
            padding="10px",
        ),
        rx.markdown(chart_md),
        rx.plotly(data=topic_star_chart()),
        padding="20px",
    )
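
# Minimal wiring sketch (an addition, not from the original snippet): register the
# page with a standard Reflex entry point so the dashboard is served. The route
# here is a hypothetical choice.
app = rx.App()
app.add_page(page, route="/")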