lcipolina committed on
Commit
d778057
·
verified ·
1 Parent(s): d545927

Upgraded the leaderboard

Browse files
Files changed (2) hide show
  1. app.py +96 -0
  2. app_old.py +0 -216
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import gradio as gr
4
+ from agents.llm_registry import LLM_REGISTRY # Dynamically fetch LLM models
5
+
6
# Extract available LLM models
llm_models = list(LLM_REGISTRY.keys())

# Define game list manually (for now)
games_list = [
    "rock_paper_scissors",
    "prisoners_dilemma",
    "tic_tac_toe",
    "connect_four",
    "matching_pennies",
    "kuhn_poker",
]

# File to persist results
RESULTS_TRACKER_FILE = "results_tracker.json"

# Counters stored for every (model, game) pair.
_EMPTY_RECORD = {"wins": 0, "ties": 0, "losses": 0, "games": 0}


def _load_results_tracker(path, models, games):
    """Load the tracker stored at *path* and backfill any missing counters.

    Returns a dict shaped ``{model: {game: {"wins", "ties", "losses", "games"}}}``.
    Entries already present in the file are kept untouched; every
    ``(model, game)`` pair from *models* x *games* is guaranteed to exist with
    all four counters, so later lookups cannot fail on a file that was written
    with a different model list, game list, or record layout.
    """
    tracker = {}
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            tracker = json.load(f)
    if not isinstance(tracker, dict):
        # A non-object JSON document cannot hold per-model records.
        tracker = {}

    for model in models:
        per_game = tracker.get(model)
        if not isinstance(per_game, dict):
            per_game = tracker[model] = {}
        for game in games:
            record = per_game.get(game)
            if not isinstance(record, dict):
                record = per_game[game] = {}
            for counter, default in _EMPTY_RECORD.items():
                record.setdefault(counter, default)
    return tracker


# Load or initialize the results tracker
results_tracker = _load_results_tracker(RESULTS_TRACKER_FILE, llm_models, games_list)
31
+
32
+
33
def save_results_tracker():
    """Save the results tracker to a JSON file.

    The data is first written to a sibling ``.tmp`` file and then moved over
    ``RESULTS_TRACKER_FILE`` with ``os.replace``. Opening the target directly
    with mode ``"w"`` truncates it before serialisation starts, so a failure
    part-way through would leave a corrupt tracker behind.
    """
    tmp_path = f"{RESULTS_TRACKER_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results_tracker, f, indent=4)
    os.replace(tmp_path, RESULTS_TRACKER_FILE)
37
+
38
+
39
def _format_outcome_rates(record):
    """Format one win/tie/loss record as ``"x% W / y% T / z% L"``.

    Missing counters count as 0, and a record with no games yields 0.0% for
    every outcome (the divisor is clamped to at least 1).
    """
    games = max(1, record.get("games", 0))
    return " / ".join(
        f"{record.get(counter, 0) / games * 100:.1f}% {label}"
        for counter, label in (("wins", "W"), ("ties", "T"), ("losses", "L"))
    )


def calculate_leaderboard():
    """Generate a leaderboard table summarizing LLM performance across games.

    Returns a dict mapping column name to a list of cell values: the
    ``"LLM Model"`` column lists the models in ``llm_models`` and each game in
    ``games_list`` gets one column of formatted rate strings, in model order.
    Models or games missing from ``results_tracker`` are shown with zero rates
    instead of raising ``KeyError``.
    """
    leaderboard_data = {"LLM Model": list(llm_models)}

    for game in games_list:
        leaderboard_data[game] = [
            _format_outcome_rates(results_tracker.get(llm, {}).get(game, {}))
            for llm in llm_models
        ]

    return leaderboard_data
52
+
53
+
54
def get_model_details(model_name):
    """Returns detailed performance of the selected LLM model.

    Args:
        model_name: Key to look up in ``results_tracker``.

    Returns:
        A Markdown string with a heading followed by one bullet per game, or a
        fixed "no data" message when the model has no tracker entry.
        Counters missing from a record are reported as 0 rather than raising
        ``KeyError``.
    """
    if model_name not in results_tracker:
        return "No data available for this model."

    lines = [f"### {model_name} Performance Breakdown\n"]
    for game, record in results_tracker[model_name].items():
        wins = record.get("wins", 0)
        ties = record.get("ties", 0)
        losses = record.get("losses", 0)
        total_games = record.get("games", 0)
        lines.append(
            f"- **{game.capitalize()}**: {wins} Wins, {ties} Ties, {losses} Losses (Total: {total_games})\n"
        )

    return "".join(lines)
65
+
66
+
67
# Gradio Interface
with gr.Blocks() as interface:
    with gr.Tab("Game Arena"):
        gr.Markdown("# LLM Game Arena\nPlay against LLMs or other players in classic games!")

        # (Game selection and play functionality remains unchanged)

    with gr.Tab("Leaderboard"):
        gr.Markdown("# LLM Model Leaderboard\nTrack performance across different games!")

        def update_leaderboard():
            """Return the leaderboard as a list of rows, one row per model.

            calculate_leaderboard() yields a column -> values mapping. It is
            transposed here because the table is fed rows plus explicit
            headers; NOTE(review): a plain column dict is not a documented
            Dataframe value format and may render as an empty table.
            """
            columns = calculate_leaderboard()
            return [list(row) for row in zip(*columns.values())]

        leaderboard_table = gr.Dataframe(
            label="Leaderboard",
            headers=list(calculate_leaderboard()),
            value=update_leaderboard(),
        )

        with gr.Row():
            model_dropdown = gr.Dropdown(choices=llm_models, label="Select LLM Model")
            details_output = gr.Markdown(label="Model Performance Details")

        def update_details(model_name):
            """Updates the details section when an LLM is selected."""
            return get_model_details(model_name)

        update_leaderboard_button = gr.Button("Refresh Leaderboard")
        update_leaderboard_button.click(update_leaderboard, inputs=[], outputs=leaderboard_table)

        model_dropdown.change(update_details, inputs=[model_dropdown], outputs=details_output)

interface.launch()
app_old.py DELETED
@@ -1,216 +0,0 @@
1
- import os
2
- import json
3
- import gradio as gr
4
- from games_registry import GAMES_REGISTRY
5
- from llm_registry import LLM_REGISTRY
6
- from simulators.base_simulator import PlayerType
7
- from typing import Dict
8
-
9
# File to persist results
RESULTS_TRACKER_FILE = "results_tracker.json"


def _load_matrix_tracker(path, participants):
    """Load the head-to-head tracker from *path* and backfill missing cells.

    Returns ``{player: {opponent: {"wins", "games"}}}`` with a cell for every
    ordered pair drawn from *participants*. Cells already stored in the file
    are preserved; pairs the file does not contain (for example a model added
    to the registry after the file was written) start at zero so later
    lookups cannot raise ``KeyError``.
    """
    tracker = {}
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            tracker = json.load(f)
    if not isinstance(tracker, dict):
        tracker = {}

    for name in participants:
        row = tracker.get(name)
        if not isinstance(row, dict):
            row = tracker[name] = {}
        for opponent in participants:
            cell = row.get(opponent)
            if not isinstance(cell, dict):
                cell = row[opponent] = {}
            cell.setdefault("wins", 0)
            cell.setdefault("games", 0)
    return tracker


# Load or initialize the results tracker
results_tracker = _load_matrix_tracker(
    RESULTS_TRACKER_FILE, ["Human"] + list(LLM_REGISTRY.keys())
)
21
-
22
-
23
def save_results_tracker():
    """Save the results tracker to a JSON file.

    Writes to a sibling ``.tmp`` file and then swaps it into place with
    ``os.replace``, so a failure during serialisation cannot leave
    ``RESULTS_TRACKER_FILE`` truncated (opening it with ``"w"`` would empty it
    before any data is written).
    """
    tmp_path = f"{RESULTS_TRACKER_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results_tracker, f, indent=4)
    os.replace(tmp_path, RESULTS_TRACKER_FILE)
27
-
28
-
29
def initialize_game(game_name, player1_type, player2_type, player1_model, player2_model):
    """Initialize the game state and simulator.

    Args:
        game_name: Key into ``GAMES_REGISTRY``.
        player1_type: Player 1 kind; ``"llm"`` requires ``player1_model``.
        player2_type: Player 2 kind; ``"llm"`` requires ``player2_model``.
        player1_model: Key into ``LLM_REGISTRY`` for player 1, if an LLM.
        player2_model: Key into ``LLM_REGISTRY`` for player 2, if an LLM.

    Returns:
        ``(simulator, state, message)`` where ``state`` is the game's initial
        state and ``message`` is the status text shown to the user.

    Raises:
        ValueError: If a player is set to ``"llm"`` without a model selected.
    """
    # Validate first so a bad configuration fails before the game is loaded.
    if player1_type == "llm" and not player1_model:
        raise ValueError("Player 1 is set to LLM, but no model is selected.")
    if player2_type == "llm" and not player2_model:
        raise ValueError("Player 2 is set to LLM, but no model is selected.")

    game_config = GAMES_REGISTRY[game_name]
    game = game_config["loader"]()
    simulator_class = game_config["simulator"]

    # Initialize LLMs for the players (None for non-LLM players)
    llms = {
        "Player 1": LLM_REGISTRY[player1_model]["model_loader"]() if player1_type == "llm" else None,
        "Player 2": LLM_REGISTRY[player2_model]["model_loader"]() if player2_type == "llm" else None,
    }

    # Map player names to their types
    player_type_map = {
        "Player 1": player1_type,
        "Player 2": player2_type,
    }

    # Create the simulator
    simulator = simulator_class(
        game,
        game_name,
        llms=llms,
        player_type=player_type_map,
    )
    state = game.new_initial_state()

    return simulator, state, "Game Initialized! Click 'Next Turn' to start."
63
-
64
-
65
def toggle_model_dropdown(player1, player2):
    """Control visibility and set default models for LLM players.

    Args:
        player1: Player 1 type string.
        player2: Player 2 type string.

    Returns:
        A 4-tuple: visibility updates for the player 1 and player 2 model
        dropdowns, followed by the default model for each player (the first
        registry key when that player is ``"llm"``, otherwise ``None``).
    """
    player1_model_visible = gr.update(visible=(player1 == "llm"))
    player2_model_visible = gr.update(visible=(player2 == "llm"))

    # First registered model, or None when the registry is empty; indexing
    # list(...)[0] would raise IndexError in that case.
    first_model = next(iter(LLM_REGISTRY), None)

    # Set default models if the player type is "llm"
    default_model1 = first_model if player1 == "llm" else None
    default_model2 = first_model if player2 == "llm" else None

    return player1_model_visible, player2_model_visible, default_model1, default_model2
75
-
76
-
77
def update_results_tracker(scores, player1, player2):
    """Update the matrix results tracker with game outcomes.

    Args:
        scores: Two-element sequence of final scores, player 1 first.
        player1: Model name for player 1, or a value that is not a tracked
            name (e.g. ``None``) when player 1 is not an LLM.
        player2: Same as ``player1``, for player 2.

    A participant that is not a key of ``results_tracker`` is recorded under
    ``"Human"``. The previous mapping only rewrote the literal ``"llm"``, so a
    human player (whose model is ``None``) indexed ``results_tracker[None]``
    and raised ``KeyError``. The tracker is saved after every update.
    """
    def _tracked_name(player):
        return player if player in results_tracker else "Human"

    player1_name = _tracked_name(player1)
    player2_name = _tracked_name(player2)

    # Update games played; when both sides map to the same name the two cells
    # are one and the same, so count the game only once.
    results_tracker[player1_name][player2_name]["games"] += 1
    if player1_name != player2_name:
        results_tracker[player2_name][player1_name]["games"] += 1

    # Update wins (a draw leaves both win counters untouched)
    if scores[0] > scores[1]:  # Player 1 wins
        results_tracker[player1_name][player2_name]["wins"] += 1
    elif scores[1] > scores[0]:  # Player 2 wins
        results_tracker[player2_name][player1_name]["wins"] += 1

    save_results_tracker()  # Save after every update
93
-
94
-
95
def calculate_matrix_leaderboard():
    """Generate a matrix leaderboard table.

    Returns:
        A list of rows. The first row is the header (an empty corner cell
        followed by every player name); each following row starts with a
        player name and holds that player's win percentage against each
        opponent, formatted with two decimals. Pairs with no recorded games,
        or with no cell in ``results_tracker`` at all, show ``"0.00%"``.
    """
    players = list(results_tracker)
    matrix = [[""] + players]  # Header row
    for player in players:
        opponents = results_tracker[player]
        row = [player]
        for opponent in players:
            record = opponents.get(opponent, {})
            games = record.get("games", 0)
            wins = record.get("wins", 0)
            win_percentage = (wins / games * 100) if games > 0 else 0
            row.append(f"{win_percentage:.2f}%")
        matrix.append(row)
    return matrix
107
-
108
-
109
def play_turn(simulator, state, player1_type, player2_type, human_move=None, player1_model=None, player2_model=None):
    """Play a single turn of the game.

    Args:
        simulator: Object providing ``_get_action(player, state, legal_moves)``
            for non-human turns.
        state: Current game state, or ``None`` if no game was initialized.
        player1_type: Player 1 kind; ``"human"`` makes player 0 read ``human_move``.
        player2_type: Player 2 kind; ``"human"`` makes player 1 read ``human_move``.
        human_move: Move entered by the human; ``None`` or blank text means
            no move was entered yet.
        player1_model: Passed through to ``update_results_tracker``.
        player2_model: Passed through to ``update_results_tracker``.

    Returns:
        ``(message, state)``: status text for the UI and the (mutated) state.

    The outcome is recorded exactly once, when the move that ends the game is
    applied. Further calls on the finished game only repeat the final scores;
    recording on every such call would count the same game multiple times.
    """
    if state is None:
        return "Game not initialized. Click 'Restart Game' first.", state

    if state.is_terminal():
        # Already recorded when the final move was applied.
        return f"Game Over!\nFinal Scores: {state.returns()}", state

    current_player = state.current_player()
    legal_moves = state.legal_actions(current_player)
    board = str(state)

    human_turn = (player1_type == "human" and current_player == 0) or (
        player2_type == "human" and current_player == 1
    )
    if human_turn:
        # A text input that was left empty arrives as "" rather than None.
        if human_move is None or not str(human_move).strip():
            return f"Your Turn! Current Board:\n{board}\nValid Moves: {legal_moves}", state
        try:
            move = int(human_move)
        except ValueError:
            return f"Invalid input. Please enter a valid move number.\nValid Moves: {legal_moves}\nCurrent Board:\n{board}", state
        if move not in legal_moves:
            return f"Invalid move. Legal moves are: {legal_moves}\nCurrent Board:\n{board}", state
        state.apply_action(move)
    else:
        # LLM or bot's turn
        action = simulator._get_action(current_player, state, legal_moves)
        state.apply_action(action)

    # Report and record the result as soon as the game ends.
    if state.is_terminal():
        final_scores = state.returns()
        update_results_tracker(final_scores, player1_model, player2_model)
        return f"Game Over!\nFinal Scores: {final_scores}", state

    # Continue to the next turn
    legal_moves = state.legal_actions(state.current_player())
    board = str(state)
    return f"Next Turn! Current Board:\n{board}\nValid Moves: {legal_moves}", state
140
-
141
-
142
# Gradio Interface
with gr.Blocks() as interface:
    # Compute the model choices once; the default is None when the registry
    # is empty instead of failing with IndexError on list(...)[0].
    model_choices = list(LLM_REGISTRY.keys())
    default_model = model_choices[0] if model_choices else None

    with gr.Tab("Game Arena"):
        gr.Markdown("# LLM Game Arena\nPlay against LLMs or other players in classic games!")

        with gr.Row():
            game_dropdown = gr.Dropdown(
                choices=list(GAMES_REGISTRY.keys()),
                label="Select a Game",
                value="tic_tac_toe",  # Default to Tic-Tac-Toe
            )
        with gr.Row():
            player1_dropdown = gr.Dropdown(
                choices=["human", "random_bot", "llm"],
                label="Player 1 Type",
                value="human",  # Default to human
            )
            player2_dropdown = gr.Dropdown(
                choices=["human", "random_bot", "llm"],
                label="Player 2 Type",
                value="llm",  # Default to LLM
            )
        with gr.Row():
            player1_model_dropdown = gr.Dropdown(
                choices=model_choices,
                label="Player 1 Model",
                value=None,  # No default value if Player 1 is human
                visible=False,  # Hidden by default for a human player
            )
            player2_model_dropdown = gr.Dropdown(
                choices=model_choices,
                label="Player 2 Model",
                value=default_model,  # Default to the first LLM for Player 2
                visible=True,  # Visible by default for an LLM player
            )

        with gr.Row():
            human_input = gr.Textbox(label="Enter your move (number)", visible=True)
        with gr.Row():
            result_output = gr.Textbox(label="Game Progress", interactive=False)
        with gr.Row():
            restart_button = gr.Button("Restart Game")
            next_turn_button = gr.Button("Next Turn")

        # State management
        simulator_state = gr.State(None)  # To store the simulator
        game_state = gr.State(None)  # To store the game state

        restart_button.click(
            initialize_game,
            inputs=[game_dropdown, player1_dropdown, player2_dropdown, player1_model_dropdown, player2_model_dropdown],
            outputs=[simulator_state, game_state, result_output],
        )

        next_turn_button.click(
            play_turn,
            inputs=[simulator_state, game_state, player1_dropdown, player2_dropdown, human_input, player1_model_dropdown, player2_model_dropdown],
            outputs=[result_output, game_state],
        )

    with gr.Tab("Leaderboard"):
        gr.Markdown("# Matrix Leaderboard\nSee how players perform against each other!")
        # Populate the table at startup so it is not blank until the first click.
        leaderboard_table = gr.Dataframe(
            label="Leaderboard Matrix",
            value=calculate_matrix_leaderboard(),
        )

        def update_leaderboard_tab():
            """Return the current head-to-head matrix for the table."""
            return calculate_matrix_leaderboard()

        update_leaderboard_button = gr.Button("Update Leaderboard")
        update_leaderboard_button.click(
            update_leaderboard_tab,
            inputs=[],
            outputs=leaderboard_table,
        )

interface.launch()