"""Gradio quiz app: serves coding challenges from a Hugging Face dataset,
grades submissions with an LLM judge, and pushes results to the Hub."""

import os
import random
from datetime import datetime

import gradio as gr
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import InferenceClient, whoami

# Initialize the inference client
client = InferenceClient(
    api_key=os.getenv("HF_API_KEY"),  # Make sure to set this environment variable
)

# Quiz configuration
EXAM_MAX_QUESTIONS = os.getenv("EXAM_MAX_QUESTIONS") or 5  # We have 5 questions total
EXAM_PASSING_SCORE = os.getenv("EXAM_PASSING_SCORE") or 0.7
EXAM_DATASET_ID = "burtenshaw/dummy-code-quiz"

# Load the questions from the Hugging Face dataset and shuffle them for the quiz
ds = load_dataset(EXAM_DATASET_ID, split="train")
quiz_data = ds.to_list()
random.shuffle(quiz_data)


def check_code(user_code, solution, challenge):
    """
    Use an LLM to evaluate whether the user's code solution is correct.
    Returns True if the solution is judged correct, False otherwise.
    """
    prompt = f"""You are an expert Python programming instructor evaluating a student's code solution.

Challenge:
{challenge}

Reference Solution:
{solution}

Student's Solution:
{user_code}

Evaluate if the student's solution is functionally equivalent to the reference solution. Consider:
1. Does it solve the problem correctly?
2. Does it handle edge cases appropriately?
3. Does it follow the requirements of the challenge?

Respond with ONLY "CORRECT" or "INCORRECT" followed by a brief explanation.
"""

    messages = [{"role": "user", "content": prompt}]

    try:
        completion = client.chat.completions.create(
            model="Qwen/Qwen2.5-Coder-32B-Instruct",
            messages=messages,
            max_tokens=500,
        )
        response = completion.choices[0].message.content.strip()

        # Extract the verdict from the response
        is_correct = response.upper().startswith("CORRECT")

        # Surface the explanation in the UI with a status emoji
        explanation = response.split("\n", 1)[1] if "\n" in response else ""
        status = "✅ Correct!" if is_correct else "❌ Incorrect!"
        gr.Info(f"{status}\n\n{explanation}")

        return is_correct
    except Exception as e:
        gr.Warning(f"Error checking code: {str(e)}")
        # Fall back to a simple string comparison if the LLM call fails
        is_correct = user_code.strip() == solution.strip()
        status = "✅ Correct!" if is_correct else "❌ Incorrect!"
        gr.Info(f"{status} (Fallback comparison)")
        return is_correct


def on_user_logged_in(token: gr.OAuthToken | None):
    """Handle user login state: swap the login button for the start button."""
    if token is not None:
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=True), gr.update(visible=False)


def push_results_to_hub(
    user_answers: list, token: gr.OAuthToken | None, signed_in_message: str
):
    """Push the user's results to the Hugging Face Hub."""
    print(f"signed_in_message: {signed_in_message}")

    if not user_answers:  # Check if there are any answers to submit
        gr.Warning("No answers to submit!")
        return "No answers to submit!"

    if token is None:
        gr.Warning("Please log in to Hugging Face before pushing!")
        return "Please log in to Hugging Face before pushing!"

    # Calculate the grade
    correct_count = sum(1 for answer in user_answers if answer["is_correct"])
    total_questions = len(user_answers)
    grade = correct_count / total_questions if total_questions > 0 else 0

    if grade < float(EXAM_PASSING_SCORE):
        gr.Warning(
            f"Score {grade:.1%} is below the passing threshold of {float(EXAM_PASSING_SCORE):.1%}"
        )
        return f"You scored {grade:.1%}. Please try again to achieve at least {float(EXAM_PASSING_SCORE):.1%}"

    gr.Info("Submitting answers to the Hub. Please wait...", duration=2)

    user_info = whoami(token=token.token)
    username = user_info["name"]

    repo_id = f"{EXAM_DATASET_ID}_responses"
    submission_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create a dataset with the user's answers and metadata
    submission_data = [
        {
            "username": username,
            "datetime": submission_time,
            "grade": grade,
            **answer,  # Include all answer data
        }
        for answer in user_answers
    ]

    try:
        # Try to load the existing responses dataset
        existing_ds = load_dataset(repo_id)
        # Convert to DatasetDict if it isn't one already
        if not isinstance(existing_ds, dict):
            existing_ds = DatasetDict({"default": existing_ds})
    except Exception:
        # If the dataset doesn't exist yet, start with an empty DatasetDict
        existing_ds = DatasetDict()

    # Create a new dataset from this submission
    new_ds = Dataset.from_list(submission_data)

    # Add or update the split for this user
    existing_ds[username] = new_ds

    # Push the updated dataset to the Hub
    existing_ds.push_to_hub(
        repo_id,
        private=True,  # Private by default since it contains student submissions
    )

    return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"


def handle_quiz(question_idx, user_answers, submitted_code, is_start):
    """Handle quiz state and progression."""
    # Hide the start button once the first question is shown
    start_btn_update = gr.update(visible=False) if is_start else None

    if is_start:
        # First call (start=True): begin at question_idx=0
        question_idx = 0
    else:
        # Otherwise, grade and store the user's submission before advancing.
        # Only check and advance if there is actually code to submit.
        if question_idx < len(quiz_data) and submitted_code.strip():
            current_q = quiz_data[question_idx]
            is_correct = check_code(
                submitted_code, current_q["solution"], current_q["challenge"]
            )
            user_answers.append(
                {
                    "challenge": current_q["challenge"],
                    "submitted_code": submitted_code,
                    "correct_solution": current_q["solution"],
                    "is_correct": is_correct,
                }
            )
            question_idx += 1

    # If we've reached the end, show the final results
    if question_idx >= len(quiz_data):
        correct_count = sum(1 for answer in user_answers if answer["is_correct"])
        # Guard against division by zero in case no answers were recorded
        grade = correct_count / len(user_answers) if user_answers else 0
        results_text = (
            f"**Quiz Complete!**\n\n"
            f"Your score: {grade:.1%}\n"
            f"Passing score: {float(EXAM_PASSING_SCORE):.1%}\n\n"
            f"Your answers:\n\n"
        )
        for idx, answer in enumerate(user_answers):
            results_text += (
                f"Question {idx + 1}: {'✅' if answer['is_correct'] else '❌'}\n"
            )
            results_text += (
                f"Your code:\n```python\n{answer['submitted_code']}\n```\n\n"
            )
        return (
            "",  # question_text becomes blank
            gr.update(value="", visible=False),  # clear and hide the code input
            f"{'✅ Passed!' if grade >= float(EXAM_PASSING_SCORE) else '❌ Did not pass'}",
            question_idx,
            user_answers,
            start_btn_update,
            gr.update(value=results_text, visible=True),  # show final_markdown
        )
    else:
        # Show the next question
        q = quiz_data[question_idx]
        challenge_text = f"## Question {question_idx + 1}\n### {q['challenge']}"

        return (
            challenge_text,
            gr.update(value=q["placeholder"], visible=True),
            "Submit your code solution and click 'Next' to continue.",
            question_idx,
            user_answers,
            start_btn_update,
            gr.update(visible=False),  # hide final_markdown
        )


with gr.Blocks() as demo:
    demo.title = f"Coding Quiz: {EXAM_DATASET_ID}"

    # State variables
    question_idx = gr.State(value=0)
    user_answers = gr.State(value=[])

    with gr.Row(variant="compact"):
        gr.Markdown(f"## Welcome to the {EXAM_DATASET_ID} Quiz")

    with gr.Row(variant="compact"):
        gr.Markdown(
            "Log in first, then click 'Start' to begin. "
            "Complete each coding challenge, click 'Next', "
            "and finally click 'Submit' to publish your results to the Hugging Face Hub."
        )

    with gr.Row(variant="panel"):
        question_text = gr.Markdown("")
        code_input = gr.Code(language="python", label="Your Solution", visible=False)

    with gr.Row(variant="compact"):
        status_text = gr.Markdown("")

    with gr.Row(variant="compact"):
        final_markdown = gr.Markdown("", visible=False)

    next_btn = gr.Button("Next ⏭️")
    submit_btn = gr.Button("Submit ✅")

    with gr.Row(variant="compact"):
        login_btn = gr.LoginButton()
        start_btn = gr.Button("Start", visible=False)

    login_btn.click(fn=on_user_logged_in, inputs=None, outputs=[login_btn, start_btn])

    start_btn.click(
        fn=handle_quiz,
        inputs=[question_idx, user_answers, code_input, gr.State(True)],
        outputs=[
            question_text,
            code_input,
            status_text,
            question_idx,
            user_answers,
            start_btn,
            final_markdown,
        ],
    )

    next_btn.click(
        fn=handle_quiz,
        inputs=[question_idx, user_answers, code_input, gr.State(False)],
        outputs=[
            question_text,
            code_input,
            status_text,
            question_idx,
            user_answers,
            start_btn,
            final_markdown,
        ],
    )

    submit_btn.click(
        fn=push_results_to_hub,
        inputs=[user_answers, login_btn],
        outputs=status_text,
    )

if __name__ == "__main__":
    demo.launch()