import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import spaces

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("diabolic6045/open-llama-Instruct")
model = AutoModelForCausalLM.from_pretrained("diabolic6045/open-llama-Instruct")
model.eval()
if torch.cuda.is_available():
    model.to('cuda')


@spaces.GPU()
def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the conversation history as a plain-text prompt
    conversation = f"System: {system_message}\n"
    for user_msg, bot_msg in history:
        conversation += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    conversation += f"User: {message}\nAssistant:"

    # Tokenize the input
    inputs = tokenizer(conversation, return_tensors='pt', truncation=True, max_length=1024)
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}

    # Generate the response
    output = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the assistant's reply by stripping the echoed prompt
    response = response[len(conversation):].strip()
    return response


# Create the Gradio chat interface with the Ocean theme
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System Message"),
        gr.Slider(minimum=1, maximum=512, value=256, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p (Nucleus Sampling)"),
    ],
    title="Open Llama Chatbot",
    description="Chat with an AI assistant powered by the Open Llama Instruct model.",
    theme=gr.themes.Ocean(),
)

if __name__ == "__main__":
    demo.launch()