import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import spaces
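# `spaces` is the Hugging Face Spaces helper package; on ZeroGPU hardware,
# functions decorated with @spaces.GPU are allocated a GPU only while they run.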
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("diabolic6045/open-llama-Instruct")
model = AutoModelForCausalLM.from_pretrained("diabolic6045/open-llama-Instruct")
model.eval()
if torch.cuda.is_available():
    model.to('cuda')
@spaces.GPU
def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the conversation history as a plain-text prompt
    conversation = f"System: {system_message}\n"
    for user_msg, bot_msg in history:
        conversation += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    conversation += f"User: {message}\nAssistant:"

    # Tokenize the input, truncating long conversations to fit the context window
    inputs = tokenizer(conversation, return_tensors='pt', truncation=True, max_length=1024)
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}

    # Generate the response
    output = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens; slicing at the token level is
    # more robust than trimming the decoded string by len(conversation),
    # which breaks if the prompt was truncated or special tokens were skipped
    generated_tokens = output[0][inputs['input_ids'].shape[-1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response.strip()
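
# Note: gr.ChatInterface calls respond(message, history, *additional_inputs),
# so the extra parameters above must match the order of `additional_inputs` below.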
# Create the Gradio interface with the Ocean theme
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System Message"),
        gr.Slider(minimum=1, maximum=512, value=256, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p (Nucleus Sampling)"),
    ],
    title="Open Llama Chatbot",
    description="Chat with an AI assistant powered by the Open Llama Instruct model.",
    theme=gr.themes.Ocean(),
)
if __name__ == "__main__":
    demo.launch()
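
# Minimal local-run sketch (assumes these dependencies are installed):
#   pip install gradio transformers torch spaces
#   python app.py
# Pass share=True to demo.launch() for a temporary public link.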