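"""DSChatbot: a Gradio chat UI backed by deepseek-coder-1.3b-instruct.

A LaunchDarkly feature flag ("themeColors") selects the Gradio theme colours,
and each query is logged with a UUID, its sampling parameters, and timing.
"""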
import os
import logging
import gradio as gr
import torch
import uuid
import time
import ldclient
from ldclient.config import Config
from ldclient import Context
from transformers import AutoTokenizer, AutoModelForCausalLM
def capture_logs(log_body, log_file, uuid_label):
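    """Write one log record for a chat query to log_file and echo it to the console."""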
    logger = logging.getLogger('MyApp')
    logger.setLevel(logging.INFO)
    # Check if handlers are already added to avoid duplication
    if not logger.handlers:
        fh = logging.FileHandler(log_file)
        fh.setLevel(logging.INFO)
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        logger.addHandler(fh)
        logger.addHandler(ch)
    # Log outside the handler guard so every call is recorded
    logger.info('uuid: %s - %s', uuid_label, log_body)
print("CUDA available: ", torch.cuda.is_available())
print("MPS available: ", torch.backends.mps.is_available())
sdk_key = os.getenv('sdkKEY')
ldclient.set_config(Config(sdk_key))
client = ldclient.get()
context = Context.builder("huggie-face") \
    .set("application", "deepSeekChatbot") \
    .build()
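
# Evaluate the "themeColors" flag for this context, defaulting to False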
flag_value = client.variation("themeColors", context, False)
if flag_value:
    print("Feature flag on")
    theme = gr.themes.Soft(
        primary_hue="fuchsia",
        neutral_hue="blue",
    )
else:
    print("Feature flag off")
    theme = gr.themes.Soft(
        primary_hue="sky",
        neutral_hue="slate",
    )
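
# Load the DeepSeek Coder 1.3B instruct tokenizer and model (bfloat16 weights)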
tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True,
    torch_dtype=torch.bfloat16)
# Setting TOKENIZERS_PARALLELISM explicitly silences the tokenizers
# parallelism warning (use "false" to also disable parallelism itself)
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Check if MPS (Metal Performance Shaders) is available
device = torch.device(
    "mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = model.to(device)
# Function to handle user input and generate a response
def chatbot_response(query, tokens, top_k, top_p):
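    """Generate a reply to `query` and log the request, parameters, and timing under a fresh UUID."""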
    uuid_label = str(uuid.uuid4())
    start_time = time.time()  # Start timer
    # Generate response using the model
    messages = [{'role': 'user', 'content': query}]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=tokens,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens, skipping the prompt
    model_response = tokenizer.decode(
        outputs[0][len(inputs[0]):], skip_special_tokens=True)
    end_time = time.time()  # End timer
    performance_time = round(end_time - start_time, 2)
    log_body = 'query: %s, processTime: %s, tokens: %s, top_k: %s, top_p: %s' % (
        query, performance_time, tokens, top_k, top_p)
    capture_logs(log_body, 'query_logs.csv', uuid_label)
    return model_response
# Set up the Gradio interface
iface = gr.Interface(
    fn=chatbot_response,
    inputs=[
        gr.Textbox(label="Ask our DSChatbot Expert"),
        gr.Slider(label="Max New Tokens", minimum=128,
                  maximum=2048, step=128, value=512),
        gr.Slider(label="Top K", minimum=0, maximum=100, step=10, value=50),
        gr.Slider(label="Top P", minimum=0.0,
                  maximum=1.0, step=0.1, value=0.95),
    ],
    outputs=gr.Textbox(label="Hope it helps!"),
    theme=theme,
    title="DSChatbot"
)
if __name__ == "__main__":
    iface.launch()
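
# Example invocation (a sketch; the script file name is assumed, and the
# sdkKEY environment variable must hold a valid LaunchDarkly SDK key):
#   export sdkKEY=<your-sdk-key>
#   python app.py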