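# Hugging Face Space app: a Gradio demo for the AWQ-quantized
# bragour/Camel-7b-chat-awq chat model.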
import gradio as gr
import torch
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# Load the AWQ-quantized model weights and the matching tokenizer
model_path = "bragour/Camel-7b-chat-awq"
model = AutoAWQForCausalLM.from_quantized(
    model_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)
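# NOTE: AWQ inference assumes a CUDA-capable GPU; the prompt tensor is moved
# to the GPU with .cuda() below, so this app will not run on CPU-only hardware.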

def respond(message):
    # Wrap the message in the <s>[INST] ... [/INST] chat template the model expects
    formatted_prompt = f"<s>[INST]{message}[/INST]"
    tokens = tokenizer(formatted_prompt, return_tensors="pt").input_ids.cuda()
    # Generate the response with the local model (greedy decoding)
    result = model.generate(
        tokens,
        do_sample=False,
        max_new_tokens=200,
    )
    # Decode only the newly generated tokens, skipping the echoed prompt
    response = tokenizer.decode(result[0][tokens.shape[1]:], skip_special_tokens=True)
    return response

# Define the Gradio interface: one text input, one text output
demo = gr.Interface(
    fn=respond,
    inputs="text",
    outputs="text",
)
if __name__ == "__main__":
    demo.launch()
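
# Hypothetical usage sketch (names assumed, not part of the original file):
# once the Space is running, the single Interface endpoint can be called
# programmatically with gradio_client.
#
#   from gradio_client import Client
#   client = Client("bragour/SPACE_NAME")  # replace with the actual Space id (assumed)
#   print(client.predict("What is the capital of Egypt?", api_name="/predict"))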