import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
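
# Assumed dependencies (versions are not pinned here): gradio, peft, and a
# transformers release with GGUF support (>= 4.41) together with the `gguf`
# package it uses to parse GGUF checkpoints.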
# Hugging Face repository IDs
base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
adapter_repo = "Mat17892/llama_lora_gguf"

# Load the tokenizer and base model directly from the GGUF checkpoint.
# transformers reads GGUF files via the `gguf_file` argument and dequantizes
# the weights to regular torch tensors on load, so no separate download step
# is needed.
print("Loading base model and tokenizer...")
gguf_filename = "Llama-3.2-3B-Instruct-Q8_0.gguf"
tokenizer = AutoTokenizer.from_pretrained(base_model_repo, gguf_file=gguf_filename)
base_model = AutoModelForCausalLM.from_pretrained(base_model_repo, gguf_file=gguf_filename)

# Apply the LoRA adapter on top of the base model. PeftModel.from_pretrained
# expects a repo or directory in the standard PEFT layout (adapter_config.json
# plus adapter weights); a lone GGUF adapter file cannot be loaded by peft, so
# this assumes the adapter repo also ships the PEFT-format files.
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_repo)
print("Model is ready!")
# Function for inference
def chat_with_model(user_input, chat_history):
    """
    Generate a response from the model using the chat history and user input.
    """
    # Rebuild the running conversation as a plain-text prompt
    prompt = ""
    for user, ai in chat_history:
        prompt += f"User: {user}\nAI: {ai}\n"
    prompt += f"User: {user_input}\nAI:"  # Add the latest user input

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate a response, decoding only the newly generated tokens --
    # decoding outputs[0] whole would echo the entire prompt back into the chat
    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # The model may run on and draft the next "User:" turn; keep only its reply
    response = response.split("\nUser:")[0].strip()

    # Update chat history
    chat_history.append((user_input, response))
    return chat_history, chat_history
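
# NOTE: Llama-3.2-Instruct was trained with its own chat template, so the
# plain "User:/AI:" prompt above is a simplification. A closer-to-training
# variant (assuming the GGUF tokenizer carries the template metadata) would
# build the history as role/content dicts and render it with:
#   prompt = tokenizer.apply_chat_template(messages, tokenize=False,
#                                          add_generation_prompt=True)
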
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
    chatbot = gr.Chatbot(label="Chat with the Model")
    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")
    chat_history = gr.State([])

    # Link components
    submit_btn.click(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )
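
    # Pressing Enter in the textbox triggers the same handler as the button
    user_input.submit(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )
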
# Launch the Gradio app
demo.launch()
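
# For a temporary shareable URL instead of localhost only: demo.launch(share=True)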