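"""Gradio chat demo: a Llama-3.2-3B-Instruct GGUF base model with a LoRA adapter.

Loads the base model and tokenizer with transformers, attaches the LoRA adapter
with PEFT, and serves a simple chat interface with Gradio.
"""
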
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hugging Face repository IDs
base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
base_model_file = "Llama-3.2-3B-Instruct-Q8_0.gguf"
adapter_repo = "Mat17892/llama_lora_gguf"

# Load the tokenizer and base model.
# transformers can load a GGUF checkpoint directly via the `gguf_file` argument;
# the weights are dequantized to full precision on load, so expect a sizable
# memory footprint for the 3B model.
print("Loading base model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_repo, gguf_file=base_model_file)
base_model = AutoModelForCausalLM.from_pretrained(base_model_repo, gguf_file=base_model_file)

# Attach the LoRA adapter.
# Note: PEFT cannot read a raw .gguf adapter file (such as the repo's
# llama_lora_adapter.gguf); loading from the repo ID assumes it also contains a
# standard PEFT adapter (adapter_config.json plus adapter weights).
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_repo)
model.eval()

print("Model is ready!")

# Function for inference
def chat_with_model(user_input, chat_history):
    """
    Generate a response from the model using the chat history and user input.
    """
    # Build a plain-text prompt from the conversation so far
    # (tokenizer.apply_chat_template could be used instead for the model's
    # native instruct format).
    prompt = ""
    for user, ai in chat_history:
        prompt += f"User: {user}\nAI: {ai}\n"
    prompt += f"User: {user_input}\nAI:"  # Add latest user input

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate a response and decode only the newly generated tokens;
    # decoding the full output would echo the entire prompt back to the user.
    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Update chat history
    chat_history.append((user_input, response))
    return chat_history, chat_history

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
    chatbot = gr.Chatbot(label="Chat with the Model")

    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")

    chat_history = gr.State([])

    # Link components
    submit_btn.click(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )
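    # Optional (not in the original app): also send the message when the user
    # presses Enter in the textbox, using the same handler as the Send button.
    user_input.submit(
        chat_with_model,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )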

# Launch the Gradio app
demo.launch()
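
# To run locally (a sketch; package names assumed, versions may differ):
#   pip install gradio transformers peft huggingface_hub gguf torch
#   python app.py
# GGUF loading in transformers requires the `gguf` package and dequantizes the
# weights, so the 3B Q8_0 checkpoint needs several gigabytes of RAM.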