llama-3.2-3B-Instruct

Running on Zero

File size: 4,438 Bytes

8c029ff
 
 
 
 
 
 
9dffb16
237a953
8c029ff
 
9147ca6
8c029ff
9147ca6
 
 
8c029ff
 
 
 
 
237a953
8c029ff
 
db9d573
2ea7af4
 
237a953
e45a115
 
8c029ff
2ea7af4
fd91d6d
237a953
b4e4cc6
2ea7af4
db9d573
411e698
db9d573
 
fd91d6d
237a953
fd91d6d
 
 
 
 
 
b4e4cc6
fd91d6d
 
 
db9d573
fd91d6d
4b22725
8c029ff
 
7c32a88
8c029ff
 
 
 
 
 
7c32a88
 
 
 
 
 
4b22725
7c32a88
8c029ff
 
7c32a88
8c029ff
 
 
 
 
 
 
 
 
2ea7af4
8c029ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73fa276
8c029ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c5913d
8c029ff
 
 
 
 
 
 
 
fd91d6d

import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import AutoPeftModelForCausalLM

# Markdown rendered at the top of the demo page.
DESCRIPTION = """\
# Llama 3.2 3B Instruct

Llama 3.2 3B is Meta's latest iteration of open LLMs.
This is a demo of [`meta-llama/Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct), fine-tuned for instruction following.
For more details, please check [our post](https://huggingface.co/blog/llama32).
"""

# Generation-length limits: the default value and the upper bound of the
# "Max new tokens" slider.
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_MAX_NEW_TOKENS = 2048

# Prompts longer than this many tokens are trimmed before generation.
# Overridable through the environment; defaults to 4096.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Hugging Face access token used for the Hub upload (None when the variable is unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

# First CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Checkpoint that supplies the tokenizer (and therefore the chat template).
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the LoRA adapter in half precision and merge it into a plain model,
# so inference and the exported checkpoint do not need PEFT.
peft_model = AutoPeftModelForCausalLM.from_pretrained(
    "ehristoforu/fd-lora-16x32",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
merged_model = peft_model.merge_and_unload()

# Write the merged weights and the tokenizer side by side so the folder is a
# complete, loadable checkpoint.
merged_model.save_pretrained("./coolqwen")
tokenizer.save_pretrained("./coolqwen")

from huggingface_hub import HfApi

api = HfApi()

# Publish the merged checkpoint to the Hub. This runs on every start of the app.
api.upload_folder(
    folder_path="./coolqwen",
    repo_id="ehristoforu/fd-lora-merged-16x32",
    repo_type="model",
    token=HF_TOKEN,
)

# Place the model on the selected device. Previously `device` was computed but
# never applied, so the float16 model stayed wherever it was loaded and
# `generate` sent its inputs there as well.
merged_model.to(device)
merged_model.eval()


def _sampling_kwargs(temperature: float, top_p: float, top_k: int) -> dict:
    """Return the decoding options for ``model.generate``.

    A non-positive temperature cannot be used for sampling, so it selects
    greedy decoding instead; otherwise the sampling settings are forwarded.
    """
    if temperature <= 0:
        return {"do_sample": False}
    return {
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    }


@spaces.GPU()
def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` given the prior conversation.

    Args:
        message: The new user message.
        chat_history: Earlier turns as role/content dicts; the new message is
            appended as a ``"user"`` turn.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; ``0`` (or less) selects greedy decoding.
        top_p: Nucleus-sampling threshold (ignored for greedy decoding).
        top_k: Top-k sampling cutoff (ignored for greedy decoding).
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Yields:
        The reply accumulated so far, each time the streamer delivers more text.
    """
    conversation = [*chat_history, {"role": "user", "content": message}]

    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens so the prompt fits the limit.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(merged_model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
        **_sampling_kwargs(temperature, top_p, top_k),
    }

    # Generation runs in a background thread; this generator consumes the
    # streamer and forwards partial text to the UI as it arrives.
    worker = Thread(target=merged_model.generate, kwargs=generate_kwargs)
    worker.start()

    response = ""
    for text in streamer:
        response += text
        yield response

    # The streamer is exhausted, so generation has finished; reap the thread.
    worker.join()


# Settings for the extra sliders shown under the chat box, in the same order
# as the optional parameters of `generate` (max_new_tokens, temperature,
# top_p, top_k, repetition_penalty).
_SLIDER_SPECS = (
    {
        "label": "Max new tokens",
        "minimum": 1,
        "maximum": MAX_MAX_NEW_TOKENS,
        "step": 1,
        "value": DEFAULT_MAX_NEW_TOKENS,
    },
    {
        "label": "Temperature",
        "minimum": 0.0,
        "maximum": 4.0,
        "step": 0.1,
        "value": 0.6,
    },
    {
        "label": "Top-p (nucleus sampling)",
        "minimum": 0.05,
        "maximum": 1.0,
        "step": 0.05,
        "value": 0.9,
    },
    {
        "label": "Top-k",
        "minimum": 1,
        "maximum": 1000,
        "step": 1,
        "value": 50,
    },
    {
        "label": "Repetition penalty",
        "minimum": 1.0,
        "maximum": 2.0,
        "step": 0.05,
        "value": 1.2,
    },
)

# One-click example prompts; each entry is a single-element row (message only).
_EXAMPLE_PROMPTS = [
    ["Hello there! How are you doing?"],
    ["Can you explain briefly to me what is the Python programming language?"],
    ["Explain the plot of Cinderella in a sentence."],
    ["How many hours does it take a man to eat a Helicopter?"],
    ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
]

# Chat UI wired to `generate`; the sliders feed its optional parameters.
# NOTE(review): `generate` treats the history as a list of role/content dicts;
# confirm the installed Gradio version delivers history in that format by default.
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[gr.Slider(**spec) for spec in _SLIDER_SPECS],
    stop_btn=None,
    examples=_EXAMPLE_PROMPTS,
    cache_examples=False,
)

# Page layout: description text, a "duplicate this Space" button, then the chat UI.
demo = gr.Blocks(css="style.css", fill_height=True)
with demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
    )
    chat_interface.render()

if __name__ == "__main__":
    # Limit the request queue to 20 pending jobs, then start the server.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()