Spaces:

Genius-Society
/

DeepSeek_R1_Qwen_7B

Running

File size: 3,502 Bytes

9d250e2
859baec
9d250e2
 
859baec
 
9d250e2
 
 
c54c298
859baec
9d250e2
 
859baec
9d250e2
 
859baec
9d250e2
 
 
859baec
 
9d250e2
 
 
 
 
859baec
9d250e2
832cc9b
2c32302
 
 
 
 
832cc9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d250e2
 
 
 
 
859baec
9d250e2
859baec
 
 
9d250e2
2c32302
 
 
 
9d250e2
 
 
c54c298
9d250e2

import torch
import gradio as gr
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer


# Hugging Face Hub repository id of the checkpoint this app serves.
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

# Repository id without the owner prefix; used in the UI title and description.
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

# Prompts longer than this many tokens are cut down to their last
# CONTEXT_LENGTH tokens before generation.
CONTEXT_LENGTH = 16_000

# Text shown under the title of the chat interface.
DESCRIPTION = (
    f"This is a HuggingFace deployment instance of {MODEL_NAME} model, "
    "if you have computing power, you can test by cloning to local "
    "or forking to an account with purchased GPU environment"
)


def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content messages.

    Both history layouts are accepted: ``(user, assistant)`` pairs and
    message dicts that already carry ``role`` / ``content`` keys.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            messages.append({"role": turn["role"], "content": turn["content"]})
            continue
        user, assistant = turn
        for role, content in (("user", user), ("assistant", assistant)):
            # A missing side of a turn is skipped rather than rendered as "None".
            if content is not None:
                messages.append({"role": role, "content": content})
    return messages


def _generate_into_streamer(streamer, generate_kwargs):
    """Run ``model.generate`` and make sure *streamer* is always terminated.

    ``generate`` closes the streamer itself on success. If it raises (for
    example on an invalid sampling value), the error text is pushed into the
    stream together with the end-of-stream signal so the consumer loop in
    ``predict`` shows the error instead of blocking forever.
    """
    try:
        model.generate(**generate_kwargs)
    except Exception as exc:  # boundary of the worker thread: report, don't die silently
        streamer.on_finalized_text(f"{exc}", stream_end=True)


def predict(
    message,
    history,
    system_prompt,
    temperature,
    max_new_tokens,
    top_k,
    repetition_penalty,
    top_p,
):
    """Stream the model's reply to *message* for a Gradio ``ChatInterface``.

    Reads the module-level ``device``, ``tokenizer`` and ``model`` that the
    ``__main__`` block sets up.

    Args:
        message: The new user message.
        history: Previous turns, as ``(user, assistant)`` pairs or as
            ``role`` / ``content`` message dicts.
        system_prompt: System message placed at the start of the conversation.
        temperature: Sampling temperature; ``0`` (or less) selects greedy
            decoding, because sampling requires a strictly positive value.
        max_new_tokens: Maximum number of tokens to generate.
        top_k: Top-k sampling cutoff.
        repetition_penalty: Repetition penalty forwarded to ``generate``.
        top_p: Nucleus sampling cutoff.

    Yields:
        The reply accumulated so far, once per streamed text chunk. If setup
        or generation fails, the error message is yielded as the reply.
    """
    stop_tokens = ("<|endoftext|>", "<|im_end|>", "|im_end|")
    try:
        if device == torch.device("cpu"):
            raise EnvironmentError(
                "If you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
            )

        # Let the tokenizer render the conversation with the checkpoint's own
        # chat template instead of hand-writing the special tokens.
        messages = [{"role": "system", "content": system_prompt}]
        messages += _history_to_messages(history)
        messages.append({"role": "user", "content": message})
        enc = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        # Keep only the most recent CONTEXT_LENGTH tokens of an over-long prompt.
        input_ids = enc["input_ids"][:, -CONTEXT_LENGTH:]
        attention_mask = enc["attention_mask"][:, -CONTEXT_LENGTH:]

        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        generate_kwargs = dict(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            streamer=streamer,
            # UI sliders may deliver floats; generate expects whole numbers here.
            max_new_tokens=int(max_new_tokens),
            repetition_penalty=repetition_penalty,
        )
        if temperature > 0:
            generate_kwargs.update(
                do_sample=True,
                temperature=temperature,
                top_k=int(top_k),
                top_p=top_p,
            )
        else:
            # Temperature 0 is not a valid sampling value: decode greedily.
            generate_kwargs["do_sample"] = False

        worker = Thread(
            target=_generate_into_streamer,
            args=(streamer, generate_kwargs),
        )
        worker.start()
    except Exception as exc:
        # Show the failure as the chat reply instead of raising into the UI.
        yield f"{exc}"
        return

    reply = ""
    for chunk in streamer:
        if chunk in stop_tokens:
            break
        reply += chunk
        yield reply


if __name__ == "__main__":
    # ``device``, ``tokenizer`` and ``model`` are module-level names that
    # predict() reads, so they are assigned here rather than inside a function.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device == torch.device("cuda"):
        # predict() refuses to generate on CPU, so the weights are only
        # loaded when a GPU is actually available.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

    # Create Gradio interface. The additional inputs are passed to predict()
    # positionally, in this order, after ``message`` and ``history``.
    gr.ChatInterface(
        predict,
        title=f"{MODEL_NAME} Deployment Instance",
        description=DESCRIPTION,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
        additional_inputs=[
            gr.Textbox(
                "You are a useful assistant. first recognize user request and then reply carfuly and thinking",
                label="System prompt",
            ),
            gr.Slider(0, 1, 0.6, label="Temperature"),
            # Token counts are whole numbers and at least one token must be
            # requested, hence minimum 1 and an explicit integer step.
            gr.Slider(1, 32000, 10000, step=1, label="Max new tokens"),
            # top-k must be a positive integer; step=1 keeps the UI from
            # submitting fractional values.
            gr.Slider(1, 80, 40, step=1, label="Top K sampling"),
            gr.Slider(0, 2, 1.1, label="Repetition penalty"),
            gr.Slider(0, 1, 0.95, label="Top P sampling"),
        ],
    ).queue().launch()