Spaces:

jonathanjordan21
/

mos-mamba-chat

Sleeping

File size: 3,312 Bytes

b538faf
 
 
2ef9ba6
 
 
 
 
b232ee4
2ef9ba6
 
 
 
 
 
78b95a0
b232ee4
2ef9ba6
 
 
 
 
 
 
 
 
0f72328
 
2ef9ba6
b232ee4
2ef9ba6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f72328
2ef9ba6
fe0550e
b232ee4
 
 
5d3a921
fe0550e
 
 
2ef9ba6
fe0550e
2ef9ba6
78b95a0
 
 
b232ee4
b538faf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f4db35
b538faf
fd2ed4e
b538faf

import gradio as gr
from huggingface_hub import InferenceClient

# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TextStreamer
from peft import PeftConfig, PeftModel

# Single Hub repository that is used for the adapter config, the tokenizer,
# the base causal-LM weights and the PEFT adapter weights.
_REPO_ID = "jonathanjordan21/mos-mamba-6x130m-trainer"

# Adapter configuration; loaded here but not read again in this file.
config = PeftConfig.from_pretrained(_REPO_ID)

# trust_remote_code=True lets the repository supply its own tokenizer/model code.
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID, trust_remote_code=True)

# Base model, with the tokenizer's EOS id forwarded to `from_pretrained`.
_base_model = AutoModelForCausalLM.from_pretrained(
    _REPO_ID,
    eos_token_id=tokenizer.eos_token_id,
    trust_remote_code=True,
)

# Attach the PEFT adapter from the same repository, then merge and unload it
# so that `model` is what the chat handler calls `generate` on.
model = PeftModel.from_pretrained(_base_model, _REPO_ID).merge_and_unload()

# Startup log of the EOS id the merged model ended up with.
print(model.config.eos_token_id)



def invoke(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate one assistant reply with the module-level ``model``.

    Args:
        message: The newest user message.
        history: Earlier ``(user, assistant)`` turns; empty entries in a
            pair are skipped.
        system_message: Text sent as the leading ``system`` message.
        max_tokens: Upper bound on newly generated tokens (coerced to int,
            since it comes from a UI slider).
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.

    Returns:
        The decoded text of the newly generated tokens only, i.e. without
        the echoed prompt and without special tokens.
    """
    messages = [{"role": "system", "content": system_message}]

    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    messages.append({"role": "user", "content": message})

    tokens = tokenizer.apply_chat_template(
        messages, return_tensors='pt', add_generation_prompt=True
    )
    # NOTE(review): assumes `tokens` is a (batch, seq_len) tensor of input
    # ids, as it is passed straight to `generate` - confirm for this tokenizer.
    prompt_length = tokens.shape[-1]

    response = model.generate(
        tokens,
        eos_token_id=model.config.eos_token_id,
        max_new_tokens=int(max_tokens),
        # Sampling must be enabled for temperature/top_p to have an effect;
        # previously both UI controls were silently ignored.
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
    )

    # The output sequence starts with the prompt ids; drop them so the chat
    # window shows only the model's answer, not the whole chat template.
    new_tokens = response[:, prompt_length:]
    res = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

    return res[0]


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a reply from a hosted chat-completion endpoint.

    Args:
        message: The newest user message.
        history: Earlier ``(user, assistant)`` turns; empty entries in a
            pair are skipped.
        system_message: Text sent as the leading ``system`` message.
        max_tokens: Forwarded as ``max_tokens`` to ``chat_completion``.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.

    Yields:
        The reply accumulated so far, once per streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]

    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    messages.append({"role": "user", "content": message})

    # The module-level `client` is only present as a commented-out line in
    # this file, so referencing it directly raised NameError. Use it when it
    # exists, otherwise build the client that commented-out line describes.
    chat_client = globals().get("client")
    if chat_client is None:
        chat_client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

    response = ""

    # `chunk` instead of `message`, so the parameter is not shadowed.
    for chunk in chat_client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content

        # A streamed delta may carry no text; `response += None` would raise.
        if token is not None:
            response += token
        yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Extra controls shown with the chat box. Their order matches the parameters
# that `invoke` takes after `message` and `history`:
# system_message, max_tokens, temperature, top_p.
_extra_controls = [
    gr.Textbox(label="System message", value="You are a helpful assistant."),
    gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        minimum=0.1,
        maximum=1.0,
        step=0.05,
        value=0.95,
    ),
]

# Chat UI backed by the locally loaded model through `invoke`.
demo = gr.ChatInterface(invoke, additional_inputs=_extra_controls)


# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()