from transformers import pipeline
import gradio as gr
import json
import time
# Initialize the text-generation pipeline with the Quble instruct model
pipe = pipeline("text-generation", model="Blexus/Quble_test_model_v1_INSTRUCT_v1")
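# The "text-generation" pipeline returns a list of dicts such as
#   [{"generated_text": "<prompt + completion>"}]
# which is why generate() below indexes into [0]["generated_text"].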
def format_prompt(message, system, history):
    """Build the model's chat prompt from the system message and the turn history."""
    prompt = f"SYSTEM: {system}\n<|endofsystem|>\n"
    for entry in history:
        if len(entry) == 2:
            user_prompt, bot_response = entry
            prompt += f"USER: {user_prompt}\n\n\nASSISTANT: {bot_response}<|endoftext|>\n"
    prompt += f"USER: {message}\n\n\nASSISTANT:"
    return prompt
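# For example, format_prompt("How are you?", "You are helpful.", [("Hi", "Hello!")])
# produces:
#
#   SYSTEM: You are helpful.
#   <|endofsystem|>
#   USER: Hi
#
#
#   ASSISTANT: Hello!<|endoftext|>
#   USER: How are you?
#
#
#   ASSISTANT: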
def generate(prompt, history, system, temperature=0.9, max_new_tokens=4096, top_p=0.9, repetition_penalty=1.2):
    # gr.ChatInterface calls this as fn(message, history, *additional_inputs),
    # so history must precede the extra inputs in the signature.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    formatted_prompt = format_prompt(prompt, system, history)
    response_text = "We are sorry but Quble doesn't know how to answer."
    # Generate the full response up front (no token streaming from the pipeline itself)
    try:
        response = pipe(
            formatted_prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # sampling must be enabled for temperature/top_p to apply
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )[0]["generated_text"]
        response_text = response.split("ASSISTANT:")[-1].strip()
        # Simulate streaming by yielding the accumulated response one character at a time
        accumulated_response = ""
        for char in response_text:
            accumulated_response += char
            yield accumulated_response
            time.sleep(0.02)  # slight delay to simulate typing
    except Exception as e:
        print(f"Error generating response: {e}")
        yield response_text  # surface the fallback message instead of failing silently
customCSS = """
#component-7 { /* default element ID of the chat component */
    height: 1600px; /* adjust the height as needed */
    flex-grow: 4;
}
"""
additional_inputs = [
    gr.Textbox(
        label="System prompt",
        value="You are a helpful assistant, with no access to external functions.",
        info="System prompt",
        interactive=True,
    ),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=1024,
        minimum=64,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]
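# These inputs are forwarded to generate() after (message, history), in this
# order: system, temperature, max_new_tokens, top_p, repetition_penalty.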
with gr.Blocks(theme=gr.themes.Soft(), css=customCSS) as demo:
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
    )

demo.queue().launch(debug=True)