import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp_cuda_tensorcores import Llama
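# NOTE (assumption): `spaces` is the Hugging Face Spaces ZeroGPU helper, and
# `llama_cpp_cuda_tensorcores` is taken to be a CUDA tensor-core build of
# llama-cpp-python exposing the same `Llama` class; on a plain install,
# `from llama_cpp import Llama` should be a drop-in replacement.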
REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True
SYSTEM_PROMPT = """You are controlling a 2 DOF robot on a 50x50 grid. The robot can move one step in any of the four cardinal directions. The robot can perform the following actions:
- 'up': Move one unit up (increasing y coordinate by 1).
- 'down': Move one unit down (decreasing y coordinate by 1).
- 'left': Move one unit left (decreasing x coordinate by 1).
- 'right': Move one unit right (increasing x coordinate by 1).
Given a target coordinate, your task is to calculate and output the shortest sequence of commands that will move the robot from its current position to the target position.
Output Format:
- Begin with the exact phrase: 'The full list is:'.
- Provide the sequence of commands as a JSON array, with each command as a string. Commands must be exactly 'up', 'down', 'left', or 'right'.
- All coordinates should be formatted as JSON objects with keys 'x' and 'y' and integer values. For example, the starting position should be output as {'x': 0, 'y': 0}.
- When calling tools, ensure that all arguments use this JSON object format for coordinates, with keys 'x' and 'y'.
- Example of correct output:
If the target coordinate is {'x': 2, 'y': 3}, your response should include:
'The full list is: ["right", "right", "up", "up", "up"]'
And for tool calls, use:
'tool_calls': [{'function': {'name': 'validate_path', 'arguments': {'commands': ["right", "right", "up", "up", "up"], 'start_position': {'x': 0, 'y': 0}, 'target_position': {'x': 2, 'y': 3}}}}]'
Please ensure that all output strictly adheres to these formats. If any output is not in the correct format, redo the task and correct the output before providing the final answer."""
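# For reference, a minimal sketch (not used by the app) of the path planning
# the system prompt asks the model to perform: on a 4-connected grid, moving
# the Manhattan distance along each axis yields a shortest command sequence.
def _example_plan_path(start, target):
    """Illustrative only: one shortest command list from start to target."""
    dx = target["x"] - start["x"]
    dy = target["y"] - start["y"]
    commands = ["right" if dx > 0 else "left"] * abs(dx)
    commands += ["up" if dy > 0 else "down"] * abs(dy)
    return commands
# _example_plan_path({"x": 0, "y": 0}, {"x": 2, "y": 3})
# -> ["right", "right", "up", "up", "up"]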
TOKEN_STOP = ["<|eot_id|>"]
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
    "<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
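# Assembled prompt shape for a single turn (schematic, derived from the
# templates above):
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n{completion streams here}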
TASK_PROMPT = {
    "Assistant": SYSTEM_PROMPT,
}
# css = ".gradio-container {background-image: url('file=./assets/background.png'); background-size: cover; background-position: center; background-repeat: no-repeat;}"
class ChatLLM:
    """Lazy wrapper around a llama.cpp model with Llama 3 prompt formatting."""

    def __init__(self, config_model):
        self.llm = None
        self.config_model = config_model
        # Loading is deferred to the first request so the model is created
        # inside the GPU context (see the @spaces.GPU decorator below).

    def load_cpp_model(self):
        self.llm = Llama(**self.config_model)

    def apply_chat_template(self, history, system_message):
        """Flatten the chat history into a Llama 3 instruct prompt string."""
        history = history or []
        messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
        for msg in history:
            messages += (
                USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
            )
            # Close the assistant turn only if it already has content; the
            # final (empty) turn is left open for the model to complete.
            messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""
        print(messages)  # debug: inspect the assembled prompt
        return messages
    @spaces.GPU(duration=30)
    def response(
        self,
        history,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ):
        messages = self.apply_chat_template(history, system_message)
        history[-1][1] = ""
        if not self.llm:
            print("Loading model")
            self.load_cpp_model()
        for output in self.llm(
            messages,
            echo=False,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            stop=TOKEN_STOP,
        ):
            answer = output["choices"][0]["text"]
            # Append each streamed token to the last message in the history.
            history[-1][1] += answer
            # Yield twice: once for the Chatbot display, once for the state.
            yield history, history
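# NOTE: `response` is a generator; when wired into the `.then()` chain in
# `gui()` below, Gradio forwards each yielded (chatbot, state) pair to the
# browser, which is what produces token-by-token streaming.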
def user(message, history):
    history = history or []
    # Append the user's message with an empty assistant slot for streaming.
    history.append([message, ""])
    return "", history
def clear_chat(chat_history_state, chat_message):
    chat_history_state = []
    chat_message = ""
    return chat_history_state, chat_message
def gui(llm_chat):
    with gr.Blocks() as app:
        gr.Markdown("# Llama 3 70B Instruct GGUF")
        gr.Markdown(
            f"""
            ### This demo serves the model {MODEL_NAME} from the repository {REPO_ID}, powered by the llama.cpp backend.
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
                    label="Chat",
                    height=700,
                    avatar_images=(
                        "assets/avatar_user.jpeg",
                        "assets/avatar_llama.jpeg",
                    ),
                )
            with gr.Column(scale=1):
                with gr.Row():
                    message = gr.Textbox(
                        label="Message",
                        placeholder="Ask me anything.",
                        lines=3,
                    )
                with gr.Row():
                    submit = gr.Button(value="Send message", variant="primary")
                    clear = gr.Button(value="New chat", variant="primary")
                    stop = gr.Button(value="Stop", variant="secondary")
with gr.Accordion("Contextual Prompt Editor"):
default_task = "Assistant"
task_prompts_gui = gr.Dropdown(
TASK_PROMPT,
value=default_task,
label="Prompt selector",
visible=True,
interactive=True,
)
system_msg = gr.Textbox(
TASK_PROMPT[default_task],
label="System Message",
placeholder="system prompt",
lines=4,
)
def task_selector(choice):
return gr.update(value=TASK_PROMPT[choice])
task_prompts_gui.change(
task_selector,
[task_prompts_gui],
[system_msg],
)
with gr.Accordion("Advanced settings", open=False):
with gr.Column():
max_tokens = gr.Slider(
20, 4096, label="Max Tokens", step=20, value=400
)
temperature = gr.Slider(
0.2, 2.0, label="Temperature", step=0.1, value=0.8
)
top_p = gr.Slider(
0.0, 1.0, label="Top P", step=0.05, value=0.95
)
top_k = gr.Slider(
0, 100, label="Top K", step=1, value=40
)
repeat_penalty = gr.Slider(
0.0,
2.0,
label="Repetition Penalty",
step=0.1,
value=1.1,
)
        chat_history_state = gr.State()
        # "New chat" clears the stored history and the textbox...
        clear.click(
            clear_chat,
            inputs=[chat_history_state, message],
            outputs=[chat_history_state, message],
            queue=False,
        )
        # ...and empties the Chatbot display.
        clear.click(lambda: None, None, chatbot, queue=False)
        submit_click_event = submit.click(
            fn=user,
            inputs=[message, chat_history_state],
            outputs=[message, chat_history_state],
            queue=True,
        ).then(
            fn=llm_chat.response,
            inputs=[
                chat_history_state,
                system_msg,
                max_tokens,
                temperature,
                top_p,
                top_k,
                repeat_penalty,
            ],
            outputs=[chatbot, chat_history_state],
            queue=True,
        )
        # "Stop" cancels the in-flight submit/response chain.
        stop.click(
            fn=None,
            inputs=None,
            outputs=None,
            cancels=[submit_click_event],
            queue=False,
        )
    return app
if __name__ == "__main__":
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)
config_model = {
"model_path": model_path,
"n_ctx": MAX_CONTEXT_LENGTH,
"n_gpu_layers": -1 if CUDA else 0,
}
llm_chat = ChatLLM(config_model)
app = gui(llm_chat)
app.queue(default_concurrency_limit=40)
app.launch(
max_threads=40,
share=False,
show_error=True,
quiet=False,
debug=True,
allowed_paths=["./assets/"],
)
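# Assumed environment (not pinned here): a CUDA GPU with enough VRAM for the
# Q3_K_L 70B quant, plus `gradio`, `huggingface_hub`, and the llama.cpp wheel
# imported above. Run locally with: python app.py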