import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp_cuda_tensorcores import Llama

REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True

SYSTEM_PROMPT = """You are controlling a 2 DOF robot on a 50x50 grid. The robot can move one step in any of the four cardinal directions. The robot can perform the following actions:

- 'up': Move one unit up (increasing y coordinate by 1).
- 'down': Move one unit down (decreasing y coordinate by 1).
- 'left': Move one unit left (decreasing x coordinate by 1).
- 'right': Move one unit right (increasing x coordinate by 1).
Given a target coordinate, your task is to calculate and output the shortest sequence of commands that will move the robot from its current position to the target position.

Output Format:
- Begin with the exact phrase: 'The full list is:'.
- Provide the sequence of commands as a JSON array, with each command as a string. Commands must be exactly 'up', 'down', 'left', or 'right'.
- All coordinates should be formatted as JSON objects with keys 'x' and 'y' and integer values. For example, the starting position should be output as {'x': 0, 'y': 0}.
- When calling tools, ensure that all arguments use this JSON object format for coordinates, with keys 'x' and 'y'.
- Example of correct output:
  If the target coordinate is {'x': 2, 'y': 3}, your response should include:
  'The full list is: ["right", "right", "up", "up", "up"]'
  And for tool calls, use:
  'tool_calls': [{'function': {'name': 'validate_path', 'arguments': {'commands': ["right", "right", "up", "up", "up"], 'start_position': {'x': 0, 'y': 0}, 'target_position': {'x': 2, 'y': 3}}}}]'

Please ensure that all output strictly adheres to these formats. If any output is not in the correct format, redo the task and correct the output before providing the final answer."""
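
# Illustrative sketch (not wired into the app): a plain-Python reference for the task
# the SYSTEM_PROMPT describes. `plan_commands` builds the shortest command sequence and
# `validate_path` mirrors the shape of the hypothetical 'validate_path' tool mentioned
# in the prompt; both names and signatures are assumptions for demonstration only.
def plan_commands(start, target):
    """Return the shortest list of 'up'/'down'/'left'/'right' moves from start to target."""
    dx = target["x"] - start["x"]
    dy = target["y"] - start["y"]
    commands = ["right" if dx > 0 else "left"] * abs(dx)
    commands += ["up" if dy > 0 else "down"] * abs(dy)
    return commands


def validate_path(commands, start_position, target_position):
    """Replay the commands from start_position and check they land on target_position."""
    step = {"up": (0, 1), "down": (0, -1), "left": (-1, 0), "right": (1, 0)}
    x, y = start_position["x"], start_position["y"]
    for cmd in commands:
        dx, dy = step[cmd]
        x, y = x + dx, y + dy
    return {"x": x, "y": y} == target_position
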

TOKEN_STOP = ["<|eot_id|>"]
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
    "<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
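
# Example (illustrative) of a single-turn prompt assembled from the templates above:
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system prompt}<|eot_id|>\n
#   <|start_header_id|>user<|end_header_id|>\n\n{user message}<|eot_id|>\n
#   <|start_header_id|>assistant<|end_header_id|>\n\n
# Generation stops when the model emits <|eot_id|> (see TOKEN_STOP).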

TASK_PROMPT = {
    "Assistant": SYSTEM_PROMPT,
}

# css = ".gradio-container {background-image: url('file=./assets/background.png'); background-size: cover; background-position: center; background-repeat: no-repeat;}"


class ChatLLM:
    def __init__(self, config_model):
        self.llm = None
        self.config_model = config_model
        # self.load_cpp_model()

    def load_cpp_model(self):
        self.llm = Llama(**self.config_model)

    def apply_chat_template(
        self,
        history,
        system_message,
    ):
        history = history or []

        messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
        for msg in history:
            messages += (
                USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
            )
            messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""

        print(messages)

        # messages = messages[:-1]

        return messages

    @spaces.GPU(duration=30)
    def response(
        self,
        history,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ):

        messages = self.apply_chat_template(history, system_message)

        history[-1][1] = ""

        if not self.llm:
            print("Loading model")
            self.load_cpp_model()

        for output in self.llm(
            messages,
            echo=False,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            stop=TOKEN_STOP,
        ):
            answer = output["choices"][0]["text"]
            history[-1][1] += answer  # append the streamed chunk to the latest assistant message
            # stream the response
            yield history, history


def user(message, history):
    history = history or []
    # Append the user's message to the conversation history
    history.append([message, ""])
    return "", history


def clear_chat(chat_history_state, chat_message):
    # Reset the conversation history and clear the message box.
    return [], ""


def gui(llm_chat):
    with gr.Blocks() as app:
        gr.Markdown("# Llama 3 70B Instruct GGUF")
        gr.Markdown(
            f"""
                ### This demo uses the model {MODEL_NAME} from the repository {REPO_ID}, served by the llama.cpp backend.
                """
        )
        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
                    label="Chat",
                    height=700,
                    avatar_images=(
                        "assets/avatar_user.jpeg",
                        "assets/avatar_llama.jpeg",
                    ),
                )
            with gr.Column(scale=1):
                with gr.Row():
                    message = gr.Textbox(
                        label="Message",
                        placeholder="Ask me anything.",
                        lines=3,
                    )
                with gr.Row():
                    submit = gr.Button(value="Send message", variant="primary")
                    clear = gr.Button(value="New chat", variant="primary")
                    stop = gr.Button(value="Stop", variant="secondary")

                with gr.Accordion("Contextual Prompt Editor"):
                    default_task = "Assistant"
                    task_prompts_gui = gr.Dropdown(
                        choices=list(TASK_PROMPT.keys()),
                        value=default_task,
                        label="Prompt selector",
                        visible=True,
                        interactive=True,
                    )
                    system_msg = gr.Textbox(
                        TASK_PROMPT[default_task],
                        label="System Message",
                        placeholder="system prompt",
                        lines=4,
                    )

                    def task_selector(choice):
                        return gr.update(value=TASK_PROMPT[choice])

                    task_prompts_gui.change(
                        task_selector,
                        [task_prompts_gui],
                        [system_msg],
                    )

                with gr.Accordion("Advanced settings", open=False):
                    with gr.Column():
                        max_tokens = gr.Slider(
                            20, 4096, label="Max Tokens", step=20, value=400
                        )
                        temperature = gr.Slider(
                            0.2, 2.0, label="Temperature", step=0.1, value=0.8
                        )
                        top_p = gr.Slider(
                            0.0, 1.0, label="Top P", step=0.05, value=0.95
                        )
                        top_k = gr.Slider(
                            0, 100, label="Top K", step=1, value=40
                        )
                        repeat_penalty = gr.Slider(
                            0.0,
                            2.0,
                            label="Repetition Penalty",
                            step=0.1,
                            value=1.1,
                        )

                chat_history_state = gr.State()
                clear.click(
                    clear_chat,
                    inputs=[chat_history_state, message],
                    outputs=[chat_history_state, message],
                    queue=False,
                )
                clear.click(lambda: None, None, chatbot, queue=False)

                submit_click_event = submit.click(
                    fn=user,
                    inputs=[message, chat_history_state],
                    outputs=[message, chat_history_state],
                    queue=True,
                ).then(
                    fn=llm_chat.response,
                    inputs=[
                        chat_history_state,
                        system_msg,
                        max_tokens,
                        temperature,
                        top_p,
                        top_k,
                        repeat_penalty,
                    ],
                    outputs=[chatbot, chat_history_state],
                    queue=True,
                )
                stop.click(
                    fn=None,
                    inputs=None,
                    outputs=None,
                    cancels=[submit_click_event],
                    queue=False,
                )
    return app


if __name__ == "__main__":

    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)

    config_model = {
        "model_path": model_path,
        "n_ctx": MAX_CONTEXT_LENGTH,
        "n_gpu_layers": -1 if CUDA else 0,
    }

    llm_chat = ChatLLM(config_model)

    app = gui(llm_chat)

    app.queue(default_concurrency_limit=40)

    app.launch(
        max_threads=40,
        share=False,
        show_error=True,
        quiet=False,
        debug=True,
        allowed_paths=["./assets/"],
    )
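
# Usage note (illustrative): run this script directly (e.g. `python app.py`) to serve the
# Gradio UI locally; share=False keeps it off a public gradio.live link. On Hugging Face
# Spaces (ZeroGPU), the @spaces.GPU(duration=30) decorator above requests a GPU slot for
# roughly 30 seconds per generation call.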