Spaces: Build error
from flask import Flask, request, jsonify
import requests
import json
from typing import List, Dict
import subprocess
import time
from werkzeug.wrappers import Response

app = Flask(__name__)
# Disable gzip compression on every response so clients receive plain JSON
@app.after_request
def disable_gzip_compression(response: Response):
    response.headers["Content-Encoding"] = "identity"
    return response
# Function to start the LLaMA server
def start_llama_server():
    # Windows variant (kept for reference):
    # print("Starting llama server")
    # llama_command = [
    #     'llama-server.exe',
    #     '-m', './models/Qwen2.5-7B-Instruct-Q4_K_M.gguf',
    #     '-c', '2048'
    # ]
    # subprocess.Popen(llama_command)
    # print("Done starting llama server")
    print("Starting llama server")
    llama_command = [
        "./llama-server",  # Adjust for Linux (Hugging Face Spaces)
        "-m", "./models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        "-c", "2048",
    ]
    # Start the LLaMA server in the background and redirect its output to a log file
    subprocess.Popen(llama_command, stdout=open("llama.log", "w"), stderr=subprocess.STDOUT)
    print("Done starting llama server")
# Initialize the LLaMA server before starting Flask
start_llama_server()

# Wait for the LLaMA server to fully start up (adjust the delay if needed)
time.sleep(10)
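
# Optional alternative to the fixed sleep above: poll the server until it responds.
# This is only a sketch and assumes the llama-server build exposes a GET /health
# endpoint (recent llama.cpp servers do); if yours does not, keep the time.sleep().
def wait_for_llama_server(server_url: str = "http://127.0.0.1:8080", timeout: int = 60) -> bool:
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(f"{server_url}/health", timeout=2).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # server not up yet, retry shortly
        time.sleep(1)
    return False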
# Function to send a request to the model server
def get_response(
    server_url: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_tokens: int = 4096,
    stream: bool = False,  # Disable streaming for testing
) -> str:
    headers = {"Content-Type": "application/json"}
    data = {
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": stream,
    }
    response = requests.post(f"{server_url}/v1/chat/completions", headers=headers, json=data)
    response.raise_for_status()
    result = response.json()
    if "choices" in result and len(result["choices"]) > 0:
        return result["choices"][0]["message"]["content"]
    return ""
@app.route("/")
def home():
    response = jsonify({"status": "Flask server is running"})
    response.headers["Content-Encoding"] = "identity"  # Disable compression
    return response
# Route path is assumed here; adjust it to match whatever the client calls
@app.route("/chatbot", methods=["POST"])
def chatbot():
    data = request.json
    user_message = data.get("message", "")
    if not user_message:
        return jsonify({"error": "No message provided"}), 400

    # Previously tested with a hardcoded message:
    # test_message = "Summarize: the product is good in overall. just have some drawbacks. it heats up easily and the accuracy is inconsistent. If all these problems are solved, potential order might be 1000 units."
    server_url = "http://127.0.0.1:8080"  # llama-server runs locally; reach it via loopback, not 0.0.0.0
    messages = [
        {"role": "system", "content": "You are an assistant that helps solve problems based on the user's input."},
        {"role": "user", "content": user_message},
    ]
    response_text = get_response(server_url, messages)

    # Log the exchange to the console
    print(f"User message: {user_message}")
    print(f"Assistant: {response_text}")
    return jsonify({"response": response_text})
if __name__ == "__main__":
    print("Flask server is running...")
    # app.run(host="127.0.0.1", port=8000, debug=True)
    # use_reloader=False so the debug reloader does not spawn a second llama-server process
    app.run(host="0.0.0.0", port=7860, debug=True, use_reloader=False)