from flask import Flask, request, jsonify
import requests
import json
import subprocess
import time
from typing import Dict, List
from werkzeug.wrappers import Response

app = Flask(__name__)


@app.after_request
def disable_gzip_compression(response: Response):
    response.headers["Content-Encoding"] = "identity"
    return response


# Function to start the LLaMA server as a background process
def start_llama_server():
    print("Starting llama server")
    # On Windows the binary is "llama-server.exe"; the path below targets Linux (Hugging Face Spaces)
    llama_command = [
        "./llama-server",
        "-m", "./models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        "-c", "2048",
    ]
    # Start the LLaMA server and redirect its output to a log file so it runs in the background
    subprocess.Popen(llama_command, stdout=open("llama.log", "w"), stderr=subprocess.STDOUT)
    print("Done starting llama server")


# Initialize the LLaMA server before starting Flask
start_llama_server()

# Wait for the LLaMA server to fully start up (adjust the delay if needed)
time.sleep(10)
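

# The fixed sleep above is a rough heuristic. As a sketch of a more robust
# alternative (not wired in by default), the helper below polls the server
# until it answers. It assumes llama-server exposes a /health endpoint on
# port 8080; other builds or configurations may differ.
def wait_for_llama_server(url: str = "http://127.0.0.1:8080/health", timeout: float = 60.0) -> bool:
    """Poll the LLaMA server until it responds or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass  # Server not ready yet; keep polling
        time.sleep(1)
    return False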


# Function to send a chat-completion request to the model server
def get_response(
    server_url: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_tokens: int = 4096,
    stream: bool = False,  # Streaming disabled for testing
) -> str:
    headers = {"Content-Type": "application/json"}
    data = {
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": stream,
    }
    response = requests.post(f"{server_url}/v1/chat/completions", headers=headers, json=data)
    response.raise_for_status()
    result = response.json()
    if "choices" in result and len(result["choices"]) > 0:
        return result["choices"][0]["message"]["content"]
    return ""
@app.route("/", methods=["GET"])
def home():
response = jsonify({"status": "Flask server is running"})
response.headers["Content-Encoding"] = "identity" # Disable compression
return response
@app.route("/chat", methods=["POST"])
def chatbot():
data = request.json
user_message = data.get("message", "")
if not user_message:
return jsonify({"error": "No message provided"}), 400
# Request LLaMA model for summarization of the test message (hardcoded)
# test_message = "Summarize: the product is good in overall. just have some drawbacks. it heats up easily and the accuracy is inconsistent. If all these problems are solved, potential order might be 1000 units."
# server_url = "http://127.0.0.1:8080"
server_url = "http://0.0.0.0:8080"
messages = [{"role": "system", "content": "You are an asistant that helps to solve problem based on user's input."},
{"role": "user", "content": user_message}] # Using the hardcoded test message here
response_text = get_response(server_url, messages)
# Print response in terminal (CMD)
print(f"Test message: {user_message}")
print(f"Assistant: {response_text}")
return jsonify({"response": response_text})
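

# Example request against a local run (host/port match app.run below):
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Summarize: the product is good overall, but it heats up easily and the accuracy is inconsistent."}'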


if __name__ == "__main__":
    print("Flask server is running...")
    # For local development: app.run(host="127.0.0.1", port=8000, debug=True)
    app.run(host="0.0.0.0", port=7860, debug=True)