from flask import Flask, request, jsonify
import requests
import json
import subprocess
import time
from typing import Dict, List
from werkzeug.wrappers import Response

app = Flask(__name__)


@app.after_request
def disable_gzip_compression(response: Response):
    response.headers["Content-Encoding"] = "identity"
    return response


# Function to start the LLaMA server as a background process
def start_llama_server():
    print("Starting llama server")
    # On Windows the binary is "llama-server.exe"; the path below targets Linux (Hugging Face Spaces)
    llama_command = [
        "./llama-server",
        "-m", "./models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        "-c", "2048",
    ]
    # Start the LLaMA server and redirect its output to a log file so it runs in the background
    subprocess.Popen(llama_command, stdout=open("llama.log", "w"), stderr=subprocess.STDOUT)
    print("Done starting llama server")


# Initialize the LLaMA server before starting Flask
start_llama_server()

# Wait for the LLaMA server to fully start up (adjust the delay if needed)
time.sleep(10)
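

# The fixed sleep above is a rough heuristic. As a sketch of a more robust
# alternative (not wired in by default), the helper below polls the server
# until it answers. It assumes llama-server exposes a /health endpoint on
# port 8080; other builds or configurations may differ.
def wait_for_llama_server(url: str = "http://127.0.0.1:8080/health", timeout: float = 60.0) -> bool:
    """Poll the LLaMA server until it responds or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass  # Server not ready yet; keep polling
        time.sleep(1)
    return False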


# Function to send a chat-completion request to the model server
def get_response(
    server_url: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_tokens: int = 4096,
    stream: bool = False,  # Streaming disabled for testing
) -> str:
    headers = {"Content-Type": "application/json"}
    data = {
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": stream,
    }
    response = requests.post(f"{server_url}/v1/chat/completions", headers=headers, json=data)
    response.raise_for_status()
    result = response.json()
    if "choices" in result and len(result["choices"]) > 0:
        return result["choices"][0]["message"]["content"]
    return ""
@app.route("/", methods=["GET"])
def home():
response = jsonify({"status": "Flask server is running"})
response.headers["Content-Encoding"] = "identity" # Disable compression
return response
@app.route("/chat", methods=["POST"])
def chatbot():
data = request.json
user_message = data.get("message", "")
if not user_message:
return jsonify({"error": "No message provided"}), 400
# Request LLaMA model for summarization of the test message (hardcoded)
# test_message = "Summarize: the product is good in overall. just have some drawbacks. it heats up easily and the accuracy is inconsistent. If all these problems are solved, potential order might be 1000 units."
# server_url = "http://127.0.0.1:8080"
server_url = "http://0.0.0.0:8080"
messages = [{"role": "system", "content": "You are an asistant that helps to solve problem based on user's input."},
{"role": "user", "content": user_message}] # Using the hardcoded test message here
response_text = get_response(server_url, messages)
# Print response in terminal (CMD)
print(f"Test message: {user_message}")
print(f"Assistant: {response_text}")
return jsonify({"response": response_text})
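

# Example request against a local run (host/port match app.run below):
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Summarize: the product is good overall, but it heats up easily and the accuracy is inconsistent."}'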


if __name__ == "__main__":
    print("Flask server is running...")
    # For local development: app.run(host="127.0.0.1", port=8000, debug=True)
    app.run(host="0.0.0.0", port=7860, debug=True)