from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Load the quantized Phi-4 model from the Hugging Face Hub.
# Llama.from_pretrained fetches the GGUF file (via huggingface_hub),
# caches it locally, and loads it into llama.cpp.
llm = Llama.from_pretrained(
    repo_id="unsloth/phi-4-GGUF",
    filename="phi-4-Q4_K_M.gguf",
)

# Request schema: a system prompt plus the user's query.
class ChatRequest(BaseModel):
    system_prompt: str
    query: str

@app.post("/chat-p4q4")
def chat(request: ChatRequest):
    # create_chat_completion is a blocking call; declaring the endpoint with
    # plain `def` lets FastAPI run it in a worker thread instead of stalling
    # the event loop while the model generates.
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": request.system_prompt},
            {"role": "user", "content": request.query},
        ]
    )
    return {"response": response}
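To try the endpoint, start the server and send a POST request with a JSON body matching ChatRequest. The sketch below is one way to do that; the filename main.py, the port, and the example prompts are assumptions for illustration, not part of the original setup.

# A minimal client sketch, assuming the app above is saved as main.py and
# started with: uvicorn main:app --port 8000
import requests

payload = {
    # Field names must match the ChatRequest model defined on the server.
    "system_prompt": "You are a concise technical assistant.",  # example prompt (assumption)
    "query": "Summarize what a GGUF quantized model is in one sentence.",  # example query (assumption)
}

resp = requests.post("http://localhost:8000/chat-p4q4", json=payload, timeout=120)
resp.raise_for_status()

data = resp.json()
# The endpoint wraps the raw llama.cpp completion object under "response";
# the generated text sits at choices[0].message.content, OpenAI-style.
print(data["response"]["choices"][0]["message"]["content"])

Since the endpoint returns the full completion object, the client also has access to metadata such as token usage; returning only the message content would make the response smaller but less informative.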