from fastapi import FastAPI
import torch
import os

# Set the Hugging Face cache directory before transformers is imported,
# since the library reads this variable at import time.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/cache"

from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()

# Load the model and tokenizer once at startup
model_name = "unsloth/llama-3-8b-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint is pre-quantized with bitsandbytes (4-bit);
# device_map="auto" places the weights on the available GPU.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


@app.get("/")
def read_root():
    return {"Hello": "World"}


@app.post("/generate/")
def generate(prompt: str):
    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Pass the attention mask along with the input ids to avoid generation warnings
    outputs = model.generate(**inputs, max_length=50)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": text}
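

# --- Example client: a minimal sketch, run as a separate script ---
# Assumes the server above is saved as main.py and started locally with
# `uvicorn main:app` on the default port 8000 (both are assumptions, not
# part of the original code). Note that FastAPI treats the plain
# `prompt: str` parameter as a query parameter, so the prompt is sent in
# the query string rather than in a JSON body.
import requests

response = requests.post(
    "http://localhost:8000/generate/",
    params={"prompt": "Once upon a time"},
)
print(response.json()["generated_text"])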