lokidev committed (verified)
Commit fa2391a · 1 Parent(s): 82a8632

Update app.py

Files changed (1):
  1. app.py +25 -112
app.py CHANGED
@@ -1,121 +1,34 @@
- # app.py
  import gradio as gr
- import onnxruntime
- from transformers import AutoTokenizer
- import logging
- import os
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- # Model and Tokenizer paths - Define ONNX model path
- MODEL_NAME = "microsoft/DialoGPT-small"
- ONNX_MODEL_PATH = "dialogpt-small.onnx"  # Path to the ONNX model
- TOKENIZER_NAME = MODEL_NAME  # Use the same name for tokenizer
-
- # Fallback message in case of errors
- FALLBACK_MESSAGE = "Sorry, I am having trouble processing your request. Please try again later."
-
- # Global variables to hold loaded model and tokenizer
- ort_session = None
- tokenizer = None
-
- # --- Model Loading and Preprocessing ---
- def load_model_and_tokenizer():
-     global ort_session, tokenizer
-     try:
-         logging.info(f"Checking for ONNX model at: {ONNX_MODEL_PATH}")
-         if not os.path.exists(ONNX_MODEL_PATH):
-             logging.warning(f"ONNX model not found at {ONNX_MODEL_PATH}. Please ensure it exists. Refer to README for conversion instructions.")
-             return False  # Model file missing - indicate failure
-
-         logging.info("Loading ONNX Runtime session...")
-         ort_session = onnxruntime.InferenceSession(ONNX_MODEL_PATH, providers=['CPUExecutionProvider'])  # Explicitly using CPU provider for simplicity for this example, you could expand providers
-
-         logging.info("Loading tokenizer...")
-         tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
-         logging.info(f"Tokenizer loaded: {tokenizer.__class__.__name__}")
-
-         logging.info("Model and Tokenizer loaded successfully using ONNX Runtime.")
-         return True  # Success
-
-     except Exception as e:
-         logging.error(f"Error during model or tokenizer loading: {e}")
-         if ort_session:
-             del ort_session  # Attempt to cleanup in case of failure during loading
-         ort_session = None
-         tokenizer = None
-         return False  # Failure
-
- # Pre-load model and tokenizer on app startup
- model_loaded_successfully = load_model_and_tokenizer()
-
-
- # --- Inference Function ---
- def predict(message, history):
-     if not model_loaded_successfully:
-         logging.warning("Model not loaded, returning fallback message.")
-         return FALLBACK_MESSAGE
-
-     if ort_session is None or tokenizer is None:  # Double check after global check, for robustness.
-         logging.error("ONNX Session or Tokenizer is unexpectedly None in predict function.")
-         return FALLBACK_MESSAGE
-
-     try:
-         # Reconstruct conversation history for DialoGPT input
-         input_text = ""
-         for human_msg, bot_response in history:  # History comes as list of lists [user_msg, bot_response] pairs
-             input_text += human_msg + tokenizer.eos_token
-         input_text += message + tokenizer.eos_token
-
-         inputs = tokenizer(input_text, return_tensors="np")
-
-         # Get input and output names - essential for ONNX Runtime
-         input_name = ort_session.get_inputs()[0].name
-         output_name = ort_session.get_outputs()[0].name  # Assuming output is logits for generation
-
-         ort_inputs = {input_name: inputs['input_ids']}  # Only input_ids typically needed for simple generation with DialoGPT
-
-         ort_outputs = ort_session.run([output_name], ort_inputs)  # Run inference
-
-         logits = ort_outputs[0]  # logits from the model
-
-         # Basic generation - argmax for simplicity. For better responses, consider more sophisticated decoding (sampling)
-         predicted_token_ids = logits.argmax(axis=-1)  # Pick token with highest probability. Very simple decoding.
-
-         # Decode ONLY the last generated turn to get the bot's response.
-         # Find the EOS token indices to split the input_text (which includes history) and extract only the NEW response
-         generated_text = tokenizer.decode(predicted_token_ids[0, inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
-
-         if not generated_text.strip():  # Handle empty responses from the model.
-             logging.info("Model returned an empty response, using default.")
-             return "I'm not sure what to say."  # Or a more specific fallback.
-
-         return generated_text.strip()
-
-     except Exception as e:
-         logging.error(f"Error during inference: {e}")
-         return FALLBACK_MESSAGE
-
-
- # --- Gradio Interface ---
- if __name__ == "__main__":
-     iface = gr.ChatInterface(
-         fn=predict,
-         textbox=gr.Textbox(placeholder="Type your message here...", label="User Input"),
-         chatbot=gr.Chatbot(label="Chatbot Response"),
-         title="DialoGPT Chatbot (ONNX Runtime)",
-         description="Chat with a simple DialoGPT-small chatbot powered by ONNX Runtime for faster inference. This is a basic demonstration. For better performance in a real-world setting, ensure you have a properly quantized and optimized ONNX model. **Note:** For this Space to work, you must upload a `dialogpt-small.onnx` file. Refer to the README on how to convert and optimize the model.",
-         examples=[
-             ["Hello, how are you today?"],
-             ["Tell me a joke"],
-             ["What is the weather like in London?"]
-         ],
-         # Removed retry_btn, undo_btn, clear_btn to resolve TypeError
-         theme="default"
-     )
-     iface.launch()

  import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
+ model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
+
+ def chatbot(message, history):
+     # Rebuild the conversation from Gradio's history of (user, bot) message pairs,
+     # separating every turn with the eos_token as DialoGPT expects
+     input_text = ""
+     for user_msg, bot_msg in history:
+         input_text += user_msg + tokenizer.eos_token + bot_msg + tokenizer.eos_token
+     input_text += message + tokenizer.eos_token
+
+     # Encode the full conversation and return a PyTorch tensor
+     bot_input_ids = tokenizer.encode(input_text, return_tensors='pt')
+
+     # Generate a response while limiting the total conversation to 1000 tokens
+     chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
+
+     # Decode only the newly generated tokens as the bot's reply
+     output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
+
+     # gr.ChatInterface tracks the conversation history itself, so only the reply is returned
+     return output
+
+ iface = gr.ChatInterface(
+     fn=chatbot,
+     title="DialoGPT Chatbot (Small)",
+     description="Simple chat application using the microsoft/DialoGPT-small model. Try it out!",
+     examples=["Hello", "How are you?", "Tell me a joke"]
+ )
+
+ iface.launch()
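
For a quick local sanity check outside the Space, the same encode/generate/decode round trip can be run directly with transformers. This is a minimal sketch, not part of the commit; the file name and prompt are illustrative, and it assumes transformers and torch are installed:

    # quick_check.py - illustrative only, not part of this commit
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

    # Single-turn prompt, terminated with eos_token as in app.py
    prompt_ids = tokenizer.encode("Hello, how are you?" + tokenizer.eos_token, return_tensors="pt")
    reply_ids = model.generate(prompt_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # Decode only the tokens generated after the prompt
    print(tokenizer.decode(reply_ids[:, prompt_ids.shape[-1]:][0], skip_special_tokens=True))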