from llama_index.llms import HuggingFaceInferenceAPI
from llama_index.llms import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
)
import gradio as gr
import sys
import logging
import torch
from huggingface_hub import InferenceClient
import tqdm as notebook_tqdm
import requests


def download_file(url, filename):
    """Download a file from the specified URL and save it locally under the given filename."""
    response = requests.get(url, stream=True)
    # Check if the request was successful
    if response.status_code == 200:
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    file.write(chunk)
        print(f"Download complete: {filename}")
    else:
        print(f"Error: Unable to download file. HTTP status code: {response.status_code}")


def generate(
    prompt,
    history,
    file_link,
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.0,
):
    # `history` and the sampling parameters are supplied by the Gradio UI but are not
    # forwarded to the model in this demo; only `prompt` and `file_link` are used.
    # Mixtral is served through the Hugging Face Inference API; embeddings run locally.
    mixtral = HuggingFaceInferenceAPI(
        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1"
    )
    service_context = ServiceContext.from_defaults(
        llm=mixtral, embed_model="local:BAAI/bge-small-en-v1.5"
    )

    # Download the linked document, then index every file found in /content.
    download_file(file_link, file_link.split("/")[-1])
    documents = SimpleDirectoryReader("/content").load_data()
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)

    # Text QA Prompt
    chat_text_qa_msgs = [
        ChatMessage(
            role=MessageRole.SYSTEM,
            content=(
                "Always answer the question, even if the context isn't helpful."
            ),
        ),
        ChatMessage(
            role=MessageRole.USER,
            content=(
                "Context information is below.\n"
                "---------------------\n"
                "{context_str}\n"
                "---------------------\n"
                "Given the context information and not prior knowledge, "
                "answer the question: {query_str}\n"
            ),
        ),
    ]
    text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

    # Refine Prompt
    chat_refine_msgs = [
        ChatMessage(
            role=MessageRole.SYSTEM,
            content=(
                "Always answer the question, even if the context isn't helpful."
            ),
        ),
        ChatMessage(
            role=MessageRole.USER,
            content=(
                "We have the opportunity to refine the original answer "
                "(only if needed) with some more context below.\n"
                "------------\n"
                "{context_msg}\n"
                "------------\n"
                "Given the new context, refine the original answer to better "
                "answer the question: {query_str}. "
                "If the context isn't useful, output the original answer again.\n"
                "Original Answer: {existing_answer}"
            ),
        ),
    ]
    refine_template = ChatPromptTemplate(chat_refine_msgs)

    # Retrieve the 6 most similar chunks and answer with the templates above.
    response = index.as_query_engine(
        text_qa_template=text_qa_template,
        refine_template=refine_template,
        similarity_top_k=6,
    ).query(prompt)
    print(str(response))

    # Yield the finished answer character by character so the chat window "streams" it.
    output = ""
    for char in str(response):
        output += char
        yield output


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


additional_inputs = [
    gr.Textbox(
        label="File Link",
        max_lines=1,
        interactive=True,
        value="https://arxiv.org/pdf/2401.10020.pdf",
    ),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=1024,
        minimum=0,
        maximum=2048,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]

examples = [
    ["Explain the paper and describe its novelty", None, None, None, None, None],
    ["Can you write a short story about a time-traveling detective who solves historical mysteries?", None, None, None, None, None],
    ["I'm trying to learn French. Can you provide some common phrases that would be useful for a beginner, along with their pronunciations?", None, None, None, None, None],
    ["I have chicken, rice, and bell peppers in my kitchen. Can you suggest an easy recipe I can make with these ingredients?", None, None, None, None, None],
    ["Can you explain how the QuickSort algorithm works and provide a Python implementation?", None, None, None, None, None],
    ["What are some unique features of Rust that make it stand out compared to other systems programming languages like C++?", None, None, None, None, None],
]

gr.ChatInterface(
    fn=generate,
    chatbot=gr.Chatbot(
        show_label=False,
        show_share_button=False,
        show_copy_button=True,
        likeable=True,
        layout="panel",
    ),
    additional_inputs=additional_inputs,
    title="RAG Demo",
    examples=examples,
    concurrency_limit=20,
).launch(show_api=False, debug=True)