import pandas as pd
import os
import json
import logging
import warnings
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_together import Together
import streamlit as st

# Logging setup
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Suppress warnings
warnings.filterwarnings("ignore")

# Load environment variables
load_dotenv()
TOGETHER_AI_API = os.getenv("TOGETHER_AI")

# Dataset paths
csv_path = "bns_dataset.csv"
chunked_json_path = "chunked_bns_dataset.json"
faiss_db_path = "bns_vector_db"

# Streamlit page config
st.set_page_config(page_title="LawGPT - Bharatiya Nyay Sanhita", layout="centered")
st.title("Law4Her: Bharatiya Nyay Sanhita")
st.markdown("This app provides answers to legal questions based on the *Bharatiya Nyay Sanhita (BNS)*.")
st.image("https://res.cloudinary.com/dzzhbgbnp/image/upload/v1736073326/lawforher_logo1_yznqxr.png",
         use_container_width=True)


# Load CSV and process the dataset into overlapping text chunks
def process_csv_to_chunks(csv_file, output_json, chunk_size=512, overlap=100):
    if not os.path.exists(csv_file):
        st.error(f"CSV file not found: {csv_file}")
        st.stop()

    logger.info(f"Loading CSV file: {csv_file}")
    df = pd.read_csv(csv_file)
    df = df.fillna("")  # Guard against NaN from empty cells; len() below expects strings

    # Ensure required columns are present
    required_columns = {"chapter", "section_title", "section_content"}
    if not required_columns.issubset(df.columns):
        st.error(f"CSV file is missing required columns: {required_columns - set(df.columns)}")
        st.stop()

    logger.info("Creating text chunks...")
    chunks = []
    for _, row in df.iterrows():
        chapter = row.get("chapter", "")
        section_title = row.get("section_title", "")
        section_content = row.get("section_content", "")

        # Split content into overlapping chunks
        for i in range(0, len(section_content), chunk_size - overlap):
            chunk = section_content[i:i + chunk_size]
            chunks.append({
                "chapter": chapter,
                "section_title": section_title,
                "chunk": chunk,
            })

    # Save chunks to JSON
    logger.info(f"Saving chunks to {output_json}...")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(chunks, f, indent=4, ensure_ascii=False)

    logger.info("Chunks saved successfully.")
    return chunks


if not os.path.exists(chunked_json_path):
    logger.info("Processing CSV to JSON...")
    chunks = process_csv_to_chunks(csv_path, chunked_json_path)
else:
    logger.info("Loading pre-processed chunks from JSON...")
    with open(chunked_json_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)

# Create FAISS vectorstore
logger.info("Initializing embeddings and vectorstore...")
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={"trust_remote_code": True, "revision": "289f532e14dbbbd5a04753fa58739e9ba766f3c7"},
)

if not os.path.exists(f"{faiss_db_path}/index.faiss"):
    logger.info("Creating FAISS vectorstore...")
    texts = [
        Document(
            page_content=chunk["chunk"],
            metadata={"chapter": chunk["chapter"], "section_title": chunk["section_title"]},
        )
        for chunk in chunks
    ]
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(faiss_db_path)
else:
    logger.info("Loading existing FAISS vectorstore...")
    db = FAISS.load_local(faiss_db_path, embeddings, allow_dangerous_deserialization=True)

retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Define prompt template
# Note: the template must close with [/INST]; Mistral-Instruct models generate
# the answer after the closing tag, so a trailing [INST] would be malformed.
prompt_template = """[INST] You are a legal chatbot specializing in the Bharatiya Nyay Sanhita (Indian Penal Code replacement). Provide answers **only based on the provided CONTEXT**. If the requested information is not available in the CONTEXT, respond with: "The required information is not available."

CONTEXT: {context}

USER QUERY: {question}

RESPONSE: [/INST]
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Initialize Together API
llm = Together(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.5,
    max_tokens=1024,
    together_api_key=TOGETHER_AI_API,
)


# Chat interface
def generate_response(user_query):
    try:
        # Retrieve relevant documents (invoke is the non-deprecated retriever call)
        retrieved_docs = retriever.invoke(user_query)

        # Log retrieved documents
        logger.info(f"User Query: {user_query}")
        for i, doc in enumerate(retrieved_docs):
            logger.info(
                f"Document {i + 1}: Chapter - {doc.metadata['chapter']}, Section - {doc.metadata['section_title']}")
            logger.info(f"Content: {doc.page_content}")

        # Prepare context for LLM
        context = "\n\n".join(
            f"Chapter: {doc.metadata['chapter']}, Section: {doc.metadata['section_title']}\n{doc.page_content}"
            for doc in retrieved_docs
        )

        # Construct LLM prompt input
        prompt_input = {"context": context, "question": user_query}

        # Generate response using LLM
        logger.debug(f"Payload sent to LLM: {json.dumps(prompt_input, ensure_ascii=False, indent=2)}")
        response = llm.invoke(prompt.format(**prompt_input))
        return response
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return "An error occurred while generating the response."


# Streamlit chat interface
if "messages" not in st.session_state:
    st.session_state.messages = [{"role": "assistant", "content": "Hi! How can I assist you today?"}]

# Display chat messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])

# User input
if user_input := st.chat_input("Type your question here..."):
    # User message
    st.session_state.messages.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.write(user_input)

    # Assistant response
    with st.chat_message("assistant"):
        with st.spinner("Generating response..."):
            response = generate_response(user_input)
            st.write(response)
    st.session_state.messages.append({"role": "assistant", "content": response})
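
# ---------------------------------------------------------------------------
# Running the app: a minimal sketch, not part of the script itself. The file
# name `app.py` and the `.env` layout are assumptions; the package list is the
# set implied by the imports above (sentence-transformers backs
# HuggingFaceEmbeddings, and einops is required by nomic-embed-text-v1):
#
#   pip install streamlit pandas python-dotenv langchain langchain-community \
#       langchain-huggingface langchain-together faiss-cpu sentence-transformers einops
#   echo 'TOGETHER_AI=<your Together AI key>' > .env
#   streamlit run app.py
#
# bns_dataset.csv must sit next to the script with the columns chapter,
# section_title, and section_content; chunked_bns_dataset.json and the FAISS
# index under bns_vector_db/ are built on the first run and reused afterwards.
# ---------------------------------------------------------------------------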