Spaces:

nikhildsst
/

smart_search_projects

Running

App Files Files Community

nikhildsst committed on Jan 8

Commit

2a1f334

verified ·

1 Parent(s): 339a135

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -64

app.py CHANGED Viewed

@@ -1,52 +1,106 @@
 from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import OpenAIEmbeddings
-from langchain.prompts import PromptTemplate
-from langchain.chains import RetrievalQAWithSourcesChain
-from langchain_openai import OpenAIEmbeddings, OpenAI
-# Gradio imports
-import gradio as gr
 from langchain.docstore.document import Document
-import pandas as pd
-import os
-# Setting OpenAI API key
-os.environ["OPENAI_API_KEY"] = "<REDACTED>"
-# Step 1: Load Course Data (as an example dataset)
-course_data = [
-    {
-        "title": "Introduction to Data Science",
-        "description": "Learn the basics of data science including Python, statistics, and visualization.",
-        "curriculum": "Python basics, statistics, visualization, case studies"
-    },
-    {
-        "title": "Machine Learning Basics",
-        "description": "Understand the fundamentals of machine learning algorithms and their applications.",
-        "curriculum": "Supervised learning, unsupervised learning, regression, classification"
-    },
-    {
-        "title": "Deep Learning Essentials",
-        "description": "Dive into deep learning concepts including neural networks and TensorFlow.",
-        "curriculum": "Neural networks, TensorFlow basics, image classification"
-    }
-]
-# Convert the course data into a DataFrame
-df = pd.DataFrame(course_data)
-# Combine title, description, and curriculum into a single searchable text column
-df["combined_text"] = df["title"] + " " + df["description"] + " " + df["curriculum"]
-# Step 2: Generate Embeddings for the Data
-embedding_model = OpenAIEmbeddings()
-# Generate embeddings for the combined text
-course_embeddings = embedding_model.embed_documents(df["combined_text"].tolist())
-# Step 3: Store the Embeddings in a Vector Database (FAISS)
 documents = [
     Document(
         page_content=text,
@@ -57,26 +111,20 @@ documents = [
 vector_store = FAISS.from_documents(documents, embedding_model)
-# Step 4: Build the Smart Search System
-prompt_template = PromptTemplate(
-    input_variables=["context", "question"],
-    template="Use the following context to answer the question.\nContext: {context}\nQuestion: {question}\nAnswer:"
-)
 retriever = vector_store.as_retriever()
-qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
-    llm=OpenAI(temperature=0),
-    chain_type="stuff",
-    retriever=retriever,
-    return_source_documents=True
-)
-# Step 5: Gradio Interface
 def smart_search(query):
-    result = qa_chain({"question": query})
-    return result['answer']
-# Creating a Gradio interface
 iface = gr.Interface(
     fn=smart_search,
     inputs=gr.Textbox(label="Ask a Question", placeholder="Enter your question here..."),
@@ -86,4 +134,3 @@ iface = gr.Interface(
 if __name__ == "__main__":
     iface.launch()

+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+from sentence_transformers import SentenceTransformer
 from langchain_community.vectorstores import FAISS
 from langchain.docstore.document import Document
+import gradio as gr
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+import time
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+# Step 1: Scrape Course Data
+def scrape_courses(url):
+    """Scrape course titles and descriptions from a static HTML page.
+
+    Fetches ``url`` with ``requests``, parses the response with BeautifulSoup
+    and reads every ``div.course-block`` element found in it.
+
+    Args:
+        url: Address of the page to fetch.
+
+    Returns:
+        A list of ``{"title": ..., "description": ...}`` dicts, one per course
+        block. A missing title or description is replaced by "No Title" /
+        "No Description". The list is empty when no course block is found.
+    """
+    # Without a timeout, requests waits indefinitely on an unresponsive server.
+    response = requests.get(url, timeout=30)
+    soup = BeautifulSoup(response.content, "html.parser")
+    courses = []
+    for course in soup.find_all("div", class_="course-block"):
+        # Look each element up once instead of once for the test and once for the value.
+        title_tag = course.find("div", class_="course-title")
+        description_tag = course.find("div", class_="course-description")
+        title = title_tag.get_text(strip=True) if title_tag is not None else "No Title"
+        description = description_tag.get_text(strip=True) if description_tag is not None else "No Description"
+        courses.append({"title": title, "description": description})
+    if not courses:
+        print("No data found! Please check the website structure or the scraping logic.")
+    return courses
+def scrape_courses_with_selenium(url):
+    """Scrape course titles and descriptions from a JavaScript-rendered page.
+
+    Opens ``url`` in headless Chrome, waits up to 20 seconds for elements with
+    class ``course-block`` to appear, then reads the ``course-title`` and
+    ``course-description`` element inside each block.
+
+    Args:
+        url: Address of the page to open.
+
+    Returns:
+        A list of ``{"title": ..., "description": ...}`` dicts. A missing
+        title or description is replaced by "No Title" / "No Description".
+        The list is empty when the wait fails; an error raised while reading
+        the blocks is printed and the courses collected so far are returned.
+    """
+    options = Options()
+    # The `headless` attribute setter is deprecated in Selenium 4 and has no
+    # effect on recent releases; the command-line switch works everywhere.
+    options.add_argument("--headless")
+    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
+    courses = []
+    try:
+        driver.get(url)
+        # Wait for course-block elements to be present
+        try:
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_all_elements_located((By.CLASS_NAME, "course-block"))
+            )
+        except Exception as e:
+            print(f"Error: {e}")
+            return []
+        try:
+            # Iterate the course containers (the same class the wait above
+            # and scrape_courses use), not the title elements themselves.
+            course_elements = driver.find_elements(By.CLASS_NAME, "course-block")
+            print(f"Found {len(course_elements)} courses")  # Debugging line
+            for course in course_elements:
+                # find_elements returns an empty list when nothing matches,
+                # whereas find_element raises, so the fallbacks stay reachable.
+                title_elements = course.find_elements(By.CLASS_NAME, "course-title")
+                description_elements = course.find_elements(By.CLASS_NAME, "course-description")
+                title = title_elements[0].text if title_elements else "No Title"
+                description = description_elements[0].text if description_elements else "No Description"
+                courses.append({"title": title, "description": description})
+        except Exception as e:
+            print(f"Error scraping courses: {e}")
+    finally:
+        # Always release the browser, including when driver.get() raises.
+        driver.quit()
+    return courses
+# Example usage: scrape the course catalogue once, at import time.
+url = "https://courses.analyticsvidhya.com/pages/all-free-courses"  # Replace with the actual URL
+courses = scrape_courses_with_selenium(url)
+# Print or process the data as needed
+if courses:
+    for course in courses:
+        print(f"Title: {course['title']}, Description: {course['description']}")
+else:
+    print("No courses found!")
+# Step 2: Convert Data to DataFrame
+df = pd.DataFrame(courses)
+# Check if DataFrame is empty
+if df.empty:
+    print("DataFrame is empty. No valid data was scraped.")
+    # `exit` is an interactive helper injected by the `site` module and is not
+    # guaranteed to exist; raising SystemExit ends the script the same way
+    # (exit status 0) without depending on it.
+    raise SystemExit
+# Combine title and description for embeddings
+df["combined_text"] = df["title"] + " " + df["description"]
+# Step 3: Generate Embeddings Using SentenceTransformers
+# NOTE(review): this SentenceTransformer instance is later passed to
+# FAISS.from_documents, which presumably expects a LangChain Embeddings
+# object (embed_documents / embed_query) - confirm it is accepted.
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+course_embeddings = embedding_model.encode(df["combined_text"].tolist(), show_progress_bar=True)
+# Step 4: Store Embeddings in FAISS Vector Store
 documents = [
     Document(
         page_content=text,
 vector_store = FAISS.from_documents(documents, embedding_model)
+# Step 5: Build the Smart Search System
 retriever = vector_store.as_retriever()
+# Mock QA Chain
+def mock_qa_chain(question):
+    """Return a placeholder answer assembled from the retrieved documents.
+
+    The documents the module-level ``retriever`` returns for ``question`` are
+    joined into a context string, followed by a comma-separated list of each
+    document's ``source`` metadata entry.
+    """
+    matches = retriever.get_relevant_documents(question)
+    context = "\n".join(match.page_content for match in matches)
+    sources = ", ".join(match.metadata["source"] for match in matches)
+    return f"Mock Answer based on context:\n{context}\n\nSources: {sources}"
+# Step 6: Gradio Interface Function
 def smart_search(query):
+    """Gradio callback: answer ``query`` by delegating to ``mock_qa_chain``."""
+    answer = mock_qa_chain(query)
+    return answer
+# Step 7: Deploying with Gradio
 iface = gr.Interface(
     fn=smart_search,
     inputs=gr.Textbox(label="Ask a Question", placeholder="Enter your question here..."),
 if __name__ == "__main__":
     iface.launch()