Update app.py
Browse files
app.py
CHANGED
@@ -14,47 +14,150 @@ from langchain.docstore.document import Document
|
|
14 |
from langchain_community.vectorstores import FAISS
|
15 |
import gradio as gr
|
16 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
# Function to scrape course data using Selenium
|
19 |
-
def scrape_courses_with_selenium(url, limit=50):
|
20 |
-
options = Options()
|
21 |
-
options.headless = True # Headless browsing
|
22 |
-
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
|
23 |
-
driver.get(url)
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
try:
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
)
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
return
|
33 |
|
34 |
-
|
|
|
35 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
|
|
|
|
|
37 |
for i, course in enumerate(course_elements):
|
38 |
if i >= limit:
|
39 |
break
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
except Exception as e:
|
54 |
-
print(f"Error scraping
|
|
|
|
|
55 |
finally:
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
58 |
return courses
|
59 |
|
60 |
class SentenceTransformersEmbeddings(Embeddings):
|
@@ -62,74 +165,101 @@ class SentenceTransformersEmbeddings(Embeddings):
|
|
62 |
self.model = SentenceTransformer(model_name)
|
63 |
|
64 |
def embed_documents(self, texts):
|
65 |
-
# Generates embeddings for a list of documents
|
66 |
embeddings = self.model.encode(texts, show_progress_bar=True)
|
67 |
-
return embeddings.tolist()
|
68 |
|
69 |
def embed_query(self, text):
|
70 |
-
# Generates embedding for a single query
|
71 |
embedding = self.model.encode([text], show_progress_bar=True)[0]
|
72 |
-
return embedding.tolist()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
def main():
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
87 |
|
88 |
-
|
89 |
-
|
|
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
|
96 |
-
|
|
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
)
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
# Define search function
|
111 |
-
def smart_search(query):
|
112 |
-
docs = vector_store.similarity_search(query, k=2)
|
113 |
-
results = []
|
114 |
-
for doc in docs:
|
115 |
-
result = f"\nTitle: {doc.metadata['title']}\n"
|
116 |
-
result += f"Price: {doc.metadata['price']}\n"
|
117 |
-
result += f"Lessons: {doc.metadata['lessons']}\n"
|
118 |
-
result += f"Content: {doc.page_content}\n"
|
119 |
-
results.append(result)
|
120 |
-
return "\n---\n".join(results)
|
121 |
-
|
122 |
-
# Create Gradio interface
|
123 |
-
iface = gr.Interface(
|
124 |
-
fn=smart_search,
|
125 |
-
inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
|
126 |
-
outputs=gr.Textbox(label="Results"),
|
127 |
-
title="Course Search Engine",
|
128 |
-
description="Search for courses based on your query. The system will return the most relevant matches.",
|
129 |
-
)
|
130 |
-
|
131 |
-
# Launch the interface
|
132 |
-
iface.launch()
|
133 |
|
134 |
if __name__ == "__main__":
|
135 |
main()
|
|
|
14 |
from langchain_community.vectorstores import FAISS
|
15 |
import gradio as gr
|
16 |
import numpy as np
|
17 |
+
import subprocess
|
18 |
+
import shutil
|
19 |
+
|
20 |
+
import os
|
21 |
+
|
# Install Google Chrome and ChromeDriver if not already installed.
# NOTE: all steps run in ONE shell invocation so that CHROME_VERSION,
# captured in the middle step, is still in scope when the chromedriver
# download URL is expanded. (Separate os.system() calls each spawn a
# fresh shell, so a variable set in one call is invisible to the next —
# the previous version downloaded from ".../.0/chromedriver_linux64.zip".)
os.system(
    'wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && '
    'apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb && '
    'CHROME_VERSION=$(google-chrome --version | awk \'{print $3}\' | cut -d \'.\' -f 1) && '
    'wget https://chromedriver.storage.googleapis.com/${CHROME_VERSION}.0/chromedriver_linux64.zip && '
    'unzip chromedriver_linux64.zip && '
    'mv chromedriver /usr/bin/chromedriver && chmod +x /usr/bin/chromedriver'
)
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_and_install_chrome():
    """Ensure Google Chrome is present, installing it via apt when missing."""
    # Skip the download entirely when a working Chrome binary already exists.
    if check_chrome_installation():
        print("Google Chrome is already installed.")
    else:
        print("Installing Google Chrome...")
        for command in (
            'wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb',
            'apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb',
        ):
            os.system(command)
+
|
def check_chrome_version():
    """Return the installed Chrome version string, or None when unavailable.

    Runs ``google-chrome --version`` and returns its stripped stdout on
    success. Returns None when the binary is missing OR when it exists but
    exits non-zero (the latter was previously an implicit fall-through).
    """
    try:
        result = subprocess.run(['google-chrome', '--version'],
                                capture_output=True, text=True)
        if result.returncode == 0:
            print("Chrome version:", result.stdout)
            return result.stdout.strip()
        # Binary exists but the version query failed: report "unknown".
        return None
    except FileNotFoundError:
        print("Google Chrome is not installed.")
        return None
|
52 |
+
from selenium.webdriver.common.by import By
|
53 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
54 |
+
from selenium.webdriver.support import expected_conditions as EC
|
55 |
+
|
def wait_for_element_by_class(driver, class_name, timeout=60):
    """Wait until at least one element with *class_name* is present.

    Returns the list of matching WebElements, or None if the wait times out.
    """
    # Fixes a latent NameError: TimeoutException was caught below without
    # ever being imported, so every timeout crashed instead of returning None.
    from selenium.common.exceptions import TimeoutException

    try:
        # presence_of_all_elements_located yields a LIST of elements.
        elements = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, class_name))
        )
        return elements
    except TimeoutException:
        print(f"Timed out waiting for element with class: {class_name}")
        return None
65 |
|
def check_chrome_installation():
    """Check if Chrome is installed; return True when a launcher responds.

    Tries 'google-chrome' first, then 'google-chrome-stable', since different
    installs expose different launcher names. Unlike the previous version,
    the fallback is also tried when the first binary exists but exits
    non-zero, not only when it is absent.
    """
    for binary in ('google-chrome', 'google-chrome-stable'):
        try:
            result = subprocess.run([binary, '--version'],
                                    capture_output=True,
                                    text=True)
            if result.returncode == 0:
                return True
        except FileNotFoundError:
            continue  # launcher name not on PATH — try the next candidate
    return False
86 |
+
|
def setup_chrome_options():
    """Setup Chrome options with all arguments needed to run headless in a container."""
    flags = (
        '--headless=new',                # current headless mode flag
        '--no-sandbox',
        '--disable-dev-shm-usage',       # /dev/shm is tiny inside containers
        '--disable-gpu',
        '--disable-software-rasterizer',
        '--disable-extensions',
        '--disable-setuid-sandbox',
    )
    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)
    # Point selenium at the apt-installed Chrome binary explicitly.
    chrome_options.binary_location = "/usr/bin/google-chrome"
    return chrome_options
99 |
+
|
def scrape_courses_with_selenium(url, limit=10):
    """Scrape up to *limit* course cards from *url*.

    Returns a list of dicts with keys: title, description, lessons, price.
    Raises RuntimeError when Chrome is not installed; re-raises any failure
    that happens before/while locating the course cards.
    """
    if not check_chrome_installation():
        raise RuntimeError("Google Chrome is not installed. Please install it first.")

    chrome_options = setup_chrome_options()
    # Initialized up front so the finally block is safe even when Chrome
    # fails to start (previously `driver` could be unbound there, hidden by
    # a bare `except: pass` that also swallowed KeyboardInterrupt).
    driver = None
    courses = []

    try:
        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        print("Successfully initialized Chrome driver")
        driver.get(url)
        print(f"Successfully navigated to {url}")

        # Wait for course cards to load before querying them.
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
        print(f"Found {len(course_elements)} course elements")

        for i, course in enumerate(course_elements):
            if i >= limit:
                break
            try:
                # Selectors based on the site's course-card markup.
                title_elem = course.find_element(By.CSS_SELECTOR, ".course-card__title")
                desc_elem = course.find_element(By.CSS_SELECTOR, ".course-card__description")
                lessons_elem = course.find_element(By.CSS_SELECTOR, ".course-card__lesson-count")
                price_elem = course.find_element(By.CSS_SELECTOR, ".course-card__price")

                title = title_elem.text if title_elem else "No Title"
                description = desc_elem.text if desc_elem else "No Description"
                lessons = lessons_elem.text if lessons_elem else "No Lessons"
                price = price_elem.text if price_elem else "No Price"

                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price
                })
                print(f"Successfully scraped course {i+1}")

            except Exception as e:
                # One malformed card should not abort the whole scrape.
                print(f"Error scraping course {i+1}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        raise

    finally:
        # Quit only if the driver actually started.
        if driver is not None:
            driver.quit()

    return courses
162 |
|
163 |
class SentenceTransformersEmbeddings(Embeddings):
|
|
|
165 |
self.model = SentenceTransformer(model_name)
|
166 |
|
167 |
def embed_documents(self, texts):
|
|
|
168 |
embeddings = self.model.encode(texts, show_progress_bar=True)
|
169 |
+
return embeddings.tolist()
|
170 |
|
171 |
def embed_query(self, text):
|
|
|
172 |
embedding = self.model.encode([text], show_progress_bar=True)[0]
|
173 |
+
return embedding.tolist()
|
174 |
+
|
175 |
+
def create_search_interface(vector_store):
|
176 |
+
def search_courses(query):
|
177 |
+
if not query.strip():
|
178 |
+
return "Please enter a search query."
|
179 |
+
|
180 |
+
try:
|
181 |
+
docs = vector_store.similarity_search(query, k=2)
|
182 |
+
results = []
|
183 |
+
for i, doc in enumerate(docs, 1):
|
184 |
+
result = f"\nResult {i}:\n"
|
185 |
+
result += f"Title: {doc.metadata['title']}\n"
|
186 |
+
result += f"Price: {doc.metadata['price']}\n"
|
187 |
+
result += f"Lessons: {doc.metadata['lessons']}\n"
|
188 |
+
result += f"Content: {doc.page_content}\n"
|
189 |
+
results.append(result)
|
190 |
+
return "\n---\n".join(results)
|
191 |
+
except Exception as e:
|
192 |
+
return f"Error during search: {str(e)}"
|
193 |
+
|
194 |
+
return search_courses
|
195 |
|
196 |
def main():
|
197 |
+
try:
|
198 |
+
# Check Chrome installation
|
199 |
+
if not check_chrome_installation():
|
200 |
+
print("Chrome is not installed. Please install Google Chrome first.")
|
201 |
+
return
|
202 |
+
|
203 |
+
# Scrape courses
|
204 |
+
url = "https://courses.analyticsvidhya.com/collections/courses"
|
205 |
+
limit = 5
|
206 |
+
print("Starting course scraping...")
|
207 |
+
courses = scrape_courses_with_selenium(url, limit)
|
208 |
+
|
209 |
+
if not courses:
|
210 |
+
print("No courses found!")
|
211 |
+
return
|
212 |
|
213 |
+
print("\nScraped Courses:")
|
214 |
+
for course in courses:
|
215 |
+
print(f"Title: {course['title']}")
|
216 |
+
print(f"Price: {course['price']}")
|
217 |
+
print(f"Lessons: {course['lessons']}")
|
218 |
+
print("---")
|
219 |
|
220 |
+
# Create DataFrame
|
221 |
+
df = pd.DataFrame(courses)
|
222 |
+
df["combined_text"] = df["title"] + " " + df["description"]
|
223 |
+
texts = df["combined_text"].tolist()
|
224 |
|
225 |
+
# Initialize embedding model
|
226 |
+
print("\nInitializing embedding model...")
|
227 |
+
embedding_model = SentenceTransformersEmbeddings('all-MiniLM-L6-v2')
|
228 |
|
229 |
+
# Create Documents for FAISS
|
230 |
+
documents = [
|
231 |
+
Document(
|
232 |
+
page_content=text,
|
233 |
+
metadata={"source": f"Course {i+1}", **{k:v for k,v in courses[i].items() if k != 'description'}}
|
234 |
+
)
|
235 |
+
for i, text in enumerate(texts)
|
236 |
+
]
|
237 |
|
238 |
+
# Create FAISS Vector Store
|
239 |
+
print("Creating vector store...")
|
240 |
+
vector_store = FAISS.from_documents(documents, embedding_model)
|
241 |
|
242 |
+
# Create and launch Gradio interface
|
243 |
+
print("\nLaunching Gradio interface...")
|
244 |
+
search_fn = create_search_interface(vector_store)
|
245 |
+
|
246 |
+
iface = gr.Interface(
|
247 |
+
fn=search_fn,
|
248 |
+
inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
|
249 |
+
outputs=gr.Textbox(label="Results"),
|
250 |
+
title="Course Search Engine",
|
251 |
+
description="Search for courses based on your query. The system will return the most relevant matches.",
|
252 |
+
examples=[
|
253 |
+
["python programming course"],
|
254 |
+
["machine learning basics"],
|
255 |
+
["data analysis"]
|
256 |
+
]
|
257 |
)
|
258 |
+
|
259 |
+
iface.launch()
|
260 |
+
|
261 |
+
except Exception as e:
|
262 |
+
print(f"Error in main: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
# Script entry point: run the full scrape-index-serve pipeline.
if __name__ == "__main__":
    main()
|