Spaces:

nikhildsst
/

smart_search_projects

Running

App Files Community

nikhildsst committed on Jan 10

Commit

a29819d

verified ·

1 Parent(s): 15f82a0

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -170

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import os
-import subprocess
-import pandas as pd
-import gradio as gr
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service as ChromeService
 from selenium.webdriver.common.by import By
@@ -9,57 +7,11 @@ from selenium.webdriver.chrome.options import Options
 from webdriver_manager.chrome import ChromeDriverManager
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from sentence_transformers import SentenceTransformer
-from langchain.embeddings.base import Embeddings
-from langchain.docstore.document import Document
-from langchain_community.vectorstores import FAISS
-import numpy as np
-import shutil
-# Install Google Chrome and ChromeDriver if not already installed
-os.system('wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb')
-os.system('apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb')
-os.system('CHROME_VERSION=$(google-chrome --version | awk \'{print $3}\' | cut -d \'.\' -f 1)')
-os.system('wget https://chromedriver.storage.googleapis.com/${CHROME_VERSION}.0/chromedriver_linux64.zip')
-os.system('unzip chromedriver_linux64.zip')
-os.system('mv chromedriver /usr/bin/chromedriver && chmod +x /usr/bin/chromedriver')
-def check_chrome_installation():
-    """Check if Chrome is installed and get its version"""
-    try:
-        # First try the default 'google-chrome' command
-        result = subprocess.run(['google-chrome', '--version'],
-                              capture_output=True,
-                              text=True)
-        if result.returncode == 0:
-            return True
-    except FileNotFoundError:
-        # If 'google-chrome' fails, try 'google-chrome-stable'
-        try:
-            result = subprocess.run(['google-chrome-stable', '--version'],
-                                  capture_output=True,
-                                  text=True)
-            if result.returncode == 0:
-                return True
-        except FileNotFoundError:
-            pass
-    return False
-def setup_chrome_options():
-    """Setup Chrome options with all necessary arguments"""
-    chrome_options = Options()
-    chrome_options.add_argument('--headless=new')  # Updated headless argument
-    chrome_options.add_argument('--no-sandbox')
-    chrome_options.add_argument('--disable-dev-shm-usage')
-    chrome_options.add_argument('--disable-gpu')
-    chrome_options.add_argument('--disable-software-rasterizer')
-    chrome_options.add_argument('--disable-extensions')
-    chrome_options.add_argument('--disable-setuid-sandbox')
-    return chrome_options
-def scrape_courses_with_selenium(url, limit=50):
     options = Options()
     options.headless = True  # Headless browsing
     driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
@@ -77,129 +29,85 @@ def scrape_courses_with_selenium(url, limit=50):
     courses = []
     try:
         course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
-        for i, course in enumerate(course_elements):
-            if i >= limit:
-                break
-            title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
-            description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
-            lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
-            price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
-            image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
-            courses.append({
-                "title": title,
-                "description": description,
-                "lessons": lessons,
-                "price": price,
-                "image_url": image_url,
-            })
     except Exception as e:
         print(f"Error scraping courses: {e}")
     finally:
         driver.quit()
     return courses
-class SentenceTransformersEmbeddings(Embeddings):
-    def __init__(self, model_name):
-        self.model = SentenceTransformer(model_name)
-    def embed_documents(self, texts):
-        embeddings = self.model.encode(texts, show_progress_bar=True)
-        return embeddings.tolist()
-    def embed_query(self, text):
-        embedding = self.model.encode([text], show_progress_bar=True)[0]
-        return embedding.tolist()
-def create_search_interface(vector_store):
-    def search_courses(query):
-        if not query.strip():
-            return "Please enter a search query."
-        try:
-            docs = vector_store.similarity_search(query, k=2)
-            results = []
-            for i, doc in enumerate(docs, 1):
-                result = f"\nResult {i}:\n"
-                result += f"Title: {doc.metadata['title']}\n"
-                result += f"Price: {doc.metadata['price']}\n"
-                result += f"Lessons: {doc.metadata['lessons']}\n"
-                result += f"Content: {doc.page_content}\n"
-                results.append(result)
-            return "\n---\n".join(results)
-        except Exception as e:
-            return f"Error during search: {str(e)}"
-    return search_courses
-def main():
     try:
-        # Check Chrome installation
-        if not check_chrome_installation():
-            print("Chrome is not installed. Please install Google Chrome first.")
-            return
-        # Scrape courses
-        url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
-        limit = 5
-        print("Starting course scraping...")
-        courses = scrape_courses_with_selenium(url, limit)
-        if not courses:
-            print("No courses found!")
-            return
-        print("\nScraped Courses:")
-        for course in courses:
-            print(f"Title: {course['title']}")
-            print(f"Price: {course['price']}")
-            print(f"Lessons: {course['lessons']}")
-            print("---")
-        # Create DataFrame
-        df = pd.DataFrame(courses)
-        df["combined_text"] = df["title"] + " " + df["description"]
-        texts = df["combined_text"].tolist()
-        # Initialize embedding model
-        print("\nInitializing embedding model...")
-        embedding_model = SentenceTransformersEmbeddings('all-MiniLM-L6-v2')
-        # Create Documents for FAISS
-        documents = [
-            Document(
-                page_content=text,
-                metadata={"source": f"Course {i+1}", **{k:v for k,v in courses[i].items() if k != 'description'}}
-            )
-            for i, text in enumerate(texts)
-        ]
-        # Create FAISS Vector Store
-        print("Creating vector store...")
-        vector_store = FAISS.from_documents(documents, embedding_model)
-        # Create and launch Gradio interface
-        print("\nLaunching Gradio interface...")
-        search_fn = create_search_interface(vector_store)
-        iface = gr.Interface(
-            fn=search_fn,
-            inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
-            outputs=gr.Textbox(label="Results"),
-            title="Course Search Engine",
-            description="Search for courses based on your query. The system will return the most relevant matches.",
-            examples=[
-                ["python programming course"],
-                ["machine learning basics"],
-                ["data analysis"]
-            ]
-        )
-        iface.launch()
     except Exception as e:
-        print(f"Error in main: {str(e)}")
 if __name__ == "__main__":
-    main()

 import os
+import json
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service as ChromeService
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
+import gradio as gr
+# Function to scrape course data using Selenium
+def scrape_courses_with_selenium(url, output_file="course.json"):
     options = Options()
     options.headless = True  # Headless browsing
     driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
     courses = []
     try:
         course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
+        for course in course_elements:
+            try:
+                title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
+                description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
+                lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
+                price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
+                image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
+                courses.append({
+                    "title": title,
+                    "description": description,
+                    "lessons": lessons,
+                    "price": price,
+                    "image_url": image_url,
+                })
+            except Exception as e:
+                print(f"Error extracting a course: {e}")
     except Exception as e:
         print(f"Error scraping courses: {e}")
     finally:
         driver.quit()
+    # Save scraped data to JSON file
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(courses, f, ensure_ascii=False, indent=4)
     return courses
# Function to search for courses in the JSON file
def search_course_by_title(query, input_file="course.json"):
    """Return an HTML fragment listing courses whose title contains *query*.

    The match is a case-insensitive substring test against each course's
    ``"title"`` field; surrounding whitespace in *query* is ignored and a
    ``None`` query is treated as empty (which matches every course).

    Args:
        query: Text to look for in course titles.
        input_file: Path of the JSON file holding a list of course dicts
            with the keys ``title``, ``description``, ``lessons``,
            ``price`` and ``image_url``.

    Returns:
        A string: one ``<div>`` per matching course joined by ``"\\n---\\n"``,
        ``"No matching courses found."`` when nothing matches, or a message
        starting with ``"Error:"`` when the file is missing or unreadable.
        Errors are returned rather than raised because the result is shown
        directly in the UI.
    """
    # Local import keeps this function self-contained; the file's top-level
    # import block does not bring in ``html``.
    import html

    def esc(value):
        # Course fields come from scraped pages, so they must not be trusted
        # as markup. quote=True also protects the src/alt attribute values.
        return html.escape(str(value), quote=True)

    # Normalise the query once instead of once per course.
    needle = (query or "").strip().lower()

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)

        # Perform case-insensitive search
        matches = [course for course in courses if needle in course["title"].lower()]
        if not matches:
            return "No matching courses found."

        cards = []
        for course in matches:
            title = esc(course["title"])
            image_url = esc(course["image_url"])
            lessons = esc(course["lessons"])
            price = esc(course["price"])
            description = esc(course["description"])
            cards.append(
                "<div style=\"margin-bottom: 20px;\">\n"
                f"<img src=\"{image_url}\" alt=\"{title}\" style=\"width:300px;height:auto;\">\n"
                f"<p><strong>Title:</strong> {title}</p>\n"
                f"<p><strong>Lessons:</strong> {lessons}</p>\n"
                f"<p><strong>Price:</strong> {price}</p>\n"
                f"<p><strong>Description:</strong> {description}</p>\n"
                "</div>\n"
            )
        return "\n---\n".join(cards)
    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        # UI boundary: surface malformed JSON / missing keys as a message
        # instead of crashing the request handler.
        return f"Error: {e}"
# Entry point: scrape the course list once, then serve the search UI
if __name__ == "__main__":

    def gradio_search(query):
        """Adapter handed to Gradio: search the default course.json by title."""
        return search_course_by_title(query)

    # Scrape the page below; the scraper writes its results to course.json
    # (its default output file), which the search function reads back.
    catalogue_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
    print("Scraping courses...")
    scrape_courses_with_selenium(catalogue_url)
    print("Scraping completed. Data saved to course.json.")

    # Build the single-textbox search form and start serving it.
    query_box = gr.Textbox(label="Search Courses", placeholder="Enter course title...")
    search_ui = gr.Interface(
        fn=gradio_search,
        inputs=query_box,
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title from the scraped data stored in course.json.",
    )
    search_ui.launch()