Update app.py
app.py CHANGED
@@ -4,31 +4,52 @@ from selenium import webdriver
 from selenium.webdriver.chrome.service import Service as ChromeService
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.options import Options
-from webdriver_manager.chrome import ChromeDriverManager
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 import gradio as gr
-
+import subprocess
+
+# Install required system dependencies
+def setup_chrome_dependencies():
+    os.system('apt-get update')
+    os.system('apt-get install -y wget gnupg2')
+    os.system('wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -')
+    os.system('echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/sources.list.d/google-chrome.list')
+    os.system('apt-get update')
+    os.system('apt-get install -y google-chrome-stable')
+    os.system('apt-get install -y xvfb')
+
+# Function to get Chrome options for Hugging Face environment
+def get_chrome_options():
+    chrome_options = Options()
+    chrome_options.add_argument('--headless')
+    chrome_options.add_argument('--no-sandbox')
+    chrome_options.add_argument('--disable-dev-shm-usage')
+    chrome_options.add_argument('--disable-gpu')
+    chrome_options.binary_location = '/usr/bin/google-chrome'
+    return chrome_options
 
 # Function to scrape course data using Selenium
 def scrape_courses_with_selenium(url, output_file="course.json"):
-    options = Options()
-    options.headless = True # Headless browsing
-    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
-    driver.get(url)
-
     try:
+        # Setup Chrome dependencies if not already installed
+        setup_chrome_dependencies()
+
+        # Configure Chrome options
+        options = get_chrome_options()
+
+        # Start Chrome driver
+        driver = webdriver.Chrome(options=options)
+        driver.get(url)
+
+        # Wait for course cards to load
         WebDriverWait(driver, 60).until(
             EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
         )
-    except Exception as e:
-        print(f"Error: {e}")
-        driver.quit()
-        return []
 
-
-    try:
+        courses = []
         course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
+
         for course in course_elements:
             try:
                 title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
@@ -46,17 +67,20 @@ def scrape_courses_with_selenium(url, output_file="course.json"):
                 })
             except Exception as e:
                 print(f"Error extracting a course: {e}")
+                continue
 
-
-
-
-    driver.quit()
+        # Save scraped data to JSON file
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(courses, f, ensure_ascii=False, indent=4)
 
-
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(courses, f, ensure_ascii=False, indent=4)
+        return courses
 
-
+    except Exception as e:
+        print(f"Error: {e}")
+        return []
+    finally:
+        if 'driver' in locals():
+            driver.quit()
 
 # Function to search for courses in the JSON file
 def search_course_by_title(query, input_file="course.json"):
@@ -64,41 +88,32 @@ def search_course_by_title(query, input_file="course.json"):
         with open(input_file, "r", encoding="utf-8") as f:
             courses = json.load(f)
 
-        # Perform case-insensitive search
         results = [course for course in courses if query.lower() in course["title"].lower()]
         if not results:
            return "No matching courses found."
 
        output = []
        for course in results:
-            result = f"
-
-
-
-
-
-
+            result = f"""
+            <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
+                <img src="{course['image_url']}" alt="{course['title']}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
+                <h3 style="color: #2c3e50; margin: 10px 0;">{course['title']}</h3>
+                <p><strong>Lessons:</strong> {course['lessons']}</p>
+                <p><strong>Price:</strong> {course['price']}</p>
+                <p style="color: #666;">{course['description']}</p>
+            </div>
+            """
            output.append(result)
 
-    return "\n
+        return "\n".join(output)
 
    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        return f"Error: {e}"
 
-
-
-if __name__ == "__main__":
-    # URL for scraping
-    url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
-
-    # Scrape and save data to JSON
-    print("Scraping courses...")
-    scrape_courses_with_selenium(url)
-    print("Scraping completed. Data saved to course.json.")
-
-    # Define Gradio interface for searching
+# Gradio interface
+def create_interface():
     def gradio_search(query):
         return search_course_by_title(query)
 
@@ -108,6 +123,19 @@ if __name__ == "__main__":
         outputs="html",
         title="Course Search Engine",
         description="Search for courses by title from the scraped data stored in course.json.",
+        css="div.gradio-container {max-width: 800px; margin: auto;}"
     )
+    return iface
 
-
+# Main execution
+if __name__ == "__main__":
+    # URL for scraping
+    url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
+
+    print("Starting course scraping...")
+    scrape_courses_with_selenium(url)
+    print("Scraping completed. Data saved to course.json")
+
+    # Launch Gradio interface
+    iface = create_interface()
+    iface.launch(server_name="0.0.0.0", server_port=7860)
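For reference, the new setup_chrome_dependencies() helper installs Chrome at container start-up by shelling out through os.system, so it relies on os being imported near the top of app.py (outside the hunks shown) and on the process having permission to run apt-get; it also appends the repository entry to /etc/sources.list.d/, while apt conventionally reads extra source lists from /etc/apt/sources.list.d/. Below is a minimal sketch of the same setup under those assumptions, using the subprocess module this commit imports and the conventional path; the name setup_chrome_dependencies_sketch is illustrative and not part of the commit.

import subprocess

# Illustrative variant of the commit's setup_chrome_dependencies():
# /etc/apt/sources.list.d/ is the conventional apt location for extra source
# lists, and subprocess.run(..., check=True) raises if a command fails
# instead of silently continuing like os.system.
def setup_chrome_dependencies_sketch():
    commands = [
        "apt-get update",
        "apt-get install -y wget gnupg2",
        "wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -",
        'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main"'
        " >> /etc/apt/sources.list.d/google-chrome.list",
        "apt-get update",
        "apt-get install -y google-chrome-stable xvfb",
    ]
    for cmd in commands:
        # shell=True is required for the pipe and the >> redirection above
        subprocess.run(cmd, shell=True, check=True)

As in the commit, the browser is then launched with the --headless, --no-sandbox, and --disable-dev-shm-usage flags from get_chrome_options(), the usual switches for running Chrome inside a container.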