Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,8 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|
8 |
from selenium.webdriver.support import expected_conditions as EC
|
9 |
import gradio as gr
|
10 |
import subprocess
|
|
|
|
|
11 |
|
12 |
# Install required system dependencies
|
13 |
def setup_chrome_dependencies():
|
@@ -30,57 +32,47 @@ def get_chrome_options():
|
|
30 |
return chrome_options
|
31 |
|
32 |
# Function to scrape course data using Selenium
|
33 |
-
def
|
34 |
try:
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
# Save scraped data to JSON file
|
73 |
-
with open(output_file, "w", encoding="utf-8") as f:
|
74 |
-
json.dump(courses, f, ensure_ascii=False, indent=4)
|
75 |
-
|
76 |
-
return courses
|
77 |
-
|
78 |
except Exception as e:
|
79 |
-
print(f"Error: {e}")
|
80 |
return []
|
81 |
-
finally:
|
82 |
-
if 'driver' in locals():
|
83 |
-
driver.quit()
|
84 |
|
85 |
# Function to search for courses in the JSON file
|
86 |
def search_course_by_title(query, input_file="course.json"):
|
@@ -96,7 +88,8 @@ def search_course_by_title(query, input_file="course.json"):
|
|
96 |
for course in results:
|
97 |
result = f"""
|
98 |
<div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
|
99 |
-
<img src="{course['image_url']}" alt="{course['title']}"
|
|
|
100 |
<h3 style="color: #2c3e50; margin: 10px 0;">{course['title']}</h3>
|
101 |
<p><strong>Lessons:</strong> {course['lessons']}</p>
|
102 |
<p><strong>Price:</strong> {course['price']}</p>
|
@@ -122,7 +115,7 @@ def create_interface():
|
|
122 |
inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
|
123 |
outputs="html",
|
124 |
title="Course Search Engine",
|
125 |
-
description="Search for courses by title",
|
126 |
css="div.gradio-container {max-width: 800px; margin: auto;}"
|
127 |
)
|
128 |
return iface
|
@@ -133,7 +126,10 @@ if __name__ == "__main__":
|
|
133 |
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
|
134 |
|
135 |
print("Starting course scraping...")
|
136 |
-
|
|
|
|
|
|
|
137 |
print("Scraping completed. Data saved to course.json")
|
138 |
|
139 |
# Launch Gradio interface
|
|
|
8 |
from selenium.webdriver.support import expected_conditions as EC
|
9 |
import gradio as gr
|
10 |
import subprocess
|
11 |
+
import asyncio
|
12 |
+
from playwright.async_api import async_playwright
|
13 |
|
14 |
# Install required system dependencies
|
15 |
def setup_chrome_dependencies():
|
|
|
32 |
return chrome_options
|
33 |
|
34 |
# Function to scrape course data using Selenium
|
35 |
+
# Function to scrape course data using Playwright
async def scrape_courses_with_playwright(url, output_file="course.json"):
    """Scrape free-course cards from *url* and persist them to *output_file*.

    Parameters
    ----------
    url : str
        Page listing the course cards (elements matching ``.course-card``).
    output_file : str
        Path of the JSON file the scraped data is written to.

    Returns
    -------
    list[dict]
        One dict per course (title, description, lessons, price, image_url),
        or an empty list if scraping fails for any reason.
    """
    try:
        async with async_playwright() as p:
            # Chromium's sandbox cannot run inside the containerized
            # (Hugging Face) environment, so it is disabled.
            browser = await p.chromium.launch(chromium_sandbox=False)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until="networkidle")

                # Wait until the course cards have actually rendered
                # (the listing is populated client-side).
                await page.wait_for_selector(".course-card", timeout=60000)

                # Extract course info inside the page context.
                # NOTE(review): title and description read the same selector
                # (.course-card__body) — confirm that is intentional.
                courses = await page.evaluate("""
                    () => {
                        const courseElements = document.querySelectorAll('.course-card');
                        return Array.from(courseElements).map(course => {
                            return {
                                title: course.querySelector('.course-card__body')?.textContent?.trim() || 'No Title',
                                description: course.querySelector('.course-card__body')?.textContent?.trim() || 'No Description',
                                lessons: course.querySelector('.course-card__lesson-count')?.textContent?.trim() || 'No Lessons',
                                price: course.querySelector('.course-card__price')?.textContent?.trim() || 'No Price',
                                image_url: course.querySelector('img')?.src || 'No Image'
                            };
                        });
                    }
                """)

                # Save scraped data to JSON file
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(courses, f, ensure_ascii=False, indent=4)

                return courses
            finally:
                # Bug fix: the previous version closed the browser only on the
                # success path, leaking the Chromium process whenever goto /
                # wait_for_selector / the file write raised.  This mirrors the
                # old Selenium version's ``finally: driver.quit()``.
                await browser.close()
    except Exception as e:
        print(f"Error during scraping: {e}")
        return []
|
|
|
|
|
|
|
76 |
|
77 |
# Function to search for courses in the JSON file
|
78 |
def search_course_by_title(query, input_file="course.json"):
|
|
|
88 |
for course in results:
|
89 |
result = f"""
|
90 |
<div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
|
91 |
+
<img src="{course['image_url']}" alt="{course['title']}"
|
92 |
+
style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
|
93 |
<h3 style="color: #2c3e50; margin: 10px 0;">{course['title']}</h3>
|
94 |
<p><strong>Lessons:</strong> {course['lessons']}</p>
|
95 |
<p><strong>Price:</strong> {course['price']}</p>
|
|
|
115 |
inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
|
116 |
outputs="html",
|
117 |
title="Course Search Engine",
|
118 |
+
description="Search for courses by title.",
|
119 |
css="div.gradio-container {max-width: 800px; margin: auto;}"
|
120 |
)
|
121 |
return iface
|
|
|
126 |
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
|
127 |
|
128 |
print("Starting course scraping...")
|
129 |
+
|
130 |
+
# Run the async scraping function
|
131 |
+
asyncio.run(scrape_courses_with_playwright(url))
|
132 |
+
|
133 |
print("Scraping completed. Data saved to course.json")
|
134 |
|
135 |
# Launch Gradio interface
|