Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,8 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|
8 |
from selenium.webdriver.support import expected_conditions as EC
|
9 |
import gradio as gr
|
10 |
import subprocess
|
|
|
|
|
11 |
|
12 |
# Install required system dependencies
|
13 |
def setup_chrome_dependencies():
|
@@ -30,57 +32,47 @@ def get_chrome_options():
|
|
30 |
return chrome_options
|
31 |
|
32 |
# Function to scrape course data using Selenium
|
33 |
-
def
|
34 |
try:
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
# Save scraped data to JSON file
|
73 |
-
with open(output_file, "w", encoding="utf-8") as f:
|
74 |
-
json.dump(courses, f, ensure_ascii=False, indent=4)
|
75 |
-
|
76 |
-
return courses
|
77 |
-
|
78 |
except Exception as e:
|
79 |
-
print(f"Error: {e}")
|
80 |
return []
|
81 |
-
finally:
|
82 |
-
if 'driver' in locals():
|
83 |
-
driver.quit()
|
84 |
|
85 |
# Function to search for courses in the JSON file
|
86 |
def search_course_by_title(query, input_file="course.json"):
|
@@ -96,7 +88,8 @@ def search_course_by_title(query, input_file="course.json"):
|
|
96 |
for course in results:
|
97 |
result = f"""
|
98 |
<div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
|
99 |
-
<img src="{course['image_url']}" alt="{course['title']}"
|
|
|
100 |
<h3 style="color: #2c3e50; margin: 10px 0;">{course['title']}</h3>
|
101 |
<p><strong>Lessons:</strong> {course['lessons']}</p>
|
102 |
<p><strong>Price:</strong> {course['price']}</p>
|
@@ -122,7 +115,7 @@ def create_interface():
|
|
122 |
inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
|
123 |
outputs="html",
|
124 |
title="Course Search Engine",
|
125 |
-
description="Search for courses by title",
|
126 |
css="div.gradio-container {max-width: 800px; margin: auto;}"
|
127 |
)
|
128 |
return iface
|
@@ -133,7 +126,10 @@ if __name__ == "__main__":
|
|
133 |
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
|
134 |
|
135 |
print("Starting course scraping...")
|
136 |
-
|
|
|
|
|
|
|
137 |
print("Scraping completed. Data saved to course.json")
|
138 |
|
139 |
# Launch Gradio interface
|
|
|
8 |
from selenium.webdriver.support import expected_conditions as EC
|
9 |
import gradio as gr
|
10 |
import subprocess
|
11 |
+
import asyncio
|
12 |
+
from playwright.async_api import async_playwright
|
13 |
|
14 |
# Install required system dependencies
|
15 |
def setup_chrome_dependencies():
|
|
|
32 |
return chrome_options
|
33 |
|
34 |
# Function to scrape course data using Selenium
|
35 |
+
# Function to scrape course data using Playwright
async def scrape_courses_with_playwright(url, output_file="course.json"):
    """Scrape free-course cards from *url* and persist them to *output_file*.

    Parameters
    ----------
    url : str
        Page listing the course cards (elements matching ``.course-card``).
    output_file : str
        Path of the JSON file the scraped data is written to.

    Returns
    -------
    list[dict]
        One dict per course (title, description, lessons, price, image_url),
        or an empty list if scraping fails for any reason.
    """
    try:
        async with async_playwright() as p:
            # Chromium's sandbox cannot run inside the containerized
            # (Hugging Face) environment, so it is disabled.
            browser = await p.chromium.launch(chromium_sandbox=False)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until="networkidle")

                # Wait until the course cards have actually rendered
                # (the listing is populated client-side).
                await page.wait_for_selector(".course-card", timeout=60000)

                # Extract course info inside the page context.
                # NOTE(review): title and description read the same selector
                # (.course-card__body) — confirm that is intentional.
                courses = await page.evaluate("""
                    () => {
                        const courseElements = document.querySelectorAll('.course-card');
                        return Array.from(courseElements).map(course => {
                            return {
                                title: course.querySelector('.course-card__body')?.textContent?.trim() || 'No Title',
                                description: course.querySelector('.course-card__body')?.textContent?.trim() || 'No Description',
                                lessons: course.querySelector('.course-card__lesson-count')?.textContent?.trim() || 'No Lessons',
                                price: course.querySelector('.course-card__price')?.textContent?.trim() || 'No Price',
                                image_url: course.querySelector('img')?.src || 'No Image'
                            };
                        });
                    }
                """)

                # Save scraped data to JSON file
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(courses, f, ensure_ascii=False, indent=4)

                return courses
            finally:
                # Bug fix: the previous version closed the browser only on the
                # success path, leaking the Chromium process whenever goto /
                # wait_for_selector / the file write raised.  This mirrors the
                # old Selenium version's ``finally: driver.quit()``.
                await browser.close()
    except Exception as e:
        print(f"Error during scraping: {e}")
        return []
|
|
|
|
|
|
|
76 |
|
77 |
# Function to search for courses in the JSON file
|
78 |
def search_course_by_title(query, input_file="course.json"):
|
|
|
88 |
for course in results:
|
89 |
result = f"""
|
90 |
<div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
|
91 |
+
<img src="{course['image_url']}" alt="{course['title']}"
|
92 |
+
style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
|
93 |
<h3 style="color: #2c3e50; margin: 10px 0;">{course['title']}</h3>
|
94 |
<p><strong>Lessons:</strong> {course['lessons']}</p>
|
95 |
<p><strong>Price:</strong> {course['price']}</p>
|
|
|
115 |
inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
|
116 |
outputs="html",
|
117 |
title="Course Search Engine",
|
118 |
+
description="Search for courses by title.",
|
119 |
css="div.gradio-container {max-width: 800px; margin: auto;}"
|
120 |
)
|
121 |
return iface
|
|
|
126 |
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
|
127 |
|
128 |
print("Starting course scraping...")
|
129 |
+
|
130 |
+
# Run the async scraping function
|
131 |
+
asyncio.run(scrape_courses_with_playwright(url))
|
132 |
+
|
133 |
print("Scraping completed. Data saved to course.json")
|
134 |
|
135 |
# Launch Gradio interface
|