nikhildsst commited on
Commit
0e4c9e1
·
verified ·
1 Parent(s): 3e9edf5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -51
app.py CHANGED
@@ -8,6 +8,8 @@ from selenium.webdriver.support.ui import WebDriverWait
8
  from selenium.webdriver.support import expected_conditions as EC
9
  import gradio as gr
10
  import subprocess
 
 
11
 
12
  # Install required system dependencies
13
  def setup_chrome_dependencies():
@@ -30,57 +32,47 @@ def get_chrome_options():
30
  return chrome_options
31
 
32
  # Function to scrape course data using Selenium
33
# Function to scrape course data using Selenium
def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* with Selenium and save them to JSON.

    Parameters
    ----------
    url : str
        Page listing the courses to scrape.
    output_file : str
        Path of the JSON file the scraped courses are written to.

    Returns
    -------
    list[dict]
        One dict per course (title, description, lessons, price,
        image_url); an empty list if scraping fails entirely.
    """
    # Predeclare so the finally-block never depends on the fragile
    # `'driver' in locals()` idiom.
    driver = None
    try:
        # Install system dependencies and build Chrome options for this host.
        setup_chrome_dependencies()
        options = get_chrome_options()

        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Wait (up to 60 s) for at least one course card to render.
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        courses = []
        for course in driver.find_elements(By.CLASS_NAME, "course-card"):
            try:
                # NOTE(review): title and description both read the same
                # element (.course-card__body), so they are always equal —
                # confirm the real description selector against the page.
                title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
                description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
                lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
                price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
                image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"

                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price,
                    "image_url": image_url,
                })
            except Exception as e:
                # Skip a malformed card but keep scraping the rest.
                print(f"Error extracting a course: {e}")
                continue

        # Persist results so the search UI can read them later.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)

        return courses

    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        # Always release the browser, even when scraping failed.
        if driver is not None:
            driver.quit()
84
 
85
  # Function to search for courses in the JSON file
86
  def search_course_by_title(query, input_file="course.json"):
@@ -96,7 +88,8 @@ def search_course_by_title(query, input_file="course.json"):
96
  for course in results:
97
  result = f"""
98
  <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
99
- <img src="{course['image_url']}" alt="{course['title']}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
 
100
  <h3 style="color: #2c3e50; margin: 10px 0;">{course['title']}</h3>
101
  <p><strong>Lessons:</strong> {course['lessons']}</p>
102
  <p><strong>Price:</strong> {course['price']}</p>
@@ -122,7 +115,7 @@ def create_interface():
122
  inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
123
  outputs="html",
124
  title="Course Search Engine",
125
- description="Search for courses by title",
126
  css="div.gradio-container {max-width: 800px; margin: auto;}"
127
  )
128
  return iface
@@ -133,7 +126,10 @@ if __name__ == "__main__":
133
  url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
134
 
135
  print("Starting course scraping...")
136
- scrape_courses_with_selenium(url)
 
 
 
137
  print("Scraping completed. Data saved to course.json")
138
 
139
  # Launch Gradio interface
 
8
  from selenium.webdriver.support import expected_conditions as EC
9
  import gradio as gr
10
  import subprocess
11
+ import asyncio
12
+ from playwright.async_api import async_playwright
13
 
14
  # Install required system dependencies
15
  def setup_chrome_dependencies():
 
32
  return chrome_options
33
 
34
  # Function to scrape course data using Playwright
35
async def scrape_courses_with_playwright(url, output_file="course.json"):
    """Scrape course cards from *url* with Playwright and save them to JSON.

    Parameters
    ----------
    url : str
        Page listing the courses to scrape.
    output_file : str
        Path of the JSON file the scraped courses are written to.

    Returns
    -------
    list[dict]
        One dict per course (title, description, lessons, price,
        image_url); an empty list if scraping fails.
    """
    try:
        async with async_playwright() as p:
            # Sandbox disabled: required in containerized environments
            # (e.g. Hugging Face Spaces) where Chromium cannot sandbox.
            browser = await p.chromium.launch(
                chromium_sandbox=False,
            )
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until="networkidle")

                # Wait up to 60 s for at least one course card to render.
                await page.wait_for_selector(".course-card", timeout=60000)

                # Extract all card fields in a single in-page evaluation.
                # NOTE(review): title and description query the identical
                # '.course-card__body' node, so they are always equal —
                # confirm the real description selector against the page.
                courses = await page.evaluate("""
                    () => {
                        const courseElements = document.querySelectorAll('.course-card');
                        return Array.from(courseElements).map(course => {
                            return {
                                title: course.querySelector('.course-card__body')?.textContent?.trim() || 'No Title',
                                description: course.querySelector('.course-card__body')?.textContent?.trim() || 'No Description',
                                lessons: course.querySelector('.course-card__lesson-count')?.textContent?.trim() || 'No Lessons',
                                price: course.querySelector('.course-card__price')?.textContent?.trim() || 'No Price',
                                image_url: course.querySelector('img')?.src || 'No Image'
                            };
                        });
                    }
                """)
            finally:
                # Close the browser even if navigation or extraction
                # raised; previously it leaked on the error path.
                await browser.close()

        # Persist results so the search UI can read them later.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)

        return courses

    except Exception as e:
        print(f"Error during scraping: {e}")
        return []
 
 
 
76
 
77
  # Function to search for courses in the JSON file
78
  def search_course_by_title(query, input_file="course.json"):
 
88
  for course in results:
89
  result = f"""
90
  <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
91
+ <img src="{course['image_url']}" alt="{course['title']}"
92
+ style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
93
  <h3 style="color: #2c3e50; margin: 10px 0;">{course['title']}</h3>
94
  <p><strong>Lessons:</strong> {course['lessons']}</p>
95
  <p><strong>Price:</strong> {course['price']}</p>
 
115
  inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
116
  outputs="html",
117
  title="Course Search Engine",
118
+ description="Search for courses by title.",
119
  css="div.gradio-container {max-width: 800px; margin: auto;}"
120
  )
121
  return iface
 
126
  url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
127
 
128
  print("Starting course scraping...")
129
+
130
+ # Run the async scraping function
131
+ asyncio.run(scrape_courses_with_playwright(url))
132
+
133
  print("Scraping completed. Data saved to course.json")
134
 
135
  # Launch Gradio interface