nikhildsst committed on
Commit
495c181
·
verified ·
1 Parent(s): 9951a4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -43
app.py CHANGED
@@ -4,31 +4,52 @@ from selenium import webdriver
4
  from selenium.webdriver.chrome.service import Service as ChromeService
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.chrome.options import Options
7
- from webdriver_manager.chrome import ChromeDriverManager
8
  from selenium.webdriver.support.ui import WebDriverWait
9
  from selenium.webdriver.support import expected_conditions as EC
10
  import gradio as gr
11
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Function to scrape course data using Selenium
14
  def scrape_courses_with_selenium(url, output_file="course.json"):
15
- options = Options()
16
- options.headless = True # Headless browsing
17
- driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
18
- driver.get(url)
19
-
20
  try:
 
 
 
 
 
 
 
 
 
 
 
21
  WebDriverWait(driver, 60).until(
22
  EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
23
  )
24
- except Exception as e:
25
- print(f"Error: {e}")
26
- driver.quit()
27
- return []
28
 
29
- courses = []
30
- try:
31
  course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
 
32
  for course in course_elements:
33
  try:
34
  title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
@@ -46,17 +67,20 @@ def scrape_courses_with_selenium(url, output_file="course.json"):
46
  })
47
  except Exception as e:
48
  print(f"Error extracting a course: {e}")
 
49
 
50
- except Exception as e:
51
- print(f"Error scraping courses: {e}")
52
- finally:
53
- driver.quit()
54
 
55
- # Save scraped data to JSON file
56
- with open(output_file, "w", encoding="utf-8") as f:
57
- json.dump(courses, f, ensure_ascii=False, indent=4)
58
 
59
- return courses
 
 
 
 
 
60
 
61
  # Function to search for courses in the JSON file
62
  def search_course_by_title(query, input_file="course.json"):
@@ -64,41 +88,32 @@ def search_course_by_title(query, input_file="course.json"):
64
  with open(input_file, "r", encoding="utf-8") as f:
65
  courses = json.load(f)
66
 
67
- # Perform case-insensitive search
68
  results = [course for course in courses if query.lower() in course["title"].lower()]
69
  if not results:
70
  return "No matching courses found."
71
 
72
  output = []
73
  for course in results:
74
- result = f"<div style=\"margin-bottom: 20px;\">\n"
75
- result += f"<img src=\"{course['image_url']}\" alt=\"{course['title']}\" style=\"width:300px;height:auto;\">\n"
76
- result += f"<p><strong>Title:</strong> {course['title']}</p>\n"
77
- result += f"<p><strong>Lessons:</strong> {course['lessons']}</p>\n"
78
- result += f"<p><strong>Price:</strong> {course['price']}</p>\n"
79
- result += f"<p><strong>Description:</strong> {course['description']}</p>\n"
80
- result += "</div>\n"
 
 
81
  output.append(result)
82
 
83
- return "\n---\n".join(output)
84
 
85
  except FileNotFoundError:
86
  return f"Error: The file {input_file} was not found."
87
  except Exception as e:
88
  return f"Error: {e}"
89
 
90
-
91
- # Main function to scrape and search
92
- if __name__ == "__main__":
93
- # URL for scraping
94
- url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
95
-
96
- # Scrape and save data to JSON
97
- print("Scraping courses...")
98
- scrape_courses_with_selenium(url)
99
- print("Scraping completed. Data saved to course.json.")
100
-
101
- # Define Gradio interface for searching
102
  def gradio_search(query):
103
  return search_course_by_title(query)
104
 
@@ -108,6 +123,19 @@ if __name__ == "__main__":
108
  outputs="html",
109
  title="Course Search Engine",
110
  description="Search for courses by title from the scraped data stored in course.json.",
 
111
  )
 
112
 
113
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
4
  from selenium.webdriver.chrome.service import Service as ChromeService
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.chrome.options import Options
 
7
  from selenium.webdriver.support.ui import WebDriverWait
8
  from selenium.webdriver.support import expected_conditions as EC
9
  import gradio as gr
10
+ import subprocess
11
+
12
+ # Install required system dependencies
13
+ def setup_chrome_dependencies():
14
+ os.system('apt-get update')
15
+ os.system('apt-get install -y wget gnupg2')
16
+ os.system('wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -')
17
+ os.system('echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/sources.list.d/google-chrome.list')
18
+ os.system('apt-get update')
19
+ os.system('apt-get install -y google-chrome-stable')
20
+ os.system('apt-get install -y xvfb')
21
+
22
+ # Function to get Chrome options for Hugging Face environment
23
+ def get_chrome_options():
24
+ chrome_options = Options()
25
+ chrome_options.add_argument('--headless')
26
+ chrome_options.add_argument('--no-sandbox')
27
+ chrome_options.add_argument('--disable-dev-shm-usage')
28
+ chrome_options.add_argument('--disable-gpu')
29
+ chrome_options.binary_location = '/usr/bin/google-chrome'
30
+ return chrome_options
31
 
32
  # Function to scrape course data using Selenium
33
  def scrape_courses_with_selenium(url, output_file="course.json"):
 
 
 
 
 
34
  try:
35
+ # Setup Chrome dependencies if not already installed
36
+ setup_chrome_dependencies()
37
+
38
+ # Configure Chrome options
39
+ options = get_chrome_options()
40
+
41
+ # Start Chrome driver
42
+ driver = webdriver.Chrome(options=options)
43
+ driver.get(url)
44
+
45
+ # Wait for course cards to load
46
  WebDriverWait(driver, 60).until(
47
  EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
48
  )
 
 
 
 
49
 
50
+ courses = []
 
51
  course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
52
+
53
  for course in course_elements:
54
  try:
55
  title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
 
67
  })
68
  except Exception as e:
69
  print(f"Error extracting a course: {e}")
70
+ continue
71
 
72
+ # Save scraped data to JSON file
73
+ with open(output_file, "w", encoding="utf-8") as f:
74
+ json.dump(courses, f, ensure_ascii=False, indent=4)
 
75
 
76
+ return courses
 
 
77
 
78
+ except Exception as e:
79
+ print(f"Error: {e}")
80
+ return []
81
+ finally:
82
+ if 'driver' in locals():
83
+ driver.quit()
84
 
85
# Function to search for courses in the JSON file
def search_course_by_title(query, input_file="course.json"):
    """Case-insensitively search *input_file* for courses whose title contains *query*.

    Returns an HTML string of matching course cards, the string
    "No matching courses found." when nothing matches, or an error
    message string. The function never raises.
    """
    import html  # stdlib; local import keeps the file's top import block untouched

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)

        needle = query.lower()
        results = [course for course in courses if needle in course["title"].lower()]
        if not results:
            return "No matching courses found."

        output = []
        for course in results:
            # SECURITY FIX: these fields are scraped from an external site and
            # were interpolated into HTML unescaped; escape them so markup in
            # the scraped data cannot inject into the rendered page.
            title = html.escape(str(course["title"]))
            result = f"""
            <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
                <img src="{html.escape(str(course['image_url']), quote=True)}" alt="{title}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
                <h3 style="color: #2c3e50; margin: 10px 0;">{title}</h3>
                <p><strong>Lessons:</strong> {html.escape(str(course['lessons']))}</p>
                <p><strong>Price:</strong> {html.escape(str(course['price']))}</p>
                <p style="color: #666;">{html.escape(str(course['description']))}</p>
            </div>
            """
            output.append(result)

        return "\n".join(output)

    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        return f"Error: {e}"
114
 
115
+ # Gradio interface
116
+ def create_interface():
 
 
 
 
 
 
 
 
 
 
117
  def gradio_search(query):
118
  return search_course_by_title(query)
119
 
 
123
  outputs="html",
124
  title="Course Search Engine",
125
  description="Search for courses by title from the scraped data stored in course.json.",
126
+ css="div.gradio-container {max-width: 800px; margin: auto;}"
127
  )
128
+ return iface
129
 
# Main execution
if __name__ == "__main__":
    # Target page listing all free courses.
    scrape_target = "https://courses.analyticsvidhya.com/pages/all-free-courses"

    # Scrape first so course.json exists before the search UI starts.
    print("Starting course scraping...")
    scrape_courses_with_selenium(scrape_target)
    print("Scraping completed. Data saved to course.json")

    # Serve the search UI on all interfaces; Spaces expects port 7860.
    create_interface().launch(server_name="0.0.0.0", server_port=7860)