|
import os |
|
import json |
|
from selenium import webdriver |
|
from selenium.webdriver.chrome.service import Service as ChromeService |
|
from selenium.webdriver.common.by import By |
|
from selenium.webdriver.chrome.options import Options |
|
from selenium.webdriver.support.ui import WebDriverWait |
|
from selenium.webdriver.support import expected_conditions as EC |
|
import gradio as gr |
|
import subprocess |
|
|
|
|
|
def setup_chrome_dependencies():
    """Install Google Chrome and Xvfb with apt-get (best effort).

    Runs a fixed sequence of shell commands: refresh the package index,
    install ``wget``/``gnupg2``, register Google's signing key and Chrome
    apt repository, then install ``google-chrome-stable`` and ``xvfb``.

    A failing command does not abort the sequence; a warning with the exit
    code is printed and the remaining commands still run, so a host that
    already has Chrome installed keeps working.

    NOTE(review): presumably requires root on a Debian/Ubuntu host -- confirm
    against the deployment image. ``apt-key`` is kept from the original
    command list; verify it is still available on the target release.
    """
    commands = (
        'apt-get update',
        'apt-get install -y wget gnupg2',
        'wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -',
        # apt reads extra repositories from /etc/apt/sources.list.d/, and
        # ">" (not ">>") keeps the entry from being duplicated on every call.
        'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" '
        '> /etc/apt/sources.list.d/google-chrome.list',
        'apt-get update',
        'apt-get install -y google-chrome-stable',
        'apt-get install -y xvfb',
    )

    for command in commands:
        # shell=True is needed for the pipe/redirect; every command is a
        # constant string, so no external input reaches the shell.
        completed = subprocess.run(command, shell=True, check=False)
        if completed.returncode != 0:
            print(
                f"Warning: command exited with code {completed.returncode}: {command}"
            )
|
|
|
|
|
def get_chrome_options():
    """Build the Chrome ``Options`` object used by the scraper.

    Returns:
        An ``Options`` instance with the headless, no-sandbox,
        disable-dev-shm-usage and disable-gpu switches applied (in that
        order) and the browser binary set to ``/usr/bin/google-chrome``.
    """
    switches = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
    )

    opts = Options()
    for switch in switches:
        opts.add_argument(switch)
    opts.binary_location = '/usr/bin/google-chrome'
    return opts
|
|
|
|
|
def _first_text(card, class_name, default):
    """Return the text of the first element in *card* with *class_name*, or *default*."""
    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises -- which would make the fallback unreachable.
    matches = card.find_elements(By.CLASS_NAME, class_name)
    if not matches:
        return default
    return matches[0].text or default


def _extract_course(card):
    """Build the course record dict for a single ``course-card`` element."""
    images = card.find_elements(By.TAG_NAME, "img")
    image_src = images[0].get_attribute("src") if images else None

    return {
        # NOTE(review): title and description are both read from
        # "course-card__body", exactly as before -- confirm whether the page
        # offers separate elements for them.
        "title": _first_text(card, "course-card__body", "No Title"),
        "description": _first_text(card, "course-card__body", "No Description"),
        "lessons": _first_text(card, "course-card__lesson-count", "No Lessons"),
        "price": _first_text(card, "course-card__price", "No Price"),
        "image_url": image_src or "No Image",
    }


def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* and save them as JSON.

    Installs the Chrome dependencies, opens *url* in Chrome, waits up to 60
    seconds for elements with class ``course-card`` to be present, extracts
    one record per card and writes the list to *output_file*.

    A field whose element is missing or empty gets its placeholder value
    ("No Title", "No Price", ...) instead of discarding the whole course.

    Args:
        url: Page to load.
        output_file: Path of the JSON file to write (UTF-8, indent 4).

    Returns:
        The list of course dicts (keys: title, description, lessons, price,
        image_url). Returns ``[]`` if any step outside per-card extraction
        fails; the error is printed, not raised.
    """
    # Sentinel so the cleanup below knows whether a browser was started.
    driver = None
    try:
        setup_chrome_dependencies()

        driver = webdriver.Chrome(options=get_chrome_options())
        driver.get(url)

        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        courses = []
        for card in driver.find_elements(By.CLASS_NAME, "course-card"):
            try:
                courses.append(_extract_course(card))
            except Exception as e:
                # One broken card must not abort the whole scrape.
                print(f"Error extracting a course: {e}")

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)

        return courses

    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        if driver is not None:
            driver.quit()
|
|
|
|
|
def search_course_by_title(query, input_file="course.json"):
    """Search the saved courses by title and render the matches as HTML.

    Loads the course list from *input_file* and keeps every course whose
    title contains *query*, compared case-insensitively. Each match is
    rendered as an HTML card; all course fields are HTML-escaped because
    they come from a scraped page and are shown in an HTML output.

    Args:
        query: Substring to look for in course titles.
        input_file: Path of the JSON file holding the list of course dicts.

    Returns:
        The HTML cards joined by newlines, ``"No matching courses found."``
        when nothing matches, or an ``"Error: ..."`` string when the file is
        missing or any other exception occurs.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)

        needle = query.lower()  # hoisted: invariant across all courses
        matches = [course for course in courses if needle in course["title"].lower()]
        if not matches:
            return "No matching courses found."

        cards = []
        for course in matches:
            # str() mirrors the f-string's implicit conversion; escape()
            # also escapes quotes so values are safe inside attributes.
            title = escape(str(course["title"]))
            image_url = escape(str(course["image_url"]))
            lessons = escape(str(course["lessons"]))
            price = escape(str(course["price"]))
            description = escape(str(course["description"]))

            cards.append(f"""
            <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
                <img src="{image_url}" alt="{title}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
                <h3 style="color: #2c3e50; margin: 10px 0;">{title}</h3>
                <p><strong>Lessons:</strong> {lessons}</p>
                <p><strong>Price:</strong> {price}</p>
                <p style="color: #666;">{description}</p>
            </div>
            """)

        return "\n".join(cards)

    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        return f"Error: {e}"
|
|
|
|
|
def create_interface():
    """Assemble the Gradio interface for searching courses by title.

    Returns:
        A ``gr.Interface`` with a single text box as input and an HTML
        output, backed by ``search_course_by_title`` with its default
        input file.
    """
    def gradio_search(query):
        # Single-argument adapter: the UI supplies only the query text.
        return search_course_by_title(query)

    search_box = gr.Textbox(
        label="Search Courses",
        placeholder="Enter course title...",
    )
    container_css = "div.gradio-container {max-width: 800px; margin: auto;}"

    return gr.Interface(
        fn=gradio_search,
        inputs=search_box,
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title.",
        css=container_css,
    )
|
|
|
|
|
if __name__ == "__main__":
    # Page scraped once at startup, before the search UI is served.
    url = "https://courses.analyticsvidhya.com/pages/all-free-courses"

    print("Starting course scraping...")
    courses = scrape_courses_with_selenium(url)
    # The scraper returns [] on failure (and when no card was extracted),
    # so only announce success when there is data to search.
    if courses:
        print("Scraping completed. Data saved to course.json")
    else:
        print("Scraping returned no courses; course.json may be missing or empty.")

    # The UI is launched either way; the search reports a missing file itself.
    iface = create_interface()
    iface.launch(server_name="0.0.0.0", server_port=7860)