import os
import json
import subprocess

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import gradio as gr


def setup_chrome_dependencies():
    """Install Google Chrome and supporting packages via apt.

    Intended for ephemeral container environments (e.g. a Hugging Face
    Space) where the base image has no browser. Requires root; each call
    re-runs apt, which is idempotent but slow.
    """
    os.system('apt-get update')
    os.system('apt-get install -y wget gnupg2')
    # NOTE(review): apt-key is deprecated on modern Debian/Ubuntu; it still
    # works on the images this targets, but signed-by keyrings are the
    # forward-compatible mechanism.
    os.system('wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -')
    # BUG FIX: apt only reads repo lists from /etc/apt/sources.list.d/.
    # The original wrote to /etc/sources.list.d/, so the Google repo was
    # never registered and installing google-chrome-stable would fail.
    os.system('echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list')
    os.system('apt-get update')
    os.system('apt-get install -y google-chrome-stable')
    os.system('apt-get install -y xvfb')


def get_chrome_options():
    """Return Chrome options suitable for a headless container environment.

    --no-sandbox and --disable-dev-shm-usage are required when running as
    root inside a container with a small /dev/shm.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.binary_location = '/usr/bin/google-chrome'
    return chrome_options


def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* and persist them to *output_file*.

    Returns the list of scraped course dicts (empty list on failure).
    Each dict has keys: title, description, lessons, price, image_url.
    """
    driver = None  # explicit init so the finally-block check is robust
    try:
        # Install Chrome if the image does not already have it.
        setup_chrome_dependencies()

        options = get_chrome_options()
        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Wait (up to 60 s) for at least one course card to render.
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        courses = []
        course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
        for course in course_elements:
            try:
                # NOTE(review): title and description both read the same
                # .course-card__body element — the title selector is likely
                # meant to be a more specific class. Preserved as-is; verify
                # against the live page markup.
                title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
                description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
                lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
                price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
                image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price,
                    "image_url": image_url,
                })
            except Exception as e:
                # Skip malformed cards rather than aborting the whole scrape.
                print(f"Error extracting a course: {e}")
                continue

        # Persist the scraped data for later searching.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)

        return courses
    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        if driver is not None:
            driver.quit()


def search_course_by_title(query, input_file="course.json"):
    """Case-insensitively search scraped courses by title substring.

    Returns a human-readable string: either the formatted matches or a
    "no results" / error message.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)

        results = [course for course in courses
                   if query.lower() in course["title"].lower()]
        if not results:
            return "No matching courses found."

        output = []
        for course in results:
            result = f"""
Lessons: {course['lessons']}
Price: {course['price']}
{course['description']}
"""
            output.append(result)
        # NOTE(review): the tail of this function (template close,
        # accumulation, return, and error handling) was reconstructed from a
        # source truncated mid-f-string — confirm against the original file.
        return "\n".join(output)
    except FileNotFoundError:
        return "Course data not found. Please scrape the courses first."
    except Exception as e:
        return f"Error: {e}"