import html
import json
import os
import subprocess

import gradio as gr
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Location of the Chrome binary that both the installer and the driver
# options refer to.
CHROME_BINARY = '/usr/bin/google-chrome'


# Install required system dependencies
def setup_chrome_dependencies():
    """Install Google Chrome and supporting packages through apt.

    The shell commands are run in order with ``os.system``. A failing
    command does not abort the sequence; its exit status is printed so the
    failure is visible instead of being silently ignored.
    """
    commands = (
        'apt-get update',
        'apt-get install -y wget gnupg2',
        'wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -',
        # apt only reads /etc/apt/sources.list.d/. Overwrite (">") rather
        # than append so repeated runs do not duplicate the entry.
        'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" '
        '> /etc/apt/sources.list.d/google-chrome.list',
        'apt-get update',
        'apt-get install -y google-chrome-stable',
        'apt-get install -y xvfb',
    )
    for command in commands:
        status = os.system(command)
        if status != 0:
            print(f"Warning: command exited with status {status}: {command}")


# Function to get Chrome options for Hugging Face environment
def get_chrome_options():
    """Return Chrome ``Options`` configured for headless, sandbox-less use."""
    chrome_options = Options()
    for flag in (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
    ):
        chrome_options.add_argument(flag)
    chrome_options.binary_location = CHROME_BINARY
    return chrome_options


def _element_text(parent, class_name, default):
    """Return the text of the first child with *class_name*, or *default* if empty."""
    return parent.find_element(By.CLASS_NAME, class_name).text or default


def _extract_course(card):
    """Build the course dict for one ``course-card`` element.

    Raises whatever Selenium raises when an expected child element is
    missing; the caller decides how to handle that.
    """
    # NOTE(review): title and description both read "course-card__body",
    # exactly as in the original code. This looks like a copy-paste slip,
    # but the correct title selector cannot be determined from this file.
    return {
        "title": _element_text(card, "course-card__body", "No Title"),
        "description": _element_text(card, "course-card__body", "No Description"),
        "lessons": _element_text(card, "course-card__lesson-count", "No Lessons"),
        "price": _element_text(card, "course-card__price", "No Price"),
        "image_url": card.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image",
    }


# Function to scrape course data using Selenium
def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* and save them to *output_file* as JSON.

    Args:
        url: Page to load in headless Chrome.
        output_file: Path of the JSON file the scraped list is written to.

    Returns:
        The list of course dicts (keys: title, description, lessons, price,
        image_url). Returns an empty list if any step fails; the error is
        printed and no exception propagates.
    """
    driver = None
    try:
        # Install Chrome only when the binary is missing, so repeated calls
        # do not re-run the apt commands.
        if not os.path.exists(CHROME_BINARY):
            setup_chrome_dependencies()

        driver = webdriver.Chrome(options=get_chrome_options())
        driver.get(url)

        # Wait for course cards to load
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        courses = []
        for card in driver.find_elements(By.CLASS_NAME, "course-card"):
            # Best effort: a card missing any expected element is skipped,
            # not fatal to the whole scrape.
            try:
                courses.append(_extract_course(card))
            except Exception as e:
                print(f"Error extracting a course: {e}")

        # Save scraped data to JSON file
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)

        return courses
    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        if driver is not None:
            driver.quit()


# Function to search for courses in the JSON file
def search_course_by_title(query, input_file="course.json"):
    """Return formatted entries for courses whose title contains *query*.

    Matching is a case-insensitive substring test. A ``None`` query is
    treated as empty and surrounding whitespace is ignored; an empty query
    matches every course.

    Args:
        query: Text to look for in each course title.
        input_file: JSON file containing a list of course dicts.

    Returns:
        The matching entries joined by newlines, a "no match" message, or an
        error message string. This function does not raise.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)

        needle = (query or "").strip().lower()
        results = [course for course in courses if needle in course["title"].lower()]
        if not results:
            return "No matching courses found."

        output = []
        for course in results:
            # The result is rendered as HTML and the values come from a
            # scraped page, so escape them before interpolation.
            title = html.escape(str(course['title']))
            lessons = html.escape(str(course['lessons']))
            price = html.escape(str(course['price']))
            description = html.escape(str(course['description']))
            output.append(
                f"\n{title}\n\n"
                f"{title}\n\n"
                f"Lessons: {lessons}\n\n"
                f"Price: {price}\n\n"
                f"{description}\n\n"
            )
        return "\n".join(output)
    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        return f"Error: {e}"


# Gradio interface
def create_interface():
    """Build the Gradio interface that exposes the course title search."""

    def gradio_search(query):
        return search_course_by_title(query)

    iface = gr.Interface(
        fn=gradio_search,
        inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title.",
        css="div.gradio-container {max-width: 800px; margin: auto;}",
    )
    return iface


# Main execution
if __name__ == "__main__":
    # URL for scraping
    url = "https://courses.analyticsvidhya.com/pages/all-free-courses"

    print("Starting course scraping...")
    scrape_courses_with_selenium(url)
    print("Scraping completed. Data saved to course.json")

    # Launch Gradio interface
    iface = create_interface()
    iface.launch(server_name="0.0.0.0", server_port=7860)