File size: 5,414 Bytes
965e762 a29819d 2a1f334 a29819d 495c181 a29819d 454528b bbc48ea 454528b 495c181 454528b 495c181 454528b 2a1f334 a29819d bbc48ea a29819d 495c181 454528b 495c181 a29819d 495c181 a29819d bbc48ea a29819d 495c181 a29819d 0e4c9e1 495c181 a29819d 495c181 a29819d 495c181 454528b 495c181 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import gradio as gr
import subprocess
# Install required system dependencies
def setup_chrome_dependencies():
    """Install Google Chrome and Xvfb via apt (intended for a root container).

    Best-effort: exit codes of the shell commands are not checked, matching
    the original fire-and-forget behavior; callers get no error signal.
    """
    os.system('apt-get update')
    os.system('apt-get install -y wget gnupg2')
    # Register Google's signing key so the Chrome repo is trusted.
    os.system('wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -')
    # Bug fix: apt reads /etc/apt/sources.list.d/, not /etc/sources.list.d/,
    # so the original path never registered the repo. Use '>' (not '>>') so
    # re-running setup does not append duplicate repo lines.
    os.system('echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list')
    os.system('apt-get update')
    os.system('apt-get install -y google-chrome-stable')
    # Xvfb provides a virtual display for environments without X11.
    os.system('apt-get install -y xvfb')
# Function to get Chrome options for Hugging Face environment
def get_chrome_options():
    """Build Chrome options suitable for a containerized (Hugging Face) host.

    Returns a selenium ``Options`` object configured for headless operation
    with the apt-installed Chrome binary.
    """
    # Flags required to run Chrome headless as root inside a container.
    required_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
    )
    opts = Options()
    for flag in required_flags:
        opts.add_argument(flag)
    # Point selenium at the binary installed by setup_chrome_dependencies().
    opts.binary_location = '/usr/bin/google-chrome'
    return opts
# Function to scrape course data using Selenium
def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* with headless Chrome and save them as JSON.

    Parameters
    ----------
    url : str
        Page listing the courses; each course is expected in an element
        with CSS class ``course-card``.
    output_file : str
        Path the scraped list is written to (UTF-8 JSON, indent=4).

    Returns
    -------
    list[dict]
        One dict per course (title, description, lessons, price, image_url);
        an empty list on any top-level failure.
    """
    # Fix: bind driver before the try so the finally clause is always safe;
    # the original relied on a fragile `'driver' in locals()` check.
    driver = None
    try:
        # Install Chrome/Xvfb if missing (effectively a no-op when present).
        setup_chrome_dependencies()
        driver = webdriver.Chrome(options=get_chrome_options())
        driver.get(url)
        # Block until at least one course card is rendered (JS-driven page).
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )
        courses = []
        for course in driver.find_elements(By.CLASS_NAME, "course-card"):
            try:
                # NOTE(review): title and description read the SAME element
                # ("course-card__body"), so the two fields are currently
                # identical — confirm the real description selector against
                # the page markup before changing it.
                title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
                description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
                lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
                price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
                image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price,
                    "image_url": image_url,
                })
            except Exception as e:
                # Skip malformed cards but keep scraping the rest.
                print(f"Error extracting a course: {e}")
                continue
        # Persist the scraped data for search_course_by_title() to read.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)
        return courses
    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        if driver is not None:
            driver.quit()
# Function to search for courses in the JSON file
def search_course_by_title(query, input_file="course.json"):
    """Search the scraped course list for titles containing *query*.

    Parameters
    ----------
    query : str
        Substring matched case-insensitively against each course title.
    input_file : str
        JSON file produced by scrape_courses_with_selenium().

    Returns
    -------
    str
        An HTML fragment with one card per match, or a plain error /
        "no matches" message.
    """
    # Local import keeps this fix self-contained; html is stdlib.
    from html import escape

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)
        results = [course for course in courses if query.lower() in course["title"].lower()]
        if not results:
            return "No matching courses found."
        output = []
        for course in results:
            # Security fix: course fields come from a scraped web page
            # (untrusted input) and are rendered as HTML by Gradio —
            # escape them to prevent markup/script injection.
            title = escape(course['title'])
            description = escape(course['description'])
            lessons = escape(course['lessons'])
            price = escape(course['price'])
            image_url = escape(course['image_url'])
            result = f"""
            <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
                <img src="{image_url}" alt="{title}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
                <h3 style="color: #2c3e50; margin: 10px 0;">{title}</h3>
                <p><strong>Lessons:</strong> {lessons}</p>
                <p><strong>Price:</strong> {price}</p>
                <p style="color: #666;">{description}</p>
            </div>
            """
            output.append(result)
        return "\n".join(output)
    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        return f"Error: {e}"
# Gradio interface
def create_interface():
    """Assemble the Gradio search UI: one textbox in, rendered HTML out."""
    def run_query(query):
        # Thin adapter so Gradio calls the search with its single input.
        return search_course_by_title(query)

    return gr.Interface(
        fn=run_query,
        inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title.",
        css="div.gradio-container {max-width: 800px; margin: auto;}",
    )
# Main execution
if __name__ == "__main__":
    # Scrape once at startup, then serve the search UI.
    target_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
    print("Starting course scraping...")
    scrape_courses_with_selenium(target_url)
    print("Scraping completed. Data saved to course.json")
    # Bind to all interfaces on the port Hugging Face Spaces expects.
    create_interface().launch(server_name="0.0.0.0", server_port=7860)