File size: 5,414 Bytes
965e762
a29819d
2a1f334
 
 
 
 
 
a29819d
495c181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a29819d
 
454528b
bbc48ea
454528b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495c181
454528b
495c181
454528b
 
 
2a1f334
a29819d
 
bbc48ea
a29819d
 
 
 
 
 
 
 
 
495c181
 
454528b
495c181
 
 
 
 
 
a29819d
 
495c181
a29819d
 
 
bbc48ea
a29819d
 
495c181
 
a29819d
 
 
 
 
 
 
 
0e4c9e1
495c181
a29819d
495c181
a29819d
495c181
 
 
 
 
 
454528b
495c181
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import html
import json
import os
import subprocess

import gradio as gr
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Install required system dependencies
def setup_chrome_dependencies():
    """Install Google Chrome and Xvfb via apt (best-effort, idempotent).

    Intended for container hosts (e.g. Hugging Face Spaces) whose base
    image ships no browser. Skips the whole install when a Chrome binary
    is already present so repeated scrapes don't re-run apt every time.
    """
    # Already installed? (Same binary path get_chrome_options() points at.)
    if os.path.exists('/usr/bin/google-chrome'):
        return
    os.system('apt-get update')
    os.system('apt-get install -y wget gnupg2')
    os.system('wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -')
    # BUG FIX: apt only reads sources from /etc/apt/sources.list.d/ --
    # the original wrote to /etc/sources.list.d/, which apt ignores, so
    # google-chrome-stable could never be found by the install below.
    os.system('echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list')
    os.system('apt-get update')
    os.system('apt-get install -y google-chrome-stable')
    os.system('apt-get install -y xvfb')

# Function to get Chrome options for Hugging Face environment
def get_chrome_options():
    """Build Chrome options for running headless inside a container.

    Returns a configured ``Options`` object pointing at the system
    google-chrome binary, with the flags required for sandboxed hosts.
    """
    opts = Options()
    # Flags needed for headless operation in a root/containerized host.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'):
        opts.add_argument(flag)
    opts.binary_location = '/usr/bin/google-chrome'
    return opts

# Function to scrape course data using Selenium
def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* and dump them to *output_file*.

    Args:
        url: Page listing the courses (expects elements with the
            "course-card" CSS class).
        output_file: Path of the JSON file written with the results.

    Returns:
        The list of scraped course dicts, or ``[]`` on failure.
    """
    driver = None  # sentinel so `finally` knows whether the browser started
    try:
        # Setup Chrome dependencies if not already installed
        setup_chrome_dependencies()

        # Configure Chrome options
        options = get_chrome_options()

        # Start Chrome driver
        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Wait (up to 60s) for course cards to load
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        courses = []
        for course in driver.find_elements(By.CLASS_NAME, "course-card"):
            try:
                # NOTE(review): title and description read the SAME
                # "course-card__body" element, so both fields always hold
                # identical text. Confirm the real title selector on the
                # target page (e.g. "course-card__title") and split them.
                title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
                description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
                lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
                price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
                image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"

                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price,
                    "image_url": image_url,
                })
            except Exception as e:
                # Skip malformed cards but keep scraping the rest.
                print(f"Error extracting a course: {e}")
                continue

        # Save scraped data to JSON file
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)

        return courses

    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        # Explicit sentinel instead of probing locals(): clearer, and safe
        # even when webdriver.Chrome() itself raised before assignment.
        if driver is not None:
            driver.quit()

# Function to search for courses in the JSON file
def search_course_by_title(query, input_file="course.json"):
    """Return an HTML snippet of courses whose title contains *query*.

    Matching is case-insensitive. All scraped fields are HTML-escaped
    before interpolation so untrusted page content cannot break the
    markup or inject script into the rendered results.

    Args:
        query: Substring to look for in course titles.
        input_file: Path to the JSON file produced by the scraper.

    Returns:
        An HTML string of matching course cards, or a plain message when
        nothing matches or the file cannot be read.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)

        results = [course for course in courses if query.lower() in course["title"].lower()]
        if not results:
            return "No matching courses found."

        output = []
        for course in results:
            # html.escape (quote=True by default) also makes the values
            # safe inside the src/alt attribute quotes below.
            title = html.escape(course['title'])
            description = html.escape(course['description'])
            lessons = html.escape(course['lessons'])
            price = html.escape(course['price'])
            image_url = html.escape(course['image_url'])

            result = f"""
            <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
                <img src="{image_url}" alt="{title}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
                <h3 style="color: #2c3e50; margin: 10px 0;">{title}</h3>
                <p><strong>Lessons:</strong> {lessons}</p>
                <p><strong>Price:</strong> {price}</p>
                <p style="color: #666;">{description}</p>
            </div>
            """
            output.append(result)

        return "\n".join(output)

    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        return f"Error: {e}"

# Gradio interface
def create_interface():
    """Assemble (but do not launch) the Gradio course-search UI."""

    def run_query(text):
        # Delegate to the JSON-backed search; result is an HTML fragment.
        return search_course_by_title(text)

    return gr.Interface(
        fn=run_query,
        inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title.",
        css="div.gradio-container {max-width: 800px; margin: auto;}",
    )

# Script entry point: scrape once, then serve the search UI.
if __name__ == "__main__":
    # Page to scrape for free-course listings.
    target_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"

    print("Starting course scraping...")
    scrape_courses_with_selenium(target_url)
    print("Scraping completed. Data saved to course.json")

    # Bind on all interfaces so the app is reachable from a container host.
    create_interface().launch(server_name="0.0.0.0", server_port=7860)