File size: 5,414 Bytes
965e762 a29819d 2a1f334 a29819d 495c181 a29819d 454528b bbc48ea 454528b 495c181 454528b 495c181 454528b 2a1f334 a29819d bbc48ea a29819d 495c181 454528b 495c181 a29819d 495c181 a29819d bbc48ea a29819d 495c181 a29819d 0e4c9e1 495c181 a29819d 495c181 a29819d 495c181 454528b 495c181 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import gradio as gr
import subprocess
# Install required system dependencies
def setup_chrome_dependencies():
    """Install Google Chrome and Xvfb via apt (intended for a root container).

    Best-effort: exit codes of the shell commands are not checked, matching
    the original fire-and-forget behavior; callers get no error signal.
    """
    os.system('apt-get update')
    os.system('apt-get install -y wget gnupg2')
    # Register Google's signing key so the Chrome repo is trusted.
    os.system('wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -')
    # Bug fix: apt reads /etc/apt/sources.list.d/, not /etc/sources.list.d/,
    # so the original path never registered the repo. Use '>' (not '>>') so
    # re-running setup does not append duplicate repo lines.
    os.system('echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list')
    os.system('apt-get update')
    os.system('apt-get install -y google-chrome-stable')
    # Xvfb provides a virtual display for environments without X11.
    os.system('apt-get install -y xvfb')
# Function to get Chrome options for Hugging Face environment
def get_chrome_options():
    """Build Chrome options suitable for a containerized (Hugging Face) host.

    Returns a selenium ``Options`` object configured for headless operation
    with the apt-installed Chrome binary.
    """
    # Flags required to run Chrome headless as root inside a container.
    required_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
    )
    opts = Options()
    for flag in required_flags:
        opts.add_argument(flag)
    # Point selenium at the binary installed by setup_chrome_dependencies().
    opts.binary_location = '/usr/bin/google-chrome'
    return opts
# Function to scrape course data using Selenium
def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* with headless Chrome and save them as JSON.

    Parameters
    ----------
    url : str
        Page listing the courses; each course is expected in an element
        with CSS class ``course-card``.
    output_file : str
        Path the scraped list is written to (UTF-8 JSON, indent=4).

    Returns
    -------
    list[dict]
        One dict per course (title, description, lessons, price, image_url);
        an empty list on any top-level failure.
    """
    # Fix: bind driver before the try so the finally clause is always safe;
    # the original relied on a fragile `'driver' in locals()` check.
    driver = None
    try:
        # Install Chrome/Xvfb if missing (effectively a no-op when present).
        setup_chrome_dependencies()
        driver = webdriver.Chrome(options=get_chrome_options())
        driver.get(url)
        # Block until at least one course card is rendered (JS-driven page).
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )
        courses = []
        for course in driver.find_elements(By.CLASS_NAME, "course-card"):
            try:
                # NOTE(review): title and description read the SAME element
                # ("course-card__body"), so the two fields are currently
                # identical — confirm the real description selector against
                # the page markup before changing it.
                title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
                description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
                lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
                price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
                image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price,
                    "image_url": image_url,
                })
            except Exception as e:
                # Skip malformed cards but keep scraping the rest.
                print(f"Error extracting a course: {e}")
                continue
        # Persist the scraped data for search_course_by_title() to read.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)
        return courses
    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        if driver is not None:
            driver.quit()
# Function to search for courses in the JSON file
def search_course_by_title(query, input_file="course.json"):
    """Search the scraped course list for titles containing *query*.

    Parameters
    ----------
    query : str
        Substring matched case-insensitively against each course title.
    input_file : str
        JSON file produced by scrape_courses_with_selenium().

    Returns
    -------
    str
        An HTML fragment with one card per match, or a plain error /
        "no matches" message.
    """
    # Local import keeps this fix self-contained; html is stdlib.
    from html import escape

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)
        results = [course for course in courses if query.lower() in course["title"].lower()]
        if not results:
            return "No matching courses found."
        output = []
        for course in results:
            # Security fix: course fields come from a scraped web page
            # (untrusted input) and are rendered as HTML by Gradio —
            # escape them to prevent markup/script injection.
            title = escape(course['title'])
            description = escape(course['description'])
            lessons = escape(course['lessons'])
            price = escape(course['price'])
            image_url = escape(course['image_url'])
            result = f"""
            <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
                <img src="{image_url}" alt="{title}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
                <h3 style="color: #2c3e50; margin: 10px 0;">{title}</h3>
                <p><strong>Lessons:</strong> {lessons}</p>
                <p><strong>Price:</strong> {price}</p>
                <p style="color: #666;">{description}</p>
            </div>
            """
            output.append(result)
        return "\n".join(output)
    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        return f"Error: {e}"
# Gradio interface
def create_interface():
    """Assemble the Gradio search UI: one textbox in, rendered HTML out."""
    def run_query(query):
        # Thin adapter so Gradio calls the search with its single input.
        return search_course_by_title(query)

    return gr.Interface(
        fn=run_query,
        inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title.",
        css="div.gradio-container {max-width: 800px; margin: auto;}",
    )
# Main execution
if __name__ == "__main__":
    # Scrape once at startup, then serve the search UI.
    target_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
    print("Starting course scraping...")
    scrape_courses_with_selenium(target_url)
    print("Scraping completed. Data saved to course.json")
    # Bind to all interfaces on the port Hugging Face Spaces expects.
    create_interface().launch(server_name="0.0.0.0", server_port=7860)