# nikhildsst's picture
# Update app.py
# 454528b verified
import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import gradio as gr
import subprocess
# Install required system dependencies
def setup_chrome_dependencies():
    """Install Google Chrome and Xvfb through apt on a Debian-style host.

    The installation is best-effort: each command is run through the shell
    and a non-zero exit status is reported on stdout instead of raising, so
    the caller can still try to start a browser afterwards.

    Nothing is executed when a Chrome binary already exists at
    ``/usr/bin/google-chrome``, which keeps repeated calls cheap.
    """
    if os.path.exists('/usr/bin/google-chrome'):
        return

    commands = (
        'apt-get update',
        'apt-get install -y wget gnupg2',
        'wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -',
        # apt reads additional sources from /etc/apt/sources.list.d/; writing
        # with ">" (not ">>") keeps reruns from duplicating the entry.
        'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" '
        '> /etc/apt/sources.list.d/google-chrome.list',
        'apt-get update',
        'apt-get install -y google-chrome-stable',
        'apt-get install -y xvfb',
    )
    for command in commands:
        status = os.system(command)
        if status != 0:
            # Keep going: later steps may still succeed or be unnecessary.
            print(f"Warning: command exited with status {status}: {command}")
# Function to get Chrome options for Hugging Face environment
def get_chrome_options():
    """Build the Chrome ``Options`` object used to start the browser.

    Returns:
        An ``Options`` instance carrying the headless, no-sandbox,
        disable-dev-shm-usage and disable-gpu arguments, with the browser
        binary set to ``/usr/bin/google-chrome``.
    """
    opts = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'):
        opts.add_argument(flag)
    opts.binary_location = '/usr/bin/google-chrome'
    return opts
# Function to scrape course data using Selenium
def scrape_courses_with_selenium(url, output_file="course.json"):
    """Scrape course cards from *url* and store them as JSON.

    Args:
        url: Page to load in Chrome.
        output_file: Path of the JSON file the scraped courses are written to.

    Returns:
        A list of dicts with the keys ``title``, ``description``,
        ``lessons``, ``price`` and ``image_url``. An empty list is returned
        (and the error printed) when any step outside the per-card
        extraction fails; in that case *output_file* is not written.
    """
    # Bound before the try so the finally clause can always test it.
    driver = None
    try:
        # Setup Chrome dependencies if not already installed
        setup_chrome_dependencies()
        driver = webdriver.Chrome(options=get_chrome_options())
        driver.get(url)

        # Wait (up to 60 seconds) for course cards to be present
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        courses = []
        for card in driver.find_elements(By.CLASS_NAME, "course-card"):
            try:
                courses.append(_extract_course(card))
            except Exception as e:
                # One malformed card must not abort the whole scrape.
                print(f"Error extracting a course: {e}")

        # Save scraped data to JSON file
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(courses, f, ensure_ascii=False, indent=4)
        return courses
    except Exception as e:
        print(f"Error: {e}")
        return []
    finally:
        if driver is not None:
            driver.quit()


def _extract_course(card):
    """Read one course-card element into a dict of display fields."""
    # NOTE(review): title and description are both taken from the
    # "course-card__body" element, so they are always identical. This looks
    # like a copy-paste selector; confirm the intended class names.
    body_text = card.find_element(By.CLASS_NAME, "course-card__body").text
    lessons = card.find_element(By.CLASS_NAME, "course-card__lesson-count").text
    price = card.find_element(By.CLASS_NAME, "course-card__price").text
    image_url = card.find_element(By.TAG_NAME, "img").get_attribute("src")
    return {
        "title": body_text or "No Title",
        "description": body_text or "No Description",
        "lessons": lessons or "No Lessons",
        "price": price or "No Price",
        "image_url": image_url or "No Image",
    }
# Function to search for courses in the JSON file
def search_course_by_title(query, input_file="course.json"):
    """Return an HTML listing of the courses whose title contains *query*.

    The match is a case-insensitive substring test. An empty or ``None``
    query matches every course.

    Args:
        query: Text to look for in each course title.
        input_file: Path of the JSON file holding a list of course dicts.

    Returns:
        HTML markup with one card per matching course,
        ``"No matching courses found."`` when nothing matches, or a string
        starting with ``"Error:"`` when the file is missing or unreadable.
    """
    # Local import keeps this function self-contained.
    from html import escape

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            courses = json.load(f)

        needle = (query or "").lower()
        # Records without a title are treated as having an empty one rather
        # than aborting the whole search with a KeyError.
        matches = [
            course for course in courses
            if needle in str(course.get("title") or "").lower()
        ]
        if not matches:
            return "No matching courses found."

        cards = []
        for course in matches:
            # The values come from a scraped web page, so escape them before
            # placing them into markup that the UI renders as HTML.
            safe = {
                key: escape(str(course.get(key) or ""))
                for key in ("title", "description", "lessons", "price", "image_url")
            }
            cards.append(f"""
            <div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
                <img src="{safe['image_url']}" alt="{safe['title']}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">
                <h3 style="color: #2c3e50; margin: 10px 0;">{safe['title']}</h3>
                <p><strong>Lessons:</strong> {safe['lessons']}</p>
                <p><strong>Price:</strong> {safe['price']}</p>
                <p style="color: #666;">{safe['description']}</p>
            </div>
            """)
        return "\n".join(cards)
    except FileNotFoundError:
        return f"Error: The file {input_file} was not found."
    except Exception as e:
        # UI boundary: report the problem as text instead of raising.
        return f"Error: {e}"
# Gradio interface
def create_interface():
    """Assemble the Gradio interface for searching courses by title.

    Returns:
        A ``gr.Interface`` that passes the textbox value to
        ``search_course_by_title`` and renders the result as HTML.
    """

    def gradio_search(query):
        # Single-argument adapter; the JSON file path keeps its default.
        return search_course_by_title(query)

    search_box = gr.Textbox(label="Search Courses", placeholder="Enter course title...")
    return gr.Interface(
        fn=gradio_search,
        inputs=search_box,
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title.",
        css="div.gradio-container {max-width: 800px; margin: auto;}",
    )
# Main execution
if __name__ == "__main__":
    # URL for scraping
    url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
    print("Starting course scraping...")
    courses = scrape_courses_with_selenium(url)
    # The scraper returns an empty list when it fails, so only announce
    # success when something was actually collected.
    if courses:
        print("Scraping completed. Data saved to course.json")
    else:
        print("Scraping failed or found no courses; search results may be unavailable.")
    # Launch Gradio interface on all network interfaces, port 7860
    iface = create_interface()
    iface.launch(server_name="0.0.0.0", server_port=7860)