Spaces:

nikhildsst
/

smart_search_projects

Running

App Files Files Community

smart_search_projects / app.py

nikhildsst

Update app.py

454528b verified about 1 month ago

raw

history blame contribute delete

5.41 kB

	import os
	import json
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service as ChromeService
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	import gradio as gr
	import subprocess

	# Install required system dependencies
	def setup_chrome_dependencies():
	    """Install Google Chrome and its helper packages via apt, best-effort.

	    Nothing is done when the Chrome binary already exists at
	    ``/usr/bin/google-chrome``, so calling this before every scrape is cheap.
	    Note that the skip also bypasses the ``xvfb`` install.

	    Every command is attempted even if an earlier one failed; failures are
	    reported on stdout and never raised.
	    """
	    if os.path.exists('/usr/bin/google-chrome'):
	        return

	    commands = [
	        'apt-get update',
	        'apt-get install -y wget gnupg2',
	        # A real shell pipe: the downloaded signing key is fed to apt-key.
	        'wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -',
	        # apt reads extra sources from /etc/apt/sources.list.d/; ">" overwrites
	        # so repeated runs do not pile up duplicate entries.
	        'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list',
	        'apt-get update',
	        'apt-get install -y google-chrome-stable',
	        'apt-get install -y xvfb',
	    ]
	    for command in commands:
	        exit_status = os.system(command)
	        if exit_status != 0:
	            print(f"Command failed with status {exit_status}: {command}")

	# Function to get Chrome options for Hugging Face environment
	def get_chrome_options():
	    """Return the Chrome ``Options`` used to start the browser headlessly.

	    The binary location is fixed to ``/usr/bin/google-chrome``.
	    """
	    flags = (
	        '--headless',
	        '--no-sandbox',
	        '--disable-dev-shm-usage',
	        '--disable-gpu',
	    )
	    opts = Options()
	    for flag in flags:
	        opts.add_argument(flag)
	    opts.binary_location = '/usr/bin/google-chrome'
	    return opts

	# Function to scrape course data using Selenium
	def _first_text(card, class_name, default):
	    """Return the text of the first ``class_name`` element in ``card``, else ``default``."""
	    matches = card.find_elements(By.CLASS_NAME, class_name)
	    if not matches:
	        return default
	    return matches[0].text or default


	def scrape_courses_with_selenium(url, output_file="course.json", timeout=60):
	    """Scrape course cards from ``url`` and save them as JSON.

	    Args:
	        url: Page to load in Chrome.
	        output_file: Path of the JSON file the course list is written to.
	        timeout: Seconds to wait for elements with class ``course-card``.

	    Returns:
	        A list of dicts with the keys ``title``, ``description``, ``lessons``,
	        ``price`` and ``image_url``. An empty list is returned (and the file
	        is not written) if anything outside the per-course extraction fails.
	    """
	    driver = None
	    try:
	        # Setup Chrome dependencies if not already installed
	        setup_chrome_dependencies()

	        driver = webdriver.Chrome(options=get_chrome_options())
	        driver.get(url)

	        # Wait for course cards to load
	        WebDriverWait(driver, timeout).until(
	            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
	        )

	        courses = []
	        for card in driver.find_elements(By.CLASS_NAME, "course-card"):
	            try:
	                # find_elements returns [] instead of raising, so a card that
	                # lacks one field keeps its other fields and gets a default.
	                # NOTE(review): title and description both read
	                # "course-card__body"; confirm whether the title has its own
	                # class on the target page.
	                body_text = _first_text(card, "course-card__body", "")
	                images = card.find_elements(By.TAG_NAME, "img")
	                image_src = images[0].get_attribute("src") if images else None

	                courses.append({
	                    "title": body_text or "No Title",
	                    "description": body_text or "No Description",
	                    "lessons": _first_text(card, "course-card__lesson-count", "No Lessons"),
	                    "price": _first_text(card, "course-card__price", "No Price"),
	                    "image_url": image_src or "No Image",
	                })
	            except Exception as e:
	                # Best-effort: a card that fails mid-read is skipped.
	                print(f"Error extracting a course: {e}")
	                continue

	        # Save scraped data to JSON file
	        with open(output_file, "w", encoding="utf-8") as f:
	            json.dump(courses, f, ensure_ascii=False, indent=4)

	        return courses

	    except Exception as e:
	        print(f"Error: {e}")
	        return []
	    finally:
	        # driver stays None when Chrome could not be started.
	        if driver is not None:
	            driver.quit()

	# Function to search for courses in the JSON file
	def search_course_by_title(query, input_file="course.json"):
	    """Return HTML cards for courses whose title contains ``query``.

	    The match is a case-insensitive substring test on each record's
	    ``title``. Records that are not dicts are ignored, and missing or null
	    fields are rendered as empty text. All field values are HTML-escaped
	    before being placed in the markup.

	    Args:
	        query: Text to look for; ``None`` is treated as an empty string.
	        input_file: Path of the JSON file holding the course list.

	    Returns:
	        The HTML for all matches joined by newlines, the string
	        ``"No matching courses found."`` when nothing matches, or a string
	        starting with ``"Error:"`` when the file cannot be read or parsed.
	    """
	    import html  # local import: not part of the file-level import block

	    def field(course, key):
	        # Escape stored text so it cannot inject markup into the page.
	        return html.escape(str(course.get(key) or ""), quote=True)

	    try:
	        with open(input_file, "r", encoding="utf-8") as f:
	            courses = json.load(f)

	        needle = (query or "").lower()
	        results = [
	            course for course in courses
	            if isinstance(course, dict)
	            and needle in str(course.get("title") or "").lower()
	        ]
	        if not results:
	            return "No matching courses found."

	        output = []
	        for course in results:
	            title = field(course, "title")
	            output.append(
	                '<div style="margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">\n'
	                f'<img src="{field(course, "image_url")}" alt="{title}" style="width:100%; max-width:300px; height:auto; border-radius: 4px;">\n'
	                f'<h3 style="color: #2c3e50; margin: 10px 0;">{title}</h3>\n'
	                f'<p><strong>Lessons:</strong> {field(course, "lessons")}</p>\n'
	                f'<p><strong>Price:</strong> {field(course, "price")}</p>\n'
	                f'<p style="color: #666;">{field(course, "description")}</p>\n'
	                '</div>'
	            )

	        return "\n".join(output)

	    except FileNotFoundError:
	        return f"Error: The file {input_file} was not found."
	    except Exception as e:
	        # UI boundary: report the problem as text instead of raising.
	        return f"Error: {e}"

	# Gradio interface
	def create_interface():
	    """Build and return the Gradio interface for searching courses by title."""
	    search_box = gr.Textbox(label="Search Courses", placeholder="Enter course title...")
	    page_css = "div.gradio-container {max-width: 800px; margin: auto;}"

	    def gradio_search(query):
	        # One-argument adapter: only the query box is wired to the search,
	        # so the default input file is always used.
	        return search_course_by_title(query)

	    return gr.Interface(
	        fn=gradio_search,
	        inputs=search_box,
	        outputs="html",
	        title="Course Search Engine",
	        description="Search for courses by title.",
	        css=page_css,
	    )

	# Main execution
	if __name__ == "__main__":
	    # URL for scraping
	    url = "https://courses.analyticsvidhya.com/pages/all-free-courses"

	    print("Starting course scraping...")
	    courses = scrape_courses_with_selenium(url)
	    if courses:
	        print(f"Scraping completed. {len(courses)} courses saved to course.json")
	    else:
	        # The scraper returns an empty list when it fails, so do not claim
	        # that fresh data was saved in that case.
	        print("Scraping failed or found no courses; search results may be missing or stale.")

	    # Launch Gradio interface
	    iface = create_interface()
	    iface.launch(server_name="0.0.0.0", server_port=7860)