nikhildsst commited on
Commit
a29819d
·
verified ·
1 Parent(s): 15f82a0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -170
app.py CHANGED
@@ -1,7 +1,5 @@
1
  import os
2
- import subprocess
3
- import pandas as pd
4
- import gradio as gr
5
  from selenium import webdriver
6
  from selenium.webdriver.chrome.service import Service as ChromeService
7
  from selenium.webdriver.common.by import By
@@ -9,57 +7,11 @@ from selenium.webdriver.chrome.options import Options
9
  from webdriver_manager.chrome import ChromeDriverManager
10
  from selenium.webdriver.support.ui import WebDriverWait
11
  from selenium.webdriver.support import expected_conditions as EC
12
- from sentence_transformers import SentenceTransformer
13
- from langchain.embeddings.base import Embeddings
14
- from langchain.docstore.document import Document
15
- from langchain_community.vectorstores import FAISS
16
- import numpy as np
17
- import shutil
18
-
19
-
20
- # Install Google Chrome and ChromeDriver if not already installed
21
- os.system('wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb')
22
- os.system('apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb')
23
- os.system('CHROME_VERSION=$(google-chrome --version | awk \'{print $3}\' | cut -d \'.\' -f 1)')
24
- os.system('wget https://chromedriver.storage.googleapis.com/${CHROME_VERSION}.0/chromedriver_linux64.zip')
25
- os.system('unzip chromedriver_linux64.zip')
26
- os.system('mv chromedriver /usr/bin/chromedriver && chmod +x /usr/bin/chromedriver')
27
-
28
def check_chrome_installation():
    """Return True when a Google Chrome binary responds to ``--version``.

    Tries ``google-chrome`` first, then ``google-chrome-stable``, so the
    check works on distros that install only one of the two launcher names.

    Returns:
        bool: True if any candidate binary runs and exits with status 0.
    """
    for binary in ('google-chrome', 'google-chrome-stable'):
        try:
            result = subprocess.run([binary, '--version'],
                                    capture_output=True,
                                    text=True)
        except FileNotFoundError:
            # Launcher name not on PATH — try the next candidate.
            continue
        if result.returncode == 0:
            return True
        # FIX: the original returned False here without ever trying the
        # '-stable' fallback when 'google-chrome' existed but exited nonzero.
    return False
48
-
49
def setup_chrome_options():
    """Build a Chrome ``Options`` object configured for headless container use."""
    flags = (
        '--headless=new',              # modern headless mode
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-software-rasterizer',
        '--disable-extensions',
        '--disable-setuid-sandbox',
    )
    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)
    return chrome_options
60
-
61
-
62
- def scrape_courses_with_selenium(url, limit=50):
63
  options = Options()
64
  options.headless = True # Headless browsing
65
  driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
@@ -77,129 +29,85 @@ def scrape_courses_with_selenium(url, limit=50):
77
  courses = []
78
  try:
79
  course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
80
- for i, course in enumerate(course_elements):
81
- if i >= limit:
82
- break
83
- title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
84
- description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
85
- lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
86
- price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
87
- image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
88
-
89
- courses.append({
90
- "title": title,
91
- "description": description,
92
- "lessons": lessons,
93
- "price": price,
94
- "image_url": image_url,
95
- })
 
 
96
  except Exception as e:
97
  print(f"Error scraping courses: {e}")
98
  finally:
99
  driver.quit()
100
-
 
 
 
 
101
  return courses
102
 
103
class SentenceTransformersEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a SentenceTransformer model."""

    def __init__(self, model_name):
        # Load the underlying sentence-transformers model once per instance.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed a batch of documents; returns a list of float lists."""
        return self.model.encode(texts, show_progress_bar=True).tolist()

    def embed_query(self, text):
        """Embed a single query string; returns one flat float list."""
        return self.model.encode([text], show_progress_bar=True)[0].tolist()
114
-
115
def create_search_interface(vector_store):
    """Return a closure that searches *vector_store* and formats hits as text.

    The closure takes a free-text query, retrieves the top-2 most similar
    documents, and renders title/price/lessons/content per hit.
    """

    def search_courses(query):
        # Guard clause: reject empty or whitespace-only queries up front.
        if not query.strip():
            return "Please enter a search query."
        try:
            hits = vector_store.similarity_search(query, k=2)
            formatted = [
                f"\nResult {idx}:\n"
                f"Title: {doc.metadata['title']}\n"
                f"Price: {doc.metadata['price']}\n"
                f"Lessons: {doc.metadata['lessons']}\n"
                f"Content: {doc.page_content}\n"
                for idx, doc in enumerate(hits, 1)
            ]
            return "\n---\n".join(formatted)
        except Exception as e:
            return f"Error during search: {str(e)}"

    return search_courses
135
-
136
def main():
    """Scrape free courses, index them in FAISS, and serve a Gradio search UI."""
    try:
        # Selenium needs a Chrome binary; bail out early when absent.
        if not check_chrome_installation():
            print("Chrome is not installed. Please install Google Chrome first.")
            return

        url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
        limit = 5
        print("Starting course scraping...")
        courses = scrape_courses_with_selenium(url, limit)
        if not courses:
            print("No courses found!")
            return

        print("\nScraped Courses:")
        for course in courses:
            print(f"Title: {course['title']}")
            print(f"Price: {course['price']}")
            print(f"Lessons: {course['lessons']}")
            print("---")

        # Title + description concatenated is the text that gets embedded.
        df = pd.DataFrame(courses)
        df["combined_text"] = df["title"] + " " + df["description"]
        texts = df["combined_text"].tolist()

        print("\nInitializing embedding model...")
        embedding_model = SentenceTransformersEmbeddings('all-MiniLM-L6-v2')

        # One Document per course; metadata carries every field except the
        # description (already present in page_content).
        documents = []
        for i, text in enumerate(texts):
            meta = {"source": f"Course {i+1}"}
            meta.update({k: v for k, v in courses[i].items() if k != 'description'})
            documents.append(Document(page_content=text, metadata=meta))

        print("Creating vector store...")
        vector_store = FAISS.from_documents(documents, embedding_model)

        print("\nLaunching Gradio interface...")
        search_fn = create_search_interface(vector_store)
        iface = gr.Interface(
            fn=search_fn,
            inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
            outputs=gr.Textbox(label="Results"),
            title="Course Search Engine",
            description="Search for courses based on your query. The system will return the most relevant matches.",
            examples=[
                ["python programming course"],
                ["machine learning basics"],
                ["data analysis"]
            ]
        )
        iface.launch()

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import json
 
 
3
  from selenium import webdriver
4
  from selenium.webdriver.chrome.service import Service as ChromeService
5
  from selenium.webdriver.common.by import By
 
7
  from webdriver_manager.chrome import ChromeDriverManager
8
  from selenium.webdriver.support.ui import WebDriverWait
9
  from selenium.webdriver.support import expected_conditions as EC
10
+ import gradio as gr
11
+
12
+
13
+ # Function to scrape course data using Selenium
14
+ def scrape_courses_with_selenium(url, output_file="course.json"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  options = Options()
16
  options.headless = True # Headless browsing
17
  driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
 
29
  courses = []
30
  try:
31
  course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
32
+ for course in course_elements:
33
+ try:
34
+ title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
35
+ description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
36
+ lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
37
+ price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
38
+ image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
39
+
40
+ courses.append({
41
+ "title": title,
42
+ "description": description,
43
+ "lessons": lessons,
44
+ "price": price,
45
+ "image_url": image_url,
46
+ })
47
+ except Exception as e:
48
+ print(f"Error extracting a course: {e}")
49
+
50
  except Exception as e:
51
  print(f"Error scraping courses: {e}")
52
  finally:
53
  driver.quit()
54
+
55
+ # Save scraped data to JSON file
56
+ with open(output_file, "w", encoding="utf-8") as f:
57
+ json.dump(courses, f, ensure_ascii=False, indent=4)
58
+
59
  return courses
60
 
61
+ # Function to search for courses in the JSON file
62
+ def search_course_by_title(query, input_file="course.json"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  try:
64
+ with open(input_file, "r", encoding="utf-8") as f:
65
+ courses = json.load(f)
66
+
67
+ # Perform case-insensitive search
68
+ results = [course for course in courses if query.lower() in course["title"].lower()]
69
+ if not results:
70
+ return "No matching courses found."
71
+
72
+ output = []
73
+ for course in results:
74
+ result = f"<div style=\"margin-bottom: 20px;\">\n"
75
+ result += f"<img src=\"{course['image_url']}\" alt=\"{course['title']}\" style=\"width:300px;height:auto;\">\n"
76
+ result += f"<p><strong>Title:</strong> {course['title']}</p>\n"
77
+ result += f"<p><strong>Lessons:</strong> {course['lessons']}</p>\n"
78
+ result += f"<p><strong>Price:</strong> {course['price']}</p>\n"
79
+ result += f"<p><strong>Description:</strong> {course['description']}</p>\n"
80
+ result += "</div>\n"
81
+ output.append(result)
82
+
83
+ return "\n---\n".join(output)
84
+
85
+ except FileNotFoundError:
86
+ return f"Error: The file {input_file} was not found."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  except Exception as e:
88
+ return f"Error: {e}"
89
+
90
 
91
# Main function to scrape and search
if __name__ == "__main__":
    # URL for scraping
    target_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"

    # Scrape once up front; the scraper persists its results to course.json.
    print("Scraping courses...")
    scrape_courses_with_selenium(target_url)
    print("Scraping completed. Data saved to course.json.")

    # Thin wrapper so Gradio passes the textbox value straight through.
    def gradio_search(query):
        return search_course_by_title(query)

    iface = gr.Interface(
        fn=gradio_search,
        inputs=gr.Textbox(label="Search Courses", placeholder="Enter course title..."),
        outputs="html",
        title="Course Search Engine",
        description="Search for courses by title from the scraped data stored in course.json.",
    )
    iface.launch()