nikhildsst committed on
Commit
bbc48ea
·
verified ·
1 Parent(s): cf98c27

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +214 -84
app.py CHANGED
@@ -14,47 +14,150 @@ from langchain.docstore.document import Document
14
  from langchain_community.vectorstores import FAISS
15
  import gradio as gr
16
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Function to scrape course data using Selenium
19
- def scrape_courses_with_selenium(url, limit=50):
20
- options = Options()
21
- options.headless = True # Headless browsing
22
- driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
23
- driver.get(url)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
- WebDriverWait(driver, 60).until(
27
- EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  )
29
- except Exception as e:
30
- print(f"Error: {e}")
31
- driver.quit()
32
- return []
33
 
34
- courses = []
 
35
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
 
 
37
  for i, course in enumerate(course_elements):
38
  if i >= limit:
39
  break
40
- title = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Title"
41
- description = course.find_element(By.CLASS_NAME, "course-card__body").text or "No Description"
42
- lessons = course.find_element(By.CLASS_NAME, "course-card__lesson-count").text or "No Lessons"
43
- price = course.find_element(By.CLASS_NAME, "course-card__price").text or "No Price"
44
- image_url = course.find_element(By.TAG_NAME, "img").get_attribute("src") or "No Image"
45
-
46
- courses.append({
47
- "title": title,
48
- "description": description,
49
- "lessons": lessons,
50
- "price": price,
51
- "image_url": image_url,
52
- })
 
 
 
 
 
 
 
 
 
 
 
53
  except Exception as e:
54
- print(f"Error scraping courses: {e}")
 
 
55
  finally:
56
- driver.quit()
57
-
 
 
 
58
  return courses
59
 
60
  class SentenceTransformersEmbeddings(Embeddings):
@@ -62,74 +165,101 @@ class SentenceTransformersEmbeddings(Embeddings):
62
  self.model = SentenceTransformer(model_name)
63
 
64
  def embed_documents(self, texts):
65
- # Generates embeddings for a list of documents
66
  embeddings = self.model.encode(texts, show_progress_bar=True)
67
- return embeddings.tolist() # Convert numpy array to list
68
 
69
  def embed_query(self, text):
70
- # Generates embedding for a single query
71
  embedding = self.model.encode([text], show_progress_bar=True)[0]
72
- return embedding.tolist() # Convert numpy array to list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  def main():
75
- # URL for scraping
76
- url = "https://courses.analyticsvidhya.com/collections/courses"
77
- limit = 5 # Number of courses to scrape
78
- courses = scrape_courses_with_selenium(url, limit)
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- if not courses:
81
- print("No courses found!")
82
- return
 
 
 
83
 
84
- # Print course information
85
- for course in courses:
86
- print(f"Title: {course['title']}, Description: {course['description']}, Price: {course['price']}, Lessons: {course['lessons']}")
 
87
 
88
- # Convert Data to DataFrame
89
- df = pd.DataFrame(courses)
 
90
 
91
- # Combine title and description for embeddings
92
- df["combined_text"] = df["title"] + " " + df["description"]
93
- texts = df["combined_text"].tolist()
 
 
 
 
 
94
 
95
- # Initialize embedding model
96
- embedding_model = SentenceTransformersEmbeddings('all-MiniLM-L6-v2')
 
97
 
98
- # Create Documents for FAISS
99
- documents = [
100
- Document(
101
- page_content=text,
102
- metadata={"source": f"Course {i+1}", **{k:v for k,v in courses[i].items() if k != 'description'}}
 
 
 
 
 
 
 
 
 
 
103
  )
104
- for i, text in enumerate(texts)
105
- ]
106
-
107
- # Create FAISS Vector Store
108
- vector_store = FAISS.from_documents(documents, embedding_model)
109
-
110
- # Define search function
111
- def smart_search(query):
112
- docs = vector_store.similarity_search(query, k=2)
113
- results = []
114
- for doc in docs:
115
- result = f"\nTitle: {doc.metadata['title']}\n"
116
- result += f"Price: {doc.metadata['price']}\n"
117
- result += f"Lessons: {doc.metadata['lessons']}\n"
118
- result += f"Content: {doc.page_content}\n"
119
- results.append(result)
120
- return "\n---\n".join(results)
121
-
122
- # Create Gradio interface
123
- iface = gr.Interface(
124
- fn=smart_search,
125
- inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
126
- outputs=gr.Textbox(label="Results"),
127
- title="Course Search Engine",
128
- description="Search for courses based on your query. The system will return the most relevant matches.",
129
- )
130
-
131
- # Launch the interface
132
- iface.launch()
133
 
134
  if __name__ == "__main__":
135
  main()
 
14
  from langchain_community.vectorstores import FAISS
15
  import gradio as gr
16
  import numpy as np
17
import subprocess
import shutil

import os

# Install Google Chrome and a matching ChromeDriver at import time.
# Best-effort bootstrap for a fresh container: requires root and network
# access; failures are ignored (os.system return codes are not checked).
os.system('wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb')
os.system('apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb')
# NOTE: the original ran the version capture in its own os.system() call, so
# CHROME_VERSION was set in a throwaway subshell and the following download
# command saw an empty variable. Run the whole pipeline in ONE shell so the
# variable survives, and resolve the exact driver build via the
# LATEST_RELEASE_<major> endpoint instead of guessing "<major>.0".
os.system(
    'CHROME_VERSION=$(google-chrome --version | awk \'{print $3}\' | cut -d \'.\' -f 1) && '
    'DRIVER_VERSION=$(wget -qO- "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION}") && '
    'wget "https://chromedriver.storage.googleapis.com/${DRIVER_VERSION}/chromedriver_linux64.zip" && '
    'unzip -o chromedriver_linux64.zip && '
    'mv chromedriver /usr/bin/chromedriver && chmod +x /usr/bin/chromedriver'
)
29
 
 
 
 
 
 
 
30
 
31
def check_and_install_chrome():
    """Install Google Chrome via apt unless it is already present."""
    if check_chrome_installation():
        print("Google Chrome is already installed.")
        return

    print("Installing Google Chrome...")
    # Download the .deb and install it (apt resolves its dependencies).
    for command in (
        'wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb',
        'apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb',
    ):
        os.system(command)
40
+
41
+
42
+ def check_chrome_version():
43
  try:
44
+ result = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
45
+ if result.returncode == 0:
46
+ print("Chrome version:", result.stdout)
47
+ return result.stdout.strip()
48
+ except FileNotFoundError:
49
+ print("Google Chrome is not installed.")
50
+ return None
51
+
52
+ from selenium.webdriver.common.by import By
53
+ from selenium.webdriver.support.ui import WebDriverWait
54
+ from selenium.webdriver.support import expected_conditions as EC
55
+
56
def wait_for_element_by_class(driver, class_name, timeout=60):
    """Wait until at least one element with CSS class `class_name` is present.

    Returns the list of matching WebElements, or None if the wait times out.
    """
    # Imported locally: TimeoutException was referenced in the except clause
    # without ever being imported, so a real timeout raised NameError instead
    # of being handled.
    from selenium.common.exceptions import TimeoutException
    try:
        elements = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, class_name))
        )
        return elements
    except TimeoutException:
        print(f"Timed out waiting for element with class: {class_name}")
        return None
65
 
66
+ def check_chrome_installation():
67
+ """Check if Chrome is installed and get its version"""
68
  try:
69
+ # First try the default 'google-chrome' command
70
+ result = subprocess.run(['google-chrome', '--version'],
71
+ capture_output=True,
72
+ text=True)
73
+ if result.returncode == 0:
74
+ return True
75
+ except FileNotFoundError:
76
+ # If 'google-chrome' fails, try 'google-chrome-stable'
77
+ try:
78
+ result = subprocess.run(['google-chrome-stable', '--version'],
79
+ capture_output=True,
80
+ text=True)
81
+ if result.returncode == 0:
82
+ return True
83
+ except FileNotFoundError:
84
+ pass
85
+ return False
86
+
87
def setup_chrome_options():
    """Setup Chrome options with all necessary arguments"""
    opts = Options()
    # Flags required to run Chrome headless inside a container:
    # no sandbox, no /dev/shm, no GPU/rasterizer, no extensions.
    for flag in (
        '--headless=new',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-software-rasterizer',
        '--disable-extensions',
        '--disable-setuid-sandbox',
    ):
        opts.add_argument(flag)
    # Point Selenium at the apt-installed binary explicitly.
    opts.binary_location = "/usr/bin/google-chrome"
    return opts
99
+
100
def scrape_courses_with_selenium(url, limit=10):
    """Scrape up to `limit` course cards from `url` using headless Chrome.

    Returns a list of dicts with title/description/lessons/price keys.
    Raises RuntimeError if Chrome is missing; re-raises any page-level
    scraping failure after logging it. Per-card failures are skipped.
    """
    if not check_chrome_installation():
        raise RuntimeError("Google Chrome is not installed. Please install it first.")

    chrome_options = setup_chrome_options()
    courses = []
    # Defined before the try so `finally` can tell whether the driver ever
    # started (the original referenced a possibly-unbound `driver` and hid
    # the resulting NameError behind a bare `except: pass`).
    driver = None

    try:
        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        print("Successfully initialized Chrome driver")
        driver.get(url)
        print(f"Successfully navigated to {url}")

        # Wait for course cards to load
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
        print(f"Found {len(course_elements)} course elements")

        for i, course in enumerate(course_elements):
            if i >= limit:
                break
            try:
                # Selectors based on the site's course-card markup. Note:
                # find_element raises NoSuchElementException (caught below)
                # rather than returning None, so the original's
                # `elem.text if elem else ...` fallbacks were dead code.
                title = course.find_element(By.CSS_SELECTOR, ".course-card__title").text
                description = course.find_element(By.CSS_SELECTOR, ".course-card__description").text
                lessons = course.find_element(By.CSS_SELECTOR, ".course-card__lesson-count").text
                price = course.find_element(By.CSS_SELECTOR, ".course-card__price").text

                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price
                })
                print(f"Successfully scraped course {i+1}")

            except Exception as e:
                # Skip this card but keep scraping the rest.
                print(f"Error scraping course {i+1}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        raise

    finally:
        if driver is not None:
            driver.quit()

    return courses
162
 
163
  class SentenceTransformersEmbeddings(Embeddings):
 
165
  self.model = SentenceTransformer(model_name)
166
 
167
  def embed_documents(self, texts):
 
168
  embeddings = self.model.encode(texts, show_progress_bar=True)
169
+ return embeddings.tolist()
170
 
171
  def embed_query(self, text):
 
172
  embedding = self.model.encode([text], show_progress_bar=True)[0]
173
+ return embedding.tolist()
174
+
175
def create_search_interface(vector_store):
    """Build a Gradio-compatible search callback bound to `vector_store`."""

    def _format_hit(index, doc):
        # One result block: header, selected metadata fields, then content.
        return (
            f"\nResult {index}:\n"
            f"Title: {doc.metadata['title']}\n"
            f"Price: {doc.metadata['price']}\n"
            f"Lessons: {doc.metadata['lessons']}\n"
            f"Content: {doc.page_content}\n"
        )

    def search_courses(query):
        if not query.strip():
            return "Please enter a search query."

        try:
            hits = vector_store.similarity_search(query, k=2)
            return "\n---\n".join(
                _format_hit(i, doc) for i, doc in enumerate(hits, 1)
            )
        except Exception as e:
            # Surface the failure in the UI textbox rather than crashing.
            return f"Error during search: {str(e)}"

    return search_courses
195
 
196
def _print_scraped_courses(courses):
    # Console summary of what the scraper returned.
    print("\nScraped Courses:")
    for course in courses:
        print(f"Title: {course['title']}")
        print(f"Price: {course['price']}")
        print(f"Lessons: {course['lessons']}")
        print("---")


def _build_vector_store(courses):
    # Embed "title + description" per course and index the vectors with FAISS.
    df = pd.DataFrame(courses)
    df["combined_text"] = df["title"] + " " + df["description"]
    texts = df["combined_text"].tolist()

    print("\nInitializing embedding model...")
    embedding_model = SentenceTransformersEmbeddings('all-MiniLM-L6-v2')

    # Attach all metadata except the (long) description to each document.
    documents = [
        Document(
            page_content=text,
            metadata={"source": f"Course {i+1}", **{k: v for k, v in courses[i].items() if k != 'description'}}
        )
        for i, text in enumerate(texts)
    ]

    print("Creating vector store...")
    return FAISS.from_documents(documents, embedding_model)


def _launch_interface(vector_store):
    # Wire the search callback into a text-in/text-out Gradio app and serve it.
    print("\nLaunching Gradio interface...")
    search_fn = create_search_interface(vector_store)

    iface = gr.Interface(
        fn=search_fn,
        inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
        outputs=gr.Textbox(label="Results"),
        title="Course Search Engine",
        description="Search for courses based on your query. The system will return the most relevant matches.",
        examples=[
            ["python programming course"],
            ["machine learning basics"],
            ["data analysis"]
        ]
    )

    iface.launch()


def main():
    """Entry point: scrape courses, build a FAISS index, serve a search UI.

    Decomposed into private helpers (print summary / build index / launch UI);
    behavior and output are unchanged from the monolithic original.
    """
    try:
        # Chrome is a hard requirement for the Selenium scraper.
        if not check_chrome_installation():
            print("Chrome is not installed. Please install Google Chrome first.")
            return

        url = "https://courses.analyticsvidhya.com/collections/courses"
        limit = 5
        print("Starting course scraping...")
        courses = scrape_courses_with_selenium(url, limit)

        if not courses:
            print("No courses found!")
            return

        _print_scraped_courses(courses)
        vector_store = _build_vector_store(courses)
        _launch_interface(vector_store)

    except Exception as e:
        # Top-level guard: report and exit rather than die with a traceback.
        print(f"Error in main: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
  if __name__ == "__main__":
265
  main()