Update app.py
Browse files
app.py
CHANGED
@@ -14,47 +14,150 @@ from langchain.docstore.document import Document
|
|
14 |
from langchain_community.vectorstores import FAISS
|
15 |
import gradio as gr
|
16 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
# Function to scrape course data using Selenium
|
19 |
-
def scrape_courses_with_selenium(url, limit=50):
|
20 |
-
options = Options()
|
21 |
-
options.headless = True # Headless browsing
|
22 |
-
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
|
23 |
-
driver.get(url)
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
try:
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
)
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
return
|
33 |
|
34 |
-
|
|
|
35 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
|
|
|
|
|
37 |
for i, course in enumerate(course_elements):
|
38 |
if i >= limit:
|
39 |
break
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
except Exception as e:
|
54 |
-
print(f"Error scraping
|
|
|
|
|
55 |
finally:
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
58 |
return courses
|
59 |
|
60 |
class SentenceTransformersEmbeddings(Embeddings):
|
@@ -62,74 +165,101 @@ class SentenceTransformersEmbeddings(Embeddings):
|
|
62 |
self.model = SentenceTransformer(model_name)
|
63 |
|
64 |
def embed_documents(self, texts):
|
65 |
-
# Generates embeddings for a list of documents
|
66 |
embeddings = self.model.encode(texts, show_progress_bar=True)
|
67 |
-
return embeddings.tolist()
|
68 |
|
69 |
def embed_query(self, text):
|
70 |
-
# Generates embedding for a single query
|
71 |
embedding = self.model.encode([text], show_progress_bar=True)[0]
|
72 |
-
return embedding.tolist()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
def main():
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
87 |
|
88 |
-
|
89 |
-
|
|
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
|
96 |
-
|
|
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
)
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
# Define search function
|
111 |
-
def smart_search(query):
|
112 |
-
docs = vector_store.similarity_search(query, k=2)
|
113 |
-
results = []
|
114 |
-
for doc in docs:
|
115 |
-
result = f"\nTitle: {doc.metadata['title']}\n"
|
116 |
-
result += f"Price: {doc.metadata['price']}\n"
|
117 |
-
result += f"Lessons: {doc.metadata['lessons']}\n"
|
118 |
-
result += f"Content: {doc.page_content}\n"
|
119 |
-
results.append(result)
|
120 |
-
return "\n---\n".join(results)
|
121 |
-
|
122 |
-
# Create Gradio interface
|
123 |
-
iface = gr.Interface(
|
124 |
-
fn=smart_search,
|
125 |
-
inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
|
126 |
-
outputs=gr.Textbox(label="Results"),
|
127 |
-
title="Course Search Engine",
|
128 |
-
description="Search for courses based on your query. The system will return the most relevant matches.",
|
129 |
-
)
|
130 |
-
|
131 |
-
# Launch the interface
|
132 |
-
iface.launch()
|
133 |
|
134 |
if __name__ == "__main__":
|
135 |
main()
|
|
|
14 |
from langchain_community.vectorstores import FAISS
|
15 |
import gradio as gr
|
16 |
import numpy as np
|
17 |
+
import subprocess
|
18 |
+
import shutil
|
19 |
+
|
20 |
+
import os
|
21 |
+
|
# Install Google Chrome and ChromeDriver if not already installed.
# NOTE: all steps run in ONE shell invocation so that CHROME_VERSION,
# captured in the middle step, is still in scope when the chromedriver
# download URL is expanded. (Separate os.system() calls each spawn a
# fresh shell, so a variable set in one call is invisible to the next —
# the previous version downloaded from ".../.0/chromedriver_linux64.zip".)
os.system(
    'wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && '
    'apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb && '
    'CHROME_VERSION=$(google-chrome --version | awk \'{print $3}\' | cut -d \'.\' -f 1) && '
    'wget https://chromedriver.storage.googleapis.com/${CHROME_VERSION}.0/chromedriver_linux64.zip && '
    'unzip chromedriver_linux64.zip && '
    'mv chromedriver /usr/bin/chromedriver && chmod +x /usr/bin/chromedriver'
)
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_and_install_chrome():
    """Ensure Google Chrome is present, installing it via apt when missing."""
    # Skip the download entirely when a working Chrome binary already exists.
    if check_chrome_installation():
        print("Google Chrome is already installed.")
    else:
        print("Installing Google Chrome...")
        for command in (
            'wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb',
            'apt-get update && apt-get install -y ./google-chrome-stable_current_amd64.deb',
        ):
            os.system(command)
+
|
def check_chrome_version():
    """Return the installed Chrome version string, or None when unavailable.

    Runs ``google-chrome --version`` and returns its stripped stdout on
    success. Returns None when the binary is missing OR when it exists but
    exits non-zero (the latter was previously an implicit fall-through).
    """
    try:
        result = subprocess.run(['google-chrome', '--version'],
                                capture_output=True, text=True)
        if result.returncode == 0:
            print("Chrome version:", result.stdout)
            return result.stdout.strip()
        # Binary exists but the version query failed: report "unknown".
        return None
    except FileNotFoundError:
        print("Google Chrome is not installed.")
        return None
|
52 |
+
from selenium.webdriver.common.by import By
|
53 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
54 |
+
from selenium.webdriver.support import expected_conditions as EC
|
55 |
+
|
def wait_for_element_by_class(driver, class_name, timeout=60):
    """Wait until at least one element with *class_name* is present.

    Returns the list of matching WebElements, or None if the wait times out.
    """
    # Fixes a latent NameError: TimeoutException was caught below without
    # ever being imported, so every timeout crashed instead of returning None.
    from selenium.common.exceptions import TimeoutException

    try:
        # presence_of_all_elements_located yields a LIST of elements.
        elements = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, class_name))
        )
        return elements
    except TimeoutException:
        print(f"Timed out waiting for element with class: {class_name}")
        return None
65 |
|
def check_chrome_installation():
    """Check if Chrome is installed; return True when a launcher responds.

    Tries 'google-chrome' first, then 'google-chrome-stable', since different
    installs expose different launcher names. Unlike the previous version,
    the fallback is also tried when the first binary exists but exits
    non-zero, not only when it is absent.
    """
    for binary in ('google-chrome', 'google-chrome-stable'):
        try:
            result = subprocess.run([binary, '--version'],
                                    capture_output=True,
                                    text=True)
            if result.returncode == 0:
                return True
        except FileNotFoundError:
            continue  # launcher name not on PATH — try the next candidate
    return False
86 |
+
|
def setup_chrome_options():
    """Setup Chrome options with all arguments needed to run headless in a container."""
    flags = (
        '--headless=new',                # current headless mode flag
        '--no-sandbox',
        '--disable-dev-shm-usage',       # /dev/shm is tiny inside containers
        '--disable-gpu',
        '--disable-software-rasterizer',
        '--disable-extensions',
        '--disable-setuid-sandbox',
    )
    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)
    # Point selenium at the apt-installed Chrome binary explicitly.
    chrome_options.binary_location = "/usr/bin/google-chrome"
    return chrome_options
99 |
+
|
def scrape_courses_with_selenium(url, limit=10):
    """Scrape up to *limit* course cards from *url*.

    Returns a list of dicts with keys: title, description, lessons, price.
    Raises RuntimeError when Chrome is not installed; re-raises any failure
    that happens before/while locating the course cards.
    """
    if not check_chrome_installation():
        raise RuntimeError("Google Chrome is not installed. Please install it first.")

    chrome_options = setup_chrome_options()
    # Initialized up front so the finally block is safe even when Chrome
    # fails to start (previously `driver` could be unbound there, hidden by
    # a bare `except: pass` that also swallowed KeyboardInterrupt).
    driver = None
    courses = []

    try:
        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        print("Successfully initialized Chrome driver")
        driver.get(url)
        print(f"Successfully navigated to {url}")

        # Wait for course cards to load before querying them.
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )

        course_elements = driver.find_elements(By.CLASS_NAME, "course-card")
        print(f"Found {len(course_elements)} course elements")

        for i, course in enumerate(course_elements):
            if i >= limit:
                break
            try:
                # Selectors based on the site's course-card markup.
                title_elem = course.find_element(By.CSS_SELECTOR, ".course-card__title")
                desc_elem = course.find_element(By.CSS_SELECTOR, ".course-card__description")
                lessons_elem = course.find_element(By.CSS_SELECTOR, ".course-card__lesson-count")
                price_elem = course.find_element(By.CSS_SELECTOR, ".course-card__price")

                title = title_elem.text if title_elem else "No Title"
                description = desc_elem.text if desc_elem else "No Description"
                lessons = lessons_elem.text if lessons_elem else "No Lessons"
                price = price_elem.text if price_elem else "No Price"

                courses.append({
                    "title": title,
                    "description": description,
                    "lessons": lessons,
                    "price": price
                })
                print(f"Successfully scraped course {i+1}")

            except Exception as e:
                # One malformed card should not abort the whole scrape.
                print(f"Error scraping course {i+1}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        raise

    finally:
        # Quit only if the driver actually started.
        if driver is not None:
            driver.quit()

    return courses
162 |
|
163 |
class SentenceTransformersEmbeddings(Embeddings):
|
|
|
165 |
self.model = SentenceTransformer(model_name)
|
166 |
|
167 |
def embed_documents(self, texts):
|
|
|
168 |
embeddings = self.model.encode(texts, show_progress_bar=True)
|
169 |
+
return embeddings.tolist()
|
170 |
|
171 |
def embed_query(self, text):
|
|
|
172 |
embedding = self.model.encode([text], show_progress_bar=True)[0]
|
173 |
+
return embedding.tolist()
|
174 |
+
|
175 |
+
def create_search_interface(vector_store):
|
176 |
+
def search_courses(query):
|
177 |
+
if not query.strip():
|
178 |
+
return "Please enter a search query."
|
179 |
+
|
180 |
+
try:
|
181 |
+
docs = vector_store.similarity_search(query, k=2)
|
182 |
+
results = []
|
183 |
+
for i, doc in enumerate(docs, 1):
|
184 |
+
result = f"\nResult {i}:\n"
|
185 |
+
result += f"Title: {doc.metadata['title']}\n"
|
186 |
+
result += f"Price: {doc.metadata['price']}\n"
|
187 |
+
result += f"Lessons: {doc.metadata['lessons']}\n"
|
188 |
+
result += f"Content: {doc.page_content}\n"
|
189 |
+
results.append(result)
|
190 |
+
return "\n---\n".join(results)
|
191 |
+
except Exception as e:
|
192 |
+
return f"Error during search: {str(e)}"
|
193 |
+
|
194 |
+
return search_courses
|
195 |
|
196 |
def main():
|
197 |
+
try:
|
198 |
+
# Check Chrome installation
|
199 |
+
if not check_chrome_installation():
|
200 |
+
print("Chrome is not installed. Please install Google Chrome first.")
|
201 |
+
return
|
202 |
+
|
203 |
+
# Scrape courses
|
204 |
+
url = "https://courses.analyticsvidhya.com/collections/courses"
|
205 |
+
limit = 5
|
206 |
+
print("Starting course scraping...")
|
207 |
+
courses = scrape_courses_with_selenium(url, limit)
|
208 |
+
|
209 |
+
if not courses:
|
210 |
+
print("No courses found!")
|
211 |
+
return
|
212 |
|
213 |
+
print("\nScraped Courses:")
|
214 |
+
for course in courses:
|
215 |
+
print(f"Title: {course['title']}")
|
216 |
+
print(f"Price: {course['price']}")
|
217 |
+
print(f"Lessons: {course['lessons']}")
|
218 |
+
print("---")
|
219 |
|
220 |
+
# Create DataFrame
|
221 |
+
df = pd.DataFrame(courses)
|
222 |
+
df["combined_text"] = df["title"] + " " + df["description"]
|
223 |
+
texts = df["combined_text"].tolist()
|
224 |
|
225 |
+
# Initialize embedding model
|
226 |
+
print("\nInitializing embedding model...")
|
227 |
+
embedding_model = SentenceTransformersEmbeddings('all-MiniLM-L6-v2')
|
228 |
|
229 |
+
# Create Documents for FAISS
|
230 |
+
documents = [
|
231 |
+
Document(
|
232 |
+
page_content=text,
|
233 |
+
metadata={"source": f"Course {i+1}", **{k:v for k,v in courses[i].items() if k != 'description'}}
|
234 |
+
)
|
235 |
+
for i, text in enumerate(texts)
|
236 |
+
]
|
237 |
|
238 |
+
# Create FAISS Vector Store
|
239 |
+
print("Creating vector store...")
|
240 |
+
vector_store = FAISS.from_documents(documents, embedding_model)
|
241 |
|
242 |
+
# Create and launch Gradio interface
|
243 |
+
print("\nLaunching Gradio interface...")
|
244 |
+
search_fn = create_search_interface(vector_store)
|
245 |
+
|
246 |
+
iface = gr.Interface(
|
247 |
+
fn=search_fn,
|
248 |
+
inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
|
249 |
+
outputs=gr.Textbox(label="Results"),
|
250 |
+
title="Course Search Engine",
|
251 |
+
description="Search for courses based on your query. The system will return the most relevant matches.",
|
252 |
+
examples=[
|
253 |
+
["python programming course"],
|
254 |
+
["machine learning basics"],
|
255 |
+
["data analysis"]
|
256 |
+
]
|
257 |
)
|
258 |
+
|
259 |
+
iface.launch()
|
260 |
+
|
261 |
+
except Exception as e:
|
262 |
+
print(f"Error in main: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
# Script entry point: run the full scrape-index-serve pipeline.
if __name__ == "__main__":
    main()
|