nikhildsst committed on
Commit
965e762
·
verified ·
1 Parent(s): 523c0b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -104
app.py CHANGED
@@ -1,137 +1,135 @@
1
- import requests
2
- from bs4 import BeautifulSoup
3
- import pandas as pd
4
- from sentence_transformers import SentenceTransformer
5
- from langchain_community.vectorstores import FAISS
6
- from langchain.docstore.document import Document
7
- import gradio as gr
8
  from selenium import webdriver
9
  from selenium.webdriver.chrome.service import Service as ChromeService
10
- from webdriver_manager.chrome import ChromeDriverManager
11
  from selenium.webdriver.common.by import By
12
  from selenium.webdriver.chrome.options import Options
13
- import time
14
-
15
  from selenium.webdriver.support.ui import WebDriverWait
16
  from selenium.webdriver.support import expected_conditions as EC
17
- from selenium.webdriver.common.by import By
18
-
19
-
20
-
21
-
22
-
23
- # Step 1: Scrape Course Data
24
- def scrape_courses(url):
25
- response = requests.get(url)
26
- soup = BeautifulSoup(response.content, "html.parser")
27
-
28
- # Debug: Print the soup structure
29
- # print(soup.prettify())
30
-
31
- courses = []
32
- for course in soup.find_all("div", class_="course-block"):
33
- title = course.find("div", class_="course-title").get_text(strip=True) if course.find("div", class_="course-title") else "No Title"
34
- description = course.find("div", class_="course-description").get_text(strip=True) if course.find("div", class_="course-description") else "No Description"
35
- courses.append({"title": title, "description": description})
36
-
37
- if not courses:
38
- print("No data found! Please check the website structure or the scraping logic.")
39
- return courses
40
-
41
 
42
- def scrape_courses_with_selenium(url):
43
- # Set up Selenium WebDriver options
44
  options = Options()
45
- options.headless = True # Run in headless mode
46
- service = ChromeService(executable_path="/path/to/your/chromedriver") # Manually set the path to the driver
47
- driver = webdriver.Chrome(service=service, options=options)
48
-
49
- # Open the webpage
50
  driver.get(url)
51
-
52
- # Wait for course-block elements to be present
53
  try:
54
- WebDriverWait(driver, 20).until(
55
- EC.presence_of_all_elements_located((By.CLASS_NAME, "course-block"))
56
  )
57
  except Exception as e:
58
  print(f"Error: {e}")
59
  driver.quit()
60
  return []
61
 
62
- # Now scrape the courses
63
  courses = []
64
  try:
65
- course_elements = driver.find_elements(By.CLASS_NAME, "course-title")
66
- print(f"Found {len(course_elements)} courses") # Debugging line
67
- for course in course_elements:
68
- title = course.find_element(By.CLASS_NAME, "course-title").text if course.find_element(By.CLASS_NAME, "course-title") else "No Title"
69
- description = course.find_element(By.CLASS_NAME, "course-description").text if course.find_element(By.CLASS_NAME, "course-description") else "No Description"
70
- courses.append({"title": title, "description": description})
 
 
 
 
 
 
 
 
 
 
 
71
  except Exception as e:
72
  print(f"Error scraping courses: {e}")
 
 
73
 
74
- driver.quit()
75
  return courses
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Example usage
79
- url = "https://courses.analyticsvidhya.com/pages/all-free-courses" # Replace with the actual URL
80
- courses = scrape_courses_with_selenium(url)
 
 
 
 
 
 
81
 
82
- # Print or process the data as needed
83
- if courses:
84
  for course in courses:
85
- print(f"Title: {course['title']}, Description: {course['description']}")
86
- else:
87
- print("No courses found!")
88
-
89
- # Step 2: Convert Data to DataFrame
90
- df = pd.DataFrame(courses)
91
-
92
- # Check if DataFrame is empty
93
- if df.empty:
94
- print("DataFrame is empty. No valid data was scraped.")
95
- exit()
96
-
97
- # Combine title and description for embeddings
98
- df["combined_text"] = df["title"] + " " + df["description"]
99
-
100
- # Step 3: Generate Embeddings Using SentenceTransformers
101
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
102
- course_embeddings = embedding_model.encode(df["combined_text"].tolist(), show_progress_bar=True)
103
-
104
- # Step 4: Store Embeddings in FAISS Vector Store
105
- documents = [
106
- Document(
107
- page_content=text,
108
- metadata={"source": f"Course {i+1}"}
109
- )
110
- for i, text in enumerate(df["combined_text"].tolist())
111
- ]
112
 
113
- vector_store = FAISS.from_documents(documents, embedding_model)
 
114
 
115
- # Step 5: Build the Smart Search System
116
- retriever = vector_store.as_retriever()
 
117
 
118
- # Mock QA Chain
119
- def mock_qa_chain(question):
120
- docs = retriever.get_relevant_documents(question)
121
- context = "\n".join([doc.page_content for doc in docs])
122
- return f"Mock Answer based on context:\n{context}\n\nSources: {', '.join([doc.metadata['source'] for doc in docs])}"
123
 
124
- # Step 6: Gradio Interface Function
125
- def smart_search(query):
126
- return mock_qa_chain(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- # Step 7: Deploying with Gradio
129
- iface = gr.Interface(
130
- fn=smart_search,
131
- inputs=gr.Textbox(label="Ask a Question", placeholder="Enter your question here..."),
132
- outputs=gr.Textbox(label="Answer"),
133
- live=True
134
- )
135
 
136
  if __name__ == "__main__":
137
- iface.launch()
 
1
+ import os
 
 
 
 
 
 
2
  from selenium import webdriver
3
  from selenium.webdriver.chrome.service import Service as ChromeService
 
4
  from selenium.webdriver.common.by import By
5
  from selenium.webdriver.chrome.options import Options
6
+ from webdriver_manager.chrome import ChromeDriverManager
 
7
  from selenium.webdriver.support.ui import WebDriverWait
8
  from selenium.webdriver.support import expected_conditions as EC
9
+ import time
10
+ import pandas as pd
11
+ from sentence_transformers import SentenceTransformer
12
+ from langchain.embeddings.base import Embeddings
13
+ from langchain.docstore.document import Document
14
+ from langchain_community.vectorstores import FAISS
15
+ import gradio as gr
16
+ import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
# Function to scrape course data using Selenium
def scrape_courses_with_selenium(url, limit=10):
    """Scrape up to ``limit`` course cards from ``url`` using headless Chrome.

    Returns a list of dicts with keys ``title``, ``description``, ``lessons``,
    ``price`` and ``image_url``; missing fields fall back to "No ..." text.
    Returns an empty list if the page never renders any course cards.
    """
    options = Options()
    # Selenium 4 deprecated the `options.headless` attribute (it is silently
    # ignored on recent versions); pass the explicit Chrome argument instead.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )

    driver.get(url)

    # The page is JS-rendered: wait until at least one course card exists.
    try:
        WebDriverWait(driver, 60).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "course-card"))
        )
    except Exception as e:
        print(f"Error: {e}")
        driver.quit()
        return []

    def _field(card, by, selector, default):
        # Best-effort lookup: a missing sub-element raises
        # NoSuchElementException, which must not abort the remaining cards.
        try:
            return card.find_element(by, selector).text or default
        except Exception:
            return default

    courses = []
    try:
        for card in driver.find_elements(By.CLASS_NAME, "course-card")[:limit]:
            # NOTE(review): the original read the same "course-card__body"
            # node for both title and description; preserved for
            # compatibility — confirm a distinct description selector.
            try:
                image_url = (
                    card.find_element(By.TAG_NAME, "img").get_attribute("src")
                    or "No Image"
                )
            except Exception:
                image_url = "No Image"

            courses.append({
                "title": _field(card, By.CLASS_NAME, "course-card__body", "No Title"),
                "description": _field(card, By.CLASS_NAME, "course-card__body", "No Description"),
                "lessons": _field(card, By.CLASS_NAME, "course-card__lesson-count", "No Lessons"),
                "price": _field(card, By.CLASS_NAME, "course-card__price", "No Price"),
                "image_url": image_url,
            })
    except Exception as e:
        print(f"Error scraping courses: {e}")
    finally:
        # Always release the browser, even on a scraping error.
        driver.quit()

    return courses
59
 
60
class SentenceTransformersEmbeddings(Embeddings):
    """LangChain-compatible embeddings backed by a SentenceTransformer model."""

    def __init__(self, model_name):
        # Load the underlying sentence-transformers model once per instance.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed a batch of documents, returning plain Python lists of floats."""
        vectors = self.model.encode(texts, show_progress_bar=True)
        return vectors.tolist()  # FAISS/LangChain expect lists, not ndarrays

    def embed_query(self, text):
        """Embed a single query string, returning a plain Python list."""
        batch = self.model.encode([text], show_progress_bar=True)
        return batch[0].tolist()  # first (only) row of the batch
73
 
74
def main():
    """Scrape a handful of courses, index them in FAISS, and serve a Gradio search UI."""
    # Source page and sample size for the scrape.
    url = "https://courses.analyticsvidhya.com/collections/courses"
    limit = 5  # Number of courses to scrape
    courses = scrape_courses_with_selenium(url, limit)

    if not courses:
        print("No courses found!")
        return

    # Echo what was scraped for quick inspection.
    for course in courses:
        print(f"Title: {course['title']}, Description: {course['description']}, Price: {course['price']}, Lessons: {course['lessons']}")

    # Tabulate the results and build the text fed to the embedder.
    df = pd.DataFrame(courses)
    df["combined_text"] = df["title"] + " " + df["description"]
    texts = df["combined_text"].tolist()

    # One embedding model shared by indexing and querying.
    embedding_model = SentenceTransformersEmbeddings('all-MiniLM-L6-v2')

    # One Document per course; metadata carries every field except the
    # description (which already lives in page_content).
    documents = []
    for i, text in enumerate(texts):
        metadata = {"source": f"Course {i+1}"}
        for key, value in courses[i].items():
            if key != 'description':
                metadata[key] = value
        documents.append(Document(page_content=text, metadata=metadata))

    # Build the FAISS index over the course documents.
    vector_store = FAISS.from_documents(documents, embedding_model)

    def smart_search(query):
        """Return the two most similar courses, formatted for display."""
        hits = vector_store.similarity_search(query, k=2)
        rendered = [
            f"\nTitle: {doc.metadata['title']}\n"
            f"Price: {doc.metadata['price']}\n"
            f"Lessons: {doc.metadata['lessons']}\n"
            f"Content: {doc.page_content}\n"
            for doc in hits
        ]
        return "\n---\n".join(rendered)

    # Wire the search function into a simple Gradio text-in/text-out UI.
    iface = gr.Interface(
        fn=smart_search,
        inputs=gr.Textbox(label="Search Courses", placeholder="Enter your search query..."),
        outputs=gr.Textbox(label="Results"),
        title="Course Search Engine",
        description="Search for courses based on your query. The system will return the most relevant matches.",
    )

    # Launch the interface
    iface.launch()
 
 
 
 
 
133
 
134
# Script entry point: run the scrape/index/serve pipeline only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    main()