nikhildsst committed on
Commit
2a1f334
·
verified ·
1 Parent(s): 339a135

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -64
app.py CHANGED
@@ -1,52 +1,106 @@
 
 
 
 
1
  from langchain_community.vectorstores import FAISS
2
- from langchain_community.embeddings import OpenAIEmbeddings
3
- from langchain.prompts import PromptTemplate
4
- from langchain.chains import RetrievalQAWithSourcesChain
5
- from langchain_openai import OpenAIEmbeddings, OpenAI
6
- # Gradio imports
7
- import gradio as gr
8
-
9
-
10
  from langchain.docstore.document import Document
11
- import pandas as pd
12
- import os
13
-
14
- # Setting OpenAI API key
15
- os.environ["OPENAI_API_KEY"] = "sk-REDACTED"  # NOTE(review): a live-looking API key was committed in this diff — revoke it immediately and load the key from an environment variable or secret store instead of hard-coding it
16
-
17
- # Step 1: Load Course Data (as an example dataset)
18
- course_data = [
19
- {
20
- "title": "Introduction to Data Science",
21
- "description": "Learn the basics of data science including Python, statistics, and visualization.",
22
- "curriculum": "Python basics, statistics, visualization, case studies"
23
- },
24
- {
25
- "title": "Machine Learning Basics",
26
- "description": "Understand the fundamentals of machine learning algorithms and their applications.",
27
- "curriculum": "Supervised learning, unsupervised learning, regression, classification"
28
- },
29
- {
30
- "title": "Deep Learning Essentials",
31
- "description": "Dive into deep learning concepts including neural networks and TensorFlow.",
32
- "curriculum": "Neural networks, TensorFlow basics, image classification"
33
- }
34
- ]
35
-
36
- # Convert the course data into a DataFrame
37
- df = pd.DataFrame(course_data)
38
-
39
- # Combine title, description, and curriculum into a single searchable text column
40
- df["combined_text"] = df["title"] + " " + df["description"] + " " + df["curriculum"]
41
-
42
- # Step 2: Generate Embeddings for the Data
43
- embedding_model = OpenAIEmbeddings()
44
-
45
- # Generate embeddings for the combined text
46
- course_embeddings = embedding_model.embed_documents(df["combined_text"].tolist())
47
-
48
-
49
- # Step 3: Store the Embeddings in a Vector Database (FAISS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  documents = [
51
  Document(
52
  page_content=text,
@@ -57,26 +111,20 @@ documents = [
57
 
58
  vector_store = FAISS.from_documents(documents, embedding_model)
59
 
60
- # Step 4: Build the Smart Search System
61
- prompt_template = PromptTemplate(
62
- input_variables=["context", "question"],
63
- template="Use the following context to answer the question.\nContext: {context}\nQuestion: {question}\nAnswer:"
64
- )
65
-
66
  retriever = vector_store.as_retriever()
67
- qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
68
- llm=OpenAI(temperature=0),
69
- chain_type="stuff",
70
- retriever=retriever,
71
- return_source_documents=True
72
- )
73
 
74
- # Step 5: Gradio Interface
 
 
 
 
 
 
75
  def smart_search(query):
76
- result = qa_chain({"question": query})
77
- return result['answer']
78
 
79
- # Creating a Gradio interface
80
  iface = gr.Interface(
81
  fn=smart_search,
82
  inputs=gr.Textbox(label="Ask a Question", placeholder="Enter your question here..."),
@@ -86,4 +134,3 @@ iface = gr.Interface(
86
 
87
  if __name__ == "__main__":
88
  iface.launch()
89
-
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
  from langchain_community.vectorstores import FAISS
 
 
 
 
 
 
 
 
6
  from langchain.docstore.document import Document
7
+ import gradio as gr
8
+ from selenium import webdriver
9
+ from selenium.webdriver.chrome.service import Service as ChromeService
10
+ from selenium.webdriver.common.by import By
11
+ from selenium.webdriver.chrome.options import Options
12
+ from webdriver_manager.chrome import ChromeDriverManager
13
+ import time
14
+
15
+ from selenium.webdriver.support.ui import WebDriverWait
16
+ from selenium.webdriver.support import expected_conditions as EC
17
+ from selenium.webdriver.common.by import By
18
+
19
+
20
+
21
+
22
+
23
# Step 1: Scrape Course Data
def scrape_courses(url):
    """Scrape course titles and descriptions from a static HTML page.

    Parameters:
        url: page to fetch with `requests`.

    Returns:
        A list of {"title": str, "description": str} dicts; an empty list
        when no ``div.course-block`` elements are found.
    """
    # A timeout keeps the request from hanging indefinitely on a slow or
    # unreachable host (the original call had none).
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    courses = []
    for course in soup.find_all("div", class_="course-block"):
        # find() returns None when the node is absent, so look up each node
        # once and guard on the result (the original called find() twice
        # per field).
        title_node = course.find("div", class_="course-title")
        desc_node = course.find("div", class_="course-description")
        title = title_node.get_text(strip=True) if title_node else "No Title"
        description = desc_node.get_text(strip=True) if desc_node else "No Description"
        courses.append({"title": title, "description": description})

    if not courses:
        print("No data found! Please check the website structure or the scraping logic.")
    return courses
40
+
41
+
42
def scrape_courses_with_selenium(url):
    """Scrape course titles and descriptions from a JS-rendered page.

    Drives a headless Chrome via Selenium, waits up to 20s for the course
    blocks to render, then extracts title/description from each block.

    Parameters:
        url: page to load in the browser.

    Returns:
        A list of {"title": str, "description": str} dicts; an empty list
        on timeout or scraping failure.
    """
    # Set up Selenium WebDriver options.
    options = Options()
    # `Options.headless = True` is deprecated in Selenium 4; the CLI flag
    # is the supported spelling.
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

    try:
        # Open the webpage.
        driver.get(url)

        # Wait for course-block elements to be rendered by the page's JS.
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "course-block"))
            )
        except Exception as e:
            print(f"Error: {e}")
            return []

        # BUG FIX: the original iterated elements found by class
        # "course-title" and then searched for "course-title" /
        # "course-description" INSIDE them — the wrong container. Worse,
        # find_element() raises NoSuchElementException instead of returning
        # None, so the original's ternary guards could never take their
        # fallback branch. Iterate the enclosing "course-block" containers
        # and guard each inner lookup with try/except instead.
        courses = []
        course_elements = driver.find_elements(By.CLASS_NAME, "course-block")
        print(f"Found {len(course_elements)} courses")  # Debugging line
        for course in course_elements:
            try:
                title = course.find_element(By.CLASS_NAME, "course-title").text
            except Exception:
                title = "No Title"
            try:
                description = course.find_element(By.CLASS_NAME, "course-description").text
            except Exception:
                description = "No Description"
            courses.append({"title": title, "description": description})
        return courses
    except Exception as e:
        print(f"Error scraping courses: {e}")
        return []
    finally:
        # Always release the browser, even on an unexpected error.
        driver.quit()
75
+
76
+
77
# Example usage
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"  # Replace with the actual URL
courses = scrape_courses_with_selenium(url)

# Print or process the data as needed
if courses:
    for course in courses:
        print(f"Title: {course['title']}, Description: {course['description']}")
else:
    print("No courses found!")

# Step 2: Convert Data to DataFrame
df = pd.DataFrame(courses)

# Abort early when scraping produced nothing usable — the embedding and
# indexing steps below would fail on an empty frame.
if df.empty:
    print("DataFrame is empty. No valid data was scraped.")
    # `raise SystemExit` rather than the interactive-only `exit()` builtin,
    # which is provided by the `site` module and not guaranteed in every
    # runtime (e.g. when Python runs with -S or the app is embedded).
    raise SystemExit(1)

# Combine title and description into one searchable text column for embeddings.
df["combined_text"] = df["title"] + " " + df["description"]

# Step 3: Generate Embeddings Using SentenceTransformers
# NOTE(review): a raw SentenceTransformer is not a LangChain `Embeddings`
# object; the downstream `FAISS.from_documents(documents, embedding_model)`
# likely needs a wrapper such as HuggingFaceEmbeddings — confirm.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
course_embeddings = embedding_model.encode(df["combined_text"].tolist(), show_progress_bar=True)
102
+
103
+ # Step 4: Store Embeddings in FAISS Vector Store
104
  documents = [
105
  Document(
106
  page_content=text,
 
111
 
112
  vector_store = FAISS.from_documents(documents, embedding_model)
113
 
114
+ # Step 5: Build the Smart Search System
 
 
 
 
 
115
  retriever = vector_store.as_retriever()
 
 
 
 
 
 
116
 
117
# Mock QA Chain
def mock_qa_chain(question):
    """Retrieve relevant course documents and format a canned answer.

    Stands in for an LLM-backed QA chain: concatenates the retrieved
    documents' page contents as "context" and lists each document's source.

    Parameters:
        question: the user's free-text query.

    Returns:
        A formatted answer string containing the context and sources.
    """
    docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in docs)
    # .get() avoids a KeyError for documents indexed without a "source"
    # metadata key (the original subscripted doc.metadata['source'] directly).
    sources = ", ".join(doc.metadata.get("source", "unknown") for doc in docs)
    return f"Mock Answer based on context:\n{context}\n\nSources: {sources}"
122
+
123
# Step 6: Gradio Interface Function
def smart_search(query):
    """Gradio callback: answer *query* via the mock QA chain."""
    answer = mock_qa_chain(query)
    return answer
 
126
 
127
+ # Step 7: Deploying with Gradio
128
  iface = gr.Interface(
129
  fn=smart_search,
130
  inputs=gr.Textbox(label="Ask a Question", placeholder="Enter your question here..."),
 
134
 
135
# Launch the Gradio app only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()