awacke1 committed
Commit 2b0f7f4 · verified · 1 Parent(s): cc67713

Update app.py

Files changed (1)
  1. app.py +191 -127
app.py CHANGED
@@ -6,18 +6,26 @@ from sklearn.metrics.pairwise import cosine_similarity
 import torch
 import json
 import os
+import glob
 from pathlib import Path
 from datetime import datetime
 import edge_tts
 import asyncio
 import base64
+import requests
+import plotly.graph_objects as go
+from gradio_client import Client
+from collections import defaultdict
+from bs4 import BeautifulSoup
+from audio_recorder_streamlit import audio_recorder
 import streamlit.components.v1 as components
 
 # Page configuration
 st.set_page_config(
-    page_title="Video Search with Speech",
+    page_title="Video Search & Research Assistant",
     page_icon="🎥",
-    layout="wide"
+    layout="wide",
+    initial_sidebar_state="auto",
 )
 
 # Initialize session state
@@ -25,8 +33,21 @@ if 'search_history' not in st.session_state:
     st.session_state['search_history'] = []
 if 'last_voice_input' not in st.session_state:
     st.session_state['last_voice_input'] = ""
+if 'transcript_history' not in st.session_state:
+    st.session_state['transcript_history'] = []
+if 'should_rerun' not in st.session_state:
+    st.session_state['should_rerun'] = False
 
-# Initialize the speech component
+# Custom styling
+st.markdown("""
+<style>
+    .main { background: linear-gradient(to right, #1a1a1a, #2d2d2d); color: #fff; }
+    .stMarkdown { font-family: 'Helvetica Neue', sans-serif; }
+    .stButton>button { margin-right: 0.5rem; }
+</style>
+""", unsafe_allow_html=True)
+
+# Initialize components
 speech_component = components.declare_component("speech_recognition", path="mycomponent")
 
 class VideoSearch:
@@ -35,59 +56,87 @@ class VideoSearch:
         self.load_dataset()
 
     def fetch_dataset_rows(self):
-        """Fetch dataset from Hugging Face API"""
-        import requests
-
-        # Fetch first rows from the dataset
-        url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train"
-        response = requests.get(url)
-
-        if response.status_code == 200:
-            data = response.json()
-            # Extract the rows from the response
-            rows = data.get('rows', [])
-            return pd.DataFrame(rows)
-        else:
-            st.error(f"Error fetching dataset: {response.status_code}")
-            return None
+        """Fetch dataset from Hugging Face API with debug and caching"""
+        try:
+            # First try to load from local cache
+            cache_file = "dataset_cache.json"
+            if os.path.exists(cache_file):
+                st.info("Loading from cache...")
+                with open(cache_file, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                return pd.DataFrame(data)
+
+            st.info("Fetching from Hugging Face API...")
+            url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train"
 
-    def get_dataset_splits(self):
-        """Get available dataset splits"""
-        import requests
-
-        url = "https://datasets-server.huggingface.co/splits?dataset=omegalabsinc%2Fomega-multimodal"
-        response = requests.get(url)
-
-        if response.status_code == 200:
-            splits_data = response.json()
-            return splits_data
-        else:
-            st.error(f"Error fetching splits: {response.status_code}")
+            # Add debug output
+            st.write(f"Requesting URL: {url}")
+
+            response = requests.get(url, timeout=30)
+            st.write(f"Response status: {response.status_code}")
+
+            if response.status_code == 200:
+                data = response.json()
+
+                # Debug output
+                st.write("Response structure:", list(data.keys()))
+
+                if 'rows' in data:
+                    rows = data['rows']
+
+                    # Cache the response
+                    with open(cache_file, 'w', encoding='utf-8') as f:
+                        json.dump(rows, f)
+
+                    df = pd.DataFrame(rows)
+
+                    # Debug output
+                    st.write("DataFrame columns:", list(df.columns))
+                    st.write("Number of rows:", len(df))
+
+                    return df
+                else:
+                    st.error("No 'rows' found in API response")
+                    st.write("API Response:", data)
+
+                    # Try loading example data
+                    example_file = "example_data.json"
+                    if os.path.exists(example_file):
+                        st.info("Loading example data...")
+                        with open(example_file, 'r', encoding='utf-8') as f:
+                            example_data = json.load(f)
+                        return pd.DataFrame(example_data)
+
+                    return None
+            else:
+                st.error(f"API request failed with status code: {response.status_code}")
+                if response.status_code == 404:
+                    st.error("Dataset not found - check the dataset name and configuration")
+                try:
+                    error_details = response.json()
+                    st.write("Error details:", error_details)
+                except:
+                    st.write("Raw response:", response.text)
+                return None
+
+        except Exception as e:
+            st.error(f"Error fetching dataset: {str(e)}")
+            import traceback
+            st.write("Traceback:", traceback.format_exc())
             return None
-
+
     def load_dataset(self):
-        """Load the Omega Multimodal dataset"""
         try:
-            # Fetch dataset from Hugging Face API
             self.dataset = self.fetch_dataset_rows()
-
             if self.dataset is not None:
-                # Get dataset splits info
-                splits_info = self.get_dataset_splits()
-                if splits_info:
-                    st.sidebar.write("Available splits:", splits_info)
-
                 self.prepare_features()
             else:
                 self.create_dummy_data()
-
         except Exception as e:
             st.error(f"Error loading dataset: {e}")
             self.create_dummy_data()
 
     def prepare_features(self):
-        """Prepare and cache embeddings"""
-        # Convert string representations of embeddings back to numpy arrays
         try:
            self.video_embeds = np.array([json.loads(e) if isinstance(e, str) else e
                                          for e in self.dataset.video_embed])
@@ -95,38 +144,17 @@ class VideoSearch:
                                          for e in self.dataset.description_embed])
         except Exception as e:
             st.error(f"Error preparing features: {e}")
-            # Create random embeddings as fallback
             num_rows = len(self.dataset)
             self.video_embeds = np.random.randn(num_rows, 384)
             self.text_embeds = np.random.randn(num_rows, 384)
 
-    def create_dummy_data(self):
-        """Create dummy data for testing"""
-        self.dataset = pd.DataFrame({
-            'video_id': [f'video_{i}' for i in range(10)],
-            'youtube_id': ['dQw4w9WgXcQ'] * 10,
-            'description': ['Sample video description'] * 10,
-            'views': [1000] * 10,
-            'start_time': [0] * 10,
-            'end_time': [60] * 10
-        })
-        # Create dummy embeddings
-        self.video_embeds = np.random.randn(10, 384) # Match model dimensions
-        self.text_embeds = np.random.randn(10, 384)
-
-
     def search(self, query, top_k=5):
-        """Search videos using query"""
         query_embedding = self.text_model.encode([query])[0]
 
-        # Compute similarities
         video_sims = cosine_similarity([query_embedding], self.video_embeds)[0]
         text_sims = cosine_similarity([query_embedding], self.text_embeds)[0]
 
-        # Combine similarities
         combined_sims = 0.5 * video_sims + 0.5 * text_sims
-
-        # Get top results
         top_indices = np.argsort(combined_sims)[-top_k:][::-1]
 
         results = []
@@ -140,30 +168,76 @@ class VideoSearch:
                 'relevance_score': float(combined_sims[idx]),
                 'views': self.dataset.iloc[idx]['views']
             })
-
         return results
 
+def perform_arxiv_search(query, vocal_summary=True, extended_refs=False):
+    """Perform Arxiv search with audio summaries"""
+    try:
+        client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
+        refs = client.predict(query, 20, "Semantic Search",
+                              "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                              api_name="/update_with_rag_md")[0]
+        response = client.predict(query, "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                                  True, api_name="/ask_llm")
+
+        result = f"### 🔎 {query}\n\n{response}\n\n{refs}"
+        st.markdown(result)
+
+        if vocal_summary:
+            audio_file = asyncio.run(generate_speech(response[:500]))
+            if audio_file:
+                st.audio(audio_file)
+                os.remove(audio_file)
+
+        return result
+    except Exception as e:
+        st.error(f"Error in Arxiv search: {e}")
+        return None
+
 async def generate_speech(text, voice="en-US-AriaNeural"):
     """Generate speech using Edge TTS"""
     if not text.strip():
         return None
 
-    communicate = edge_tts.Communicate(text, voice)
-    audio_file = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
-    await communicate.save(audio_file)
-    return audio_file
+    try:
+        communicate = edge_tts.Communicate(text, voice)
+        audio_file = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
+        await communicate.save(audio_file)
+        return audio_file
+    except Exception as e:
+        st.error(f"Error generating speech: {e}")
+        return None
+
+def process_audio_input(audio_bytes):
+    """Process audio input from recorder"""
+    if audio_bytes:
+        # Save temporary file
+        audio_path = f"temp_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
+        with open(audio_path, "wb") as f:
+            f.write(audio_bytes)
+
+        # Here you would typically use a speech-to-text service
+        # For now, we'll just acknowledge the recording
+        st.success("Audio recorded successfully!")
+
+        # Cleanup
+        if os.path.exists(audio_path):
+            os.remove(audio_path)
+
+        return True
+    return False
 
 def main():
-    st.title("🎥 Video Search with Speech Recognition")
+    st.title("🎥 Video Search & Research Assistant")
 
-    # Initialize video search
+    # Initialize search
     search = VideoSearch()
 
-    # Create tabs
-    tab1, tab2, tab3 = st.tabs(["🔍 Search", "🎙️ Voice Input", "💾 History"])
+    # Create main tabs
+    tab1, tab2, tab3 = st.tabs(["🔍 Video Search", "🎙️ Voice & Audio", "📚 Arxiv Research"])
 
     with tab1:
-        st.subheader("Search Videos")
+        st.subheader("Search Video Dataset")
 
         # Text search
        query = st.text_input("Enter your search query:")
@@ -203,74 +277,64 @@ def main():
                audio_file = asyncio.run(generate_speech(summary))
                if audio_file:
                    st.audio(audio_file)
-                    # Cleanup audio file
-                    if os.path.exists(audio_file):
-                        os.remove(audio_file)
+                    os.remove(audio_file)
 
     with tab2:
-        st.subheader("Voice Input")
+        st.subheader("Voice Input & Audio Recording")
 
-        # Speech recognition component
-        voice_input = speech_component()
-
-        if voice_input and voice_input != st.session_state['last_voice_input']:
-            st.session_state['last_voice_input'] = voice_input
-            st.markdown("**Transcribed Text:**")
-            st.write(voice_input)
+        col1, col2 = st.columns(2)
+        with col1:
+            st.write("🎙️ Speech Recognition")
+            voice_input = speech_component()
 
-            if st.button("🔍 Search Videos"):
-                results = search.search(voice_input, num_results)
-                st.session_state['search_history'].append({
-                    'query': voice_input,
-                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                    'results': results
-                })
-                for i, result in enumerate(results, 1):
-                    with st.expander(f"Result {i}: {result['description'][:100]}...", expanded=i==1):
-                        st.write(result['description'])
-                        if result['youtube_id']:
-                            st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result['start_time']}")
+            if voice_input and voice_input != st.session_state['last_voice_input']:
+                st.session_state['last_voice_input'] = voice_input
+                st.markdown("**Transcribed Text:**")
+                st.write(voice_input)
+
+                if st.button("🔍 Search Videos"):
+                    results = search.search(voice_input, num_results)
+                    for i, result in enumerate(results, 1):
+                        with st.expander(f"Result {i}", expanded=i==1):
+                            st.write(result['description'])
+                            if result['youtube_id']:
+                                st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result['start_time']}")
+
+        with col2:
+            st.write("🎵 Audio Recorder")
+            audio_bytes = audio_recorder()
+            if audio_bytes:
+                process_audio_input(audio_bytes)
 
     with tab3:
-        st.subheader("Search History")
+        st.subheader("Arxiv Research")
+        arxiv_query = st.text_input("🔍 Research Query:")
 
-        if st.button("🗑️ Clear History"):
-            st.session_state['search_history'] = []
-            st.experimental_rerun()
+        col1, col2 = st.columns(2)
+        with col1:
+            vocal_summary = st.checkbox("Generate Audio Summary", value=True)
+        with col2:
+            extended_refs = st.checkbox("Include Extended References", value=False)
 
-        for i, entry in enumerate(reversed(st.session_state['search_history'])):
-            with st.expander(f"Query: {entry['query']} ({entry['timestamp']})", expanded=False):
-                st.markdown(f"**Original Query:** {entry['query']}")
-                st.markdown(f"**Time:** {entry['timestamp']}")
-
-                for j, result in enumerate(entry['results'], 1):
-                    st.markdown(f"**Result {j}:**")
-                    st.write(result['description'])
-                    if result['youtube_id']:
-                        st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result['start_time']}")
+        if st.button("🔍 Search Arxiv") and arxiv_query:
+            perform_arxiv_search(arxiv_query, vocal_summary, extended_refs)
 
-    # Sidebar configuration
+    # Sidebar for history and settings
     with st.sidebar:
-        st.subheader("⚙️ Configuration")
-        st.markdown("**Video Search Settings**")
-        st.slider("Default Results:", 1, 10, 5, key="default_results")
+        st.subheader("⚙️ Settings & History")
 
-        st.markdown("**Voice Settings**")
+        if st.button("🗑️ Clear History"):
+            st.session_state['search_history'] = []
+            st.experimental_rerun()
+
+        st.markdown("### Recent Searches")
+        for entry in reversed(st.session_state['search_history'][-5:]):
+            st.markdown(f"**{entry['timestamp']}**: {entry['query']}")
+
+        st.markdown("### Voice Settings")
         st.selectbox("TTS Voice:",
                      ["en-US-AriaNeural", "en-US-GuyNeural", "en-GB-SoniaNeural"],
                      key="tts_voice")
-
-        st.markdown("**Model Settings**")
-        st.selectbox("Text Embedding Model:",
-                     ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2"],
-                     key="embedding_model")
-
-        if st.button("📥 Download Search History"):
-            # Convert history to JSON
-            history_json = json.dumps(st.session_state['search_history'], indent=2)
-            b64 = base64.b64encode(history_json.encode()).decode()
-            href = f'<a href="data:file/json;base64,{b64}" download="search_history.json">Download JSON</a>'
-            st.markdown(href, unsafe_allow_html=True)
 
 if __name__ == "__main__":
     main()