awacke1 committed on
Commit
3f8c47a
·
verified ·
1 Parent(s): cbd471e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -59
app.py CHANGED
@@ -33,6 +33,7 @@ st.set_page_config(
33
  )
34
  load_dotenv()
35
 
 
36
  USER_NAMES = [
37
  "Aria", "Guy", "Sonia", "Tony", "Jenny", "Davis", "Libby", "Clara", "Liam", "Natasha", "William"
38
  ]
@@ -45,6 +46,12 @@ ENGLISH_VOICES = [
45
 
46
  USER_VOICES = dict(zip(USER_NAMES, ENGLISH_VOICES))
47
 
 
 
 
 
 
 
48
  if 'user_name' not in st.session_state:
49
  st.session_state['user_name'] = USER_NAMES[0]
50
  if 'old_val' not in st.session_state:
@@ -53,14 +60,11 @@ if 'viewing_prefix' not in st.session_state:
53
  st.session_state['viewing_prefix'] = None
54
  if 'should_rerun' not in st.session_state:
55
  st.session_state['should_rerun'] = False
 
 
56
 
57
- FILE_EMOJIS = {
58
- "md": "πŸ“",
59
- "mp3": "🎡",
60
- }
61
-
62
  def get_high_info_terms(text: str) -> list:
63
- # Expanded stop words
64
  stop_words = set([
65
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
66
  'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
@@ -71,7 +75,6 @@ def get_high_info_terms(text: str) -> list:
71
  'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there', 'as', 'if', 'while'
72
  ])
73
 
74
- # Key phrases tailored to your interests
75
  key_phrases = [
76
  'artificial intelligence', 'machine learning', 'deep learning', 'neural networks',
77
  'natural language processing', 'healthcare systems', 'clinical medicine',
@@ -81,16 +84,14 @@ def get_high_info_terms(text: str) -> list:
81
  'quantum mechanics', 'biomedical engineering', 'computational biology'
82
  ]
83
 
84
- # Preserve key phrases and remove them from the text
85
  preserved_phrases = []
86
  lower_text = text.lower()
87
  for phrase in key_phrases:
88
  if phrase in lower_text:
89
  preserved_phrases.append(phrase)
90
  text = text.replace(phrase, '')
91
- break # Stop after the first matching key phrase
92
 
93
- # Extract words and filter high-info terms
94
  words = re.findall(r'\b\w+(?:-\w+)*\b', text)
95
  high_info_words = [
96
  word.lower() for word in words
@@ -100,7 +101,6 @@ def get_high_info_terms(text: str) -> list:
100
  and any(c.isalpha() for c in word)
101
  ]
102
 
103
- # Combine preserved phrases and filtered words, ensuring uniqueness
104
  unique_terms = []
105
  seen = set()
106
  for term in preserved_phrases + high_info_words:
@@ -108,7 +108,6 @@ def get_high_info_terms(text: str) -> list:
108
  seen.add(term)
109
  unique_terms.append(term)
110
 
111
- # Return only the top 5 terms
112
  return unique_terms[:5]
113
 
114
  def clean_text_for_filename(text: str) -> str:
@@ -120,12 +119,9 @@ def clean_text_for_filename(text: str) -> str:
120
  return '_'.join(filtered)[:200]
121
 
122
  def generate_filename(prompt, response, file_type="md"):
123
- # Adjust timezone to Central Time
124
  central_tz = pytz.timezone('America/Chicago')
125
  central_time = datetime.now(central_tz)
126
-
127
- # Format the prefix to include the required format
128
- prefix = central_time.strftime("%m-%d-%y_%I-%M-%p_") # e.g., 12-20-24_11-34-AM_
129
 
130
  combined = (prompt + " " + response).strip()
131
  info_terms = get_high_info_terms(combined)
@@ -160,6 +156,7 @@ def clean_for_speech(text: str) -> str:
160
  text = re.sub(r"\s+", " ", text).strip()
161
  return text
162
 
 
163
  async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
164
  text = clean_for_speech(text)
165
  if not text.strip():
@@ -184,6 +181,7 @@ def play_and_download_audio(file_path):
184
  dl_link = f'<a href="data:audio/mpeg;base64,{base64.b64encode(open(file_path,"rb").read()).decode()}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
185
  st.markdown(dl_link, unsafe_allow_html=True)
186
 
 
187
  def load_files_for_sidebar():
188
  md_files = glob.glob("*.md")
189
  mp3_files = glob.glob("*.mp3")
@@ -213,6 +211,33 @@ def extract_keywords_from_md(files):
213
  text += " " + c
214
  return get_high_info_terms(text)
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  def display_file_manager_sidebar(groups, sorted_prefixes):
217
  st.sidebar.title("🎡 Audio & Docs Manager")
218
 
@@ -263,40 +288,143 @@ def display_file_manager_sidebar(groups, sorted_prefixes):
263
  ctime = datetime.fromtimestamp(os.path.getmtime(f)).strftime("%Y-%m-%d %H:%M:%S")
264
  st.write(f"**{fname}** - {ctime}")
265
 
266
- def create_zip_of_files(md_files, mp3_files):
267
- md_files = [f for f in md_files if os.path.basename(f).lower() != 'readme.md']
268
- all_files = md_files + mp3_files
269
- if not all_files:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  return None
271
 
272
- all_content = []
273
- for f in all_files:
274
- if f.endswith('.md'):
275
- with open(f,'r',encoding='utf-8') as file:
276
- all_content.append(file.read())
277
- elif f.endswith('.mp3'):
278
- all_content.append(os.path.basename(f))
279
-
280
- combined_content = " ".join(all_content)
281
- info_terms = get_high_info_terms(combined_content)
282
-
283
- timestamp = datetime.now().strftime("%y%m_%H%M")
284
- name_text = '_'.join(term.replace(' ', '-') for term in info_terms[:3])
285
- zip_name = f"{timestamp}_{name_text}.zip"
286
-
287
- with zipfile.ZipFile(zip_name,'w') as z:
288
- for f in all_files:
289
- z.write(f)
290
-
291
- return zip_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
- def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary=True, full_audio=False):
294
- """Perform Arxiv search (via your RAG pattern) and generate audio summaries."""
 
295
  start = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
297
- # The next lines call your RAG pipeline
298
- refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
299
- r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
300
 
301
  result = f"### πŸ”Ž {q}\n\n{r2}\n\n{refs}"
302
 
@@ -340,16 +468,22 @@ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary
340
  st.write("### πŸ”– Titles")
341
  play_and_download_audio(audio_file_titles)
342
 
343
- # show text last after playback interfaces. For the big one lets add a feature later that breaks into their own.
344
  st.markdown(result)
 
 
 
 
 
 
 
 
345
 
346
  elapsed = time.time()-start
347
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
348
 
349
- create_file(q, result, "md")
350
-
351
- return result
352
 
 
353
  def main():
354
  st.session_state['user_name'] = st.selectbox("Current User:", USER_NAMES, index=0)
355
 
@@ -398,15 +532,18 @@ def main():
398
  # Save user input
399
  create_file(st.session_state['user_name'], voice_text, "md")
400
 
401
- # Perform ArXiv search automatically
402
- with st.spinner("Searching ArXiv..."):
403
- # Always do vocal_summary = True, extended_refs=False, titles_summary=True, full_audio=False
404
- result = perform_ai_lookup(voice_text, vocal_summary=True, extended_refs=False, titles_summary=True, full_audio=False)
 
 
 
 
 
 
405
 
406
- # Update old_val
407
  st.session_state['old_val'] = voice_text
408
- # Clear the text by rerunning
409
- #st.rerun()
410
 
411
  st.write("Speak a query to run an ArXiv search and hear the results.")
412
 
@@ -453,11 +590,15 @@ def main():
453
 
454
  with tabs[2]:
455
  st.subheader("βš™οΈ Settings")
456
- st.write("Currently no additional settings.")
 
 
 
 
457
 
458
  if st.session_state.should_rerun:
459
  st.session_state.should_rerun = False
460
  st.rerun()
461
 
462
- if __name__=="__main__":
463
- main()
 
33
  )
34
  load_dotenv()
35
 
36
+ # -------------------- Constants --------------------
37
  USER_NAMES = [
38
  "Aria", "Guy", "Sonia", "Tony", "Jenny", "Davis", "Libby", "Clara", "Liam", "Natasha", "William"
39
  ]
 
46
 
47
  USER_VOICES = dict(zip(USER_NAMES, ENGLISH_VOICES))
48
 
49
# Maps a file extension (without the dot) to an emoji string.
# The emoji literals are restored from mis-decoded UTF-8 byte sequences.
FILE_EMOJIS = {
    "md": "📝",
    "mp3": "🎵",
}
53
+
54
+ # -------------------- Session State Initialization --------------------
55
  if 'user_name' not in st.session_state:
56
  st.session_state['user_name'] = USER_NAMES[0]
57
  if 'old_val' not in st.session_state:
 
60
  st.session_state['viewing_prefix'] = None
61
  if 'should_rerun' not in st.session_state:
62
  st.session_state['should_rerun'] = False
63
+ if 'use_streaming' not in st.session_state:
64
+ st.session_state['use_streaming'] = True
65
 
66
+ # -------------------- Helper Functions --------------------
 
 
 
 
67
  def get_high_info_terms(text: str) -> list:
 
68
  stop_words = set([
69
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
70
  'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
 
75
  'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there', 'as', 'if', 'while'
76
  ])
77
 
 
78
  key_phrases = [
79
  'artificial intelligence', 'machine learning', 'deep learning', 'neural networks',
80
  'natural language processing', 'healthcare systems', 'clinical medicine',
 
84
  'quantum mechanics', 'biomedical engineering', 'computational biology'
85
  ]
86
 
 
87
  preserved_phrases = []
88
  lower_text = text.lower()
89
  for phrase in key_phrases:
90
  if phrase in lower_text:
91
  preserved_phrases.append(phrase)
92
  text = text.replace(phrase, '')
93
+ break
94
 
 
95
  words = re.findall(r'\b\w+(?:-\w+)*\b', text)
96
  high_info_words = [
97
  word.lower() for word in words
 
101
  and any(c.isalpha() for c in word)
102
  ]
103
 
 
104
  unique_terms = []
105
  seen = set()
106
  for term in preserved_phrases + high_info_words:
 
108
  seen.add(term)
109
  unique_terms.append(term)
110
 
 
111
  return unique_terms[:5]
112
 
113
  def clean_text_for_filename(text: str) -> str:
 
119
  return '_'.join(filtered)[:200]
120
 
121
  def generate_filename(prompt, response, file_type="md"):
 
122
  central_tz = pytz.timezone('America/Chicago')
123
  central_time = datetime.now(central_tz)
124
+ prefix = central_time.strftime("%m-%d-%y_%I-%M-%p_")
 
 
125
 
126
  combined = (prompt + " " + response).strip()
127
  info_terms = get_high_info_terms(combined)
 
156
  text = re.sub(r"\s+", " ", text).strip()
157
  return text
158
 
159
+ # -------------------- Audio Functions --------------------
160
  async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
161
  text = clean_for_speech(text)
162
  if not text.strip():
 
181
  dl_link = f'<a href="data:audio/mpeg;base64,{base64.b64encode(open(file_path,"rb").read()).decode()}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
182
  st.markdown(dl_link, unsafe_allow_html=True)
183
 
184
+ # -------------------- File Management Functions --------------------
185
  def load_files_for_sidebar():
186
  md_files = glob.glob("*.md")
187
  mp3_files = glob.glob("*.mp3")
 
211
  text += " " + c
212
  return get_high_info_terms(text)
213
 
214
def create_zip_of_files(md_files, mp3_files):
    """
    Write the given markdown and MP3 files into a single zip archive.

    Any markdown file whose base name is ``readme.md`` (case-insensitive) is
    left out. The archive name is a ``%y%m_%H%M`` timestamp followed by up to
    three terms returned by ``get_high_info_terms`` for the combined markdown
    text and MP3 file names.

    Returns the archive file name, or None when there is nothing to archive.
    """
    bundle = [
        path for path in md_files
        if os.path.basename(path).lower() != 'readme.md'
    ] + mp3_files
    if not bundle:
        return None

    # Markdown files contribute their text; MP3 files contribute only their name.
    snippets = []
    for path in bundle:
        if path.endswith('.md'):
            with open(path, 'r', encoding='utf-8') as handle:
                snippets.append(handle.read())
        elif path.endswith('.mp3'):
            snippets.append(os.path.basename(path))

    info_terms = get_high_info_terms(" ".join(snippets))

    timestamp = datetime.now().strftime("%y%m_%H%M")
    slug_parts = []
    for term in info_terms[:3]:
        slug_parts.append(term.replace(' ', '-'))
    zip_name = f"{timestamp}_{'_'.join(slug_parts)}.zip"

    with zipfile.ZipFile(zip_name, 'w') as archive:
        for path in bundle:
            archive.write(path)

    return zip_name
240
+
241
  def display_file_manager_sidebar(groups, sorted_prefixes):
242
  st.sidebar.title("🎡 Audio & Docs Manager")
243
 
 
288
  ctime = datetime.fromtimestamp(os.path.getmtime(f)).strftime("%Y-%m-%d %H:%M:%S")
289
  st.write(f"**{fname}** - {ctime}")
290
 
291
+ # -------------------- xAI API Functions --------------------
292
# Settings shared by both xAI chat-completion helpers below.
_XAI_CHAT_URL = "https://api.x.ai/v1/chat/completions"
_XAI_MODEL = "grok-2-1212"
_XAI_SYSTEM_PROMPT = (
    "You are a helpful scientific research assistant. "
    "Analyze the following research query and provide initial insights."
)
_XAI_TIMEOUT_SECONDS = 30


def _build_xai_request(query: str, stream: bool):
    """
    Build the (headers, payload) pair for an xAI chat-completion request.

    Returns None when the ``xai`` environment variable is missing or empty,
    so callers can report the problem instead of sending ``Bearer None``.
    """
    api_key = os.environ.get('xai')
    if not api_key:
        return None

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "messages": [
            {"role": "system", "content": _XAI_SYSTEM_PROMPT},
            {"role": "user", "content": query},
        ],
        "model": _XAI_MODEL,
        "stream": stream,
        "temperature": 0.7,
    }
    return headers, payload


def _extract_stream_delta(json_str: str) -> str:
    """
    Return the text fragment carried by one streamed JSON payload.

    Returns an empty string when the payload is not valid JSON or does not
    have the ``choices[0].delta.content`` shape (for example an empty
    ``choices`` list), so one odd chunk cannot abort the whole stream.
    """
    try:
        chunk = json.loads(json_str)
        return chunk["choices"][0]["delta"].get("content") or ""
    except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError):
        return ""


def call_xai_api_batch(query: str) -> dict:
    """
    Call the xAI API in batch mode for complete responses.

    Args:
        query: The user's research query, sent as the user message.

    Returns:
        The decoded JSON response body, or None when the API key is missing,
        the request fails, or the body is not valid JSON. Failures are
        reported through ``st.error``.
    """
    request_parts = _build_xai_request(query, stream=False)
    if request_parts is None:
        st.error("Error in batch xAI API call: the 'xai' environment variable is not set")
        return None
    headers, payload = request_parts

    try:
        response = requests.post(
            _XAI_CHAT_URL,
            headers=headers,
            json=payload,
            timeout=_XAI_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # Response.json() does not raise a RequestException subclass.
        st.error(f"Error in batch xAI API call: {str(e)}")
        return None


def stream_xai_response(query: str, placeholder) -> str:
    """
    Stream the xAI API response and display it in real-time.

    Args:
        query: The user's research query, sent as the user message.
        placeholder: Object whose ``markdown(text)`` method is called with the
            accumulated text after every received fragment.

    Returns:
        The complete response text, or None when the API key is missing or
        the request fails. Failures are reported through ``st.error``.
    """
    request_parts = _build_xai_request(query, stream=True)
    if request_parts is None:
        st.error("Error in streaming xAI API call: the 'xai' environment variable is not set")
        return None
    headers, payload = request_parts

    try:
        full_response = ""
        # The context manager releases the connection even when the loop
        # exits early on '[DONE]' or an exception is raised mid-stream.
        with requests.post(
            _XAI_CHAT_URL,
            headers=headers,
            json=payload,
            stream=True,
            timeout=_XAI_TIMEOUT_SECONDS,
        ) as response:
            response.raise_for_status()

            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode('utf-8')
                if not line.startswith('data: '):
                    continue
                json_str = line[6:]  # Remove 'data: ' prefix
                if json_str == '[DONE]':
                    break
                content = _extract_stream_delta(json_str)
                if content:
                    full_response += content
                    # Show the accumulated text followed by a block cursor
                    # (U+258C), written as an escape to survive re-encoding.
                    placeholder.markdown(full_response + "\u258c")

        # Final update without the cursor
        placeholder.markdown(full_response)
        return full_response

    except requests.exceptions.RequestException as e:
        st.error(f"Error in streaming xAI API call: {str(e)}")
        return None
392
 
393
+ # -------------------- Main AI Lookup Function --------------------
394
+ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary=True, full_audio=False, use_streaming=True):
395
+ """Perform Arxiv search with initial xAI insights."""
396
  start = time.time()
397
+
398
+ # First, get xAI insights
399
+ st.write("### πŸ€– Initial AI Insights")
400
+ initial_insights = None
401
+
402
+ if use_streaming:
403
+ # Create a placeholder for streaming text
404
+ streaming_placeholder = st.empty()
405
+ with st.spinner("Getting streaming AI insights..."):
406
+ initial_insights = stream_xai_response(q, streaming_placeholder)
407
+ else:
408
+ with st.spinner("Getting batch AI insights..."):
409
+ xai_response = call_xai_api_batch(q)
410
+ if xai_response and 'choices' in xai_response:
411
+ initial_insights = xai_response['choices'][0]['message']['content']
412
+ st.markdown(initial_insights)
413
+
414
+ # Generate audio for xAI insights if enabled
415
+ if vocal_summary and initial_insights:
416
+ insights_text = clean_for_speech(initial_insights)
417
+ if insights_text.strip():
418
+ audio_file_insights = speak_with_edge_tts(insights_text)
419
+ if audio_file_insights:
420
+ st.write("### 🎀 AI Insights Audio")
421
+ play_and_download_audio(audio_file_insights)
422
+
423
+ # Proceed with existing ArXiv search
424
+ st.write("### πŸ“š ArXiv Results")
425
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
426
+ refs = client.predict(q, 20, "Semantic Search", "mistralai/Mixtral-8x7B-Instruct-v0.1", api_name="/update_with_rag_md")[0]
427
+ r2 = client.predict(q, "mistralai/Mixtral-8x7B-Instruct-v0.1", True, api_name="/ask_llm")
 
428
 
429
  result = f"### πŸ”Ž {q}\n\n{r2}\n\n{refs}"
430
 
 
468
  st.write("### πŸ”– Titles")
469
  play_and_download_audio(audio_file_titles)
470
 
 
471
  st.markdown(result)
472
+
473
+ # Save complete results including xAI insights
474
+ if initial_insights:
475
+ full_result = f"### πŸ€– Initial AI Insights\n\n{initial_insights}\n\n{result}"
476
+ else:
477
+ full_result = result
478
+
479
+ create_file(q, full_result, "md")
480
 
481
  elapsed = time.time()-start
482
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
483
 
484
+ return full_result
 
 
485
 
486
+ # -------------------- Main Application --------------------
487
  def main():
488
  st.session_state['user_name'] = st.selectbox("Current User:", USER_NAMES, index=0)
489
 
 
532
  # Save user input
533
  create_file(st.session_state['user_name'], voice_text, "md")
534
 
535
+ # Perform AI lookup with current streaming setting
536
+ with st.spinner("Processing..."):
537
+ result = perform_ai_lookup(
538
+ voice_text,
539
+ vocal_summary=True,
540
+ extended_refs=False,
541
+ titles_summary=True,
542
+ full_audio=False,
543
+ use_streaming=st.session_state['use_streaming']
544
+ )
545
 
 
546
  st.session_state['old_val'] = voice_text
 
 
547
 
548
  st.write("Speak a query to run an ArXiv search and hear the results.")
549
 
 
590
 
591
  with tabs[2]:
592
  st.subheader("βš™οΈ Settings")
593
+ st.session_state['use_streaming'] = st.toggle(
594
+ "Use streaming responses",
595
+ value=st.session_state['use_streaming'],
596
+ help="Enable to see AI responses as they are generated in real-time"
597
+ )
598
 
599
  if st.session_state.should_rerun:
600
  st.session_state.should_rerun = False
601
  st.rerun()
602
 
603
+ if __name__ == "__main__":
604
+ main()