Update app.py
app.py (CHANGED)
@@ -6,6 +6,9 @@ import cv2
 import string
 import numpy as np
 import tensorflow as tf
+import edge_tts
+import asyncio
+import tempfile
 
 # Load models
 whisper_model = whisper.load_model("base")
@@ -19,6 +22,13 @@ def load_sign_language_model():
 
 sign_language_model = load_sign_language_model()
 
+# Get all available voices
+
+
+async def get_voices():
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+
 # Audio-based functions
 
 
@@ -58,9 +68,31 @@ def search_text(text, api_key):
         return {"error": str(e)}
 
 
-def inference_audio(audio, sentiment_option, api_key):
+async def text_to_speech(text, voice, rate, pitch):
+    if not text.strip():
+        return None, gr.Warning("Please enter text to convert.")
+    if not voice:
+        return None, gr.Warning("Please select a voice.")
+
+    voice_short_name = voice.split(" - ")[0]
+    rate_str = f"{rate:+d}%"
+    pitch_str = f"{pitch:+d}Hz"
+    communicate = edge_tts.Communicate(
+        text, voice_short_name, rate=rate_str, pitch=pitch_str)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+        await communicate.save(tmp_path)
+    return tmp_path, None
+
+
+async def tts_interface(text, voice, rate, pitch):
+    audio, warning = await text_to_speech(text, voice, rate, pitch)
+    return audio, warning
+
+
+def inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch):
     if audio is None:
-        return "No audio file provided.", "", "", ""
+        return "No audio file provided.", "", "", "", None
 
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
@@ -78,7 +110,11 @@ def inference_audio(audio, sentiment_option, api_key):
 
     search_results = search_text(result.text, api_key)
 
-    return lang.upper(), result.text, sentiment_output, search_results
+    # Generate audio for explanation
+    explanation_audio, _ = asyncio.run(tts_interface(
+        search_results, tts_voice, tts_rate, tts_pitch))
+
+    return lang.upper(), result.text, sentiment_output, search_results, explanation_audio
 
 # Image-based functions
 
@@ -100,6 +136,12 @@ def get_explanation(letter, api_key):
         response_data = response.json()
         explanation = response_data.get("contents", [{}])[0].get("parts", [{}])[
             0].get("text", "No explanation available.")
+        # Remove unnecessary symbols and formatting
+        explanation = explanation.replace(
+            "*", "").replace("#", "").replace("$", "").replace("\n", " ").strip()
+        # Remove additional special characters, if needed
+        explanation = explanation.translate(
+            str.maketrans('', '', string.punctuation))
         return explanation
     except requests.RequestException as e:
         return f"Error fetching explanation: {e}"
@@ -125,17 +167,21 @@ def classify_sign_language(image, api_key):
 # Gradio interface
 
 
-def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None):
+def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None, tts_voice=None, tts_rate=0, tts_pitch=0):
     if input_type == "Audio":
-        return inference_audio(audio, sentiment_option, api_key)
+        return inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch)
     elif input_type == "Image":
         pred, explanation = classify_sign_language(image, api_key)
-        return "N/A", pred, "N/A", explanation
+        explanation_audio, _ = asyncio.run(tts_interface(
+            explanation, tts_voice, tts_rate, tts_pitch))
+        return "N/A", pred, "N/A", explanation, explanation_audio
+
 
+async def main():
+    voices = await get_voices()
 
-def main():
     with gr.Blocks() as demo:
-        gr.Markdown("#
+        gr.Markdown("# Speak & Sign AI Assistant")
 
         # Layout: Split user input and bot response sides
         with gr.Row():
@@ -160,6 +206,14 @@ def main():
                 image_input = gr.Image(
                     label="Upload Image", type="pil", visible=False)
 
+                # TTS settings for explanation
+                tts_voice = gr.Dropdown(label="Select Voice", choices=[
+                ] + list(voices.keys()), value="")
+                tts_rate = gr.Slider(
+                    minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
+                tts_pitch = gr.Slider(
+                    minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
+
         # Change input visibility based on selection
         def update_visibility(input_type):
             if input_type == "Audio":
@@ -184,13 +238,19 @@ def main():
                 sentiment_output = gr.Textbox(
                     label="Sentiment Analysis Results", interactive=False)
                 search_results = gr.Textbox(
-                    label="Explanation or
-
-
-
+                    label="Explanation or Search Results", interactive=False)
+                audio_output = gr.Audio(
+                    label="Generated Explanation Audio", type="filepath", interactive=False)
+
+        # Submit button action
+        submit_btn.click(
+            process_input,
+            inputs=[input_type, audio_input, image_input, sentiment_option,
+                    api_key_input, tts_voice, tts_rate, tts_pitch],
+            outputs=[lang_str, text, sentiment_output,
+                     search_results, audio_output]
+        )
 
     demo.launch(share=True)
 
-
-if __name__ == "__main__":
-    main()
+asyncio.run(main())
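A quick way to sanity-check the new edge-tts path outside of the Gradio app is the minimal sketch below. It assumes edge-tts is installed (pip install edge-tts); "en-US-AriaNeural" and the smoke_test name are only examples and are not part of this commit.

# Standalone smoke test for the TTS helpers added in this commit (not part of app.py).
# Assumes `pip install edge-tts`; "en-US-AriaNeural" is only an example voice.
import asyncio
import tempfile

import edge_tts


async def smoke_test():
    # Same call that get_voices() wraps: a list of dicts with ShortName/Locale/Gender keys.
    voices = await edge_tts.list_voices()
    print(f"{len(voices)} voices available, e.g. {voices[0]['ShortName']}")

    # Mirrors text_to_speech(): rate and pitch are passed as signed strings.
    communicate = edge_tts.Communicate(
        "Hello from the Speak and Sign AI Assistant.",
        "en-US-AriaNeural",  # example voice; any ShortName from the list works
        rate="+0%",
        pitch="+0Hz",
    )
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        await communicate.save(tmp_file.name)
        print("Sample audio written to", tmp_file.name)


asyncio.run(smoke_test())

If this prints a voice list and writes an .mp3, the Dropdown population in main() and the explanation-audio path in inference_audio()/process_input() should behave the same way inside the Space.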