newupdate
app.py CHANGED
@@ -13,61 +13,54 @@ import tempfile
 # Load models
 whisper_model = whisper.load_model("base")
 sentiment_analysis = pipeline(
+    "sentiment-analysis", framework="pt", model="SamLowe/roberta-base-go_emotions"
+)

 def load_sign_language_model():
+    return tf.keras.models.load_model("best_model.h5")

 sign_language_model = load_sign_language_model()

+# Get available voices asynchronously
 async def get_voices():
     voices = await edge_tts.list_voices()
+    return {
+        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v["ShortName"]
+        for v in voices
+    }

 # Audio-based functions
 def analyze_sentiment(text):
     results = sentiment_analysis(text)
+    sentiment_results = {result["label"]: result["score"] for result in results}
     return sentiment_results

 def display_sentiment_results(sentiment_results, option):
     sentiment_text = ""
     for sentiment, score in sentiment_results.items():
         if option == "Sentiment Only":
             sentiment_text += f"{sentiment}\n"
         elif option == "Sentiment + Score":
-            sentiment_text += f"{sentiment}: {score}\n"
+            sentiment_text += f"{sentiment}: {score:.2f}\n"
     return sentiment_text

 def search_text(text, api_key):
     api_endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
     headers = {"Content-Type": "application/json"}
     payload = {"contents": [{"parts": [{"text": text}]}]}

     try:
+        response = requests.post(api_endpoint, headers=headers, json=payload, params={"key": api_key})
         response.raise_for_status()
         response_json = response.json()
+        if "candidates" in response_json and response_json["candidates"]:
+            content_parts = response_json["candidates"][0]["content"]["parts"]
+            if content_parts:
+                return content_parts[0]["text"].strip()
         return "No relevant content found."
     except requests.exceptions.RequestException as e:
         return {"error": str(e)}

 async def text_to_speech(text, voice, rate, pitch):
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
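Note on the sentiment helpers above: a Transformers text-classification pipeline returns a list of label/score dicts, which the rewritten analyze_sentiment flattens into a plain mapping. A minimal sketch of the assumed shapes (values are illustrative, not taken from this Space):

    # Illustrative only: typical pipeline output for the go_emotions model.
    results = sentiment_analysis("I love this!")   # e.g. [{"label": "love", "score": 0.97}]
    sentiment_results = {r["label"]: r["score"] for r in results}
    # display_sentiment_results(sentiment_results, "Sentiment + Score") -> "love: 0.97\n"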
@@ -77,20 +70,18 @@ async def text_to_speech(text, voice, rate, pitch):
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
+    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         tmp_path = tmp_file.name
         await communicate.save(tmp_path)
+
     return tmp_path, None

 async def tts_interface(text, voice, rate, pitch):
-    return audio, warning
+    return await text_to_speech(text, voice, rate, pitch)

-def inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch):
+async def inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch):
     if audio is None:
         return "No audio file provided.", "", "", "", None
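Since tts_interface is now a direct async pass-through to text_to_speech, it can be exercised outside Gradio. A small usage sketch; the voice string is illustrative and must match a key produced by get_voices():

    # Illustrative only: drive the TTS helper from a plain script.
    audio_path, warning = asyncio.run(
        tts_interface("Hello from the Space", "en-US-JennyNeural - en-US (Female)", 0, 0)
    )
    print(audio_path)  # path to the temporary .mp3 written by edge_tts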
@@ -105,49 +96,15 @@ def inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch):
     result = whisper.decode(whisper_model, mel, options)

     sentiment_results = analyze_sentiment(result.text)
+    sentiment_output = display_sentiment_results(sentiment_results, sentiment_option)

     search_results = search_text(result.text, api_key)

-    explanation_audio, _ = asyncio.run(tts_interface(
-        search_results, tts_voice, tts_rate, tts_pitch))
+    explanation_audio, _ = await tts_interface(search_results, tts_voice, tts_rate, tts_pitch)

     return lang.upper(), result.text, sentiment_output, search_results, explanation_audio

-def get_explanation(letter, api_key):
-    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
-    headers = {"Content-Type": "application/json"}
-    data = {
-        "contents": [
-            {"parts": [{"text": f"Explain how the American Sign Language letter '{letter}' is shown, its significance, and why it is represented this way."}]}
-        ]
-    }
-    params = {"key": api_key}
-
-    try:
-        response = requests.post(url, headers=headers,
-                                 json=data, params=params)
-        response.raise_for_status()
-        response_data = response.json()
-        explanation = response_data.get("contents", [{}])[0].get("parts", [{}])[
-            0].get("text", "No explanation available.")
-        # Remove unnecessary symbols and formatting
-        explanation = explanation.replace(
-            "*", "").replace("#", "").replace("$", "").replace("\n", " ").strip()
-        # Remove additional special characters, if needed
-        explanation = explanation.translate(
-            str.maketrans('', '', string.punctuation))
-        return explanation
-    except requests.RequestException as e:
-        return f"Error fetching explanation: {e}"

-def classify_sign_language(image, api_key):
+async def classify_sign_language(image, api_key):
     img = np.array(image)
     gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
     gray_img = cv2.resize(gray_img, (28, 28))
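The rewritten search_text walks the Gemini generateContent reply by key. Roughly, it assumes a response of this shape (abbreviated and illustrative, not a real reply):

    # Illustrative shape only; real responses carry additional fields.
    response_json = {
        "candidates": [
            {"content": {"role": "model", "parts": [{"text": "...generated explanation..."}]}}
        ]
    }
    # response_json["candidates"][0]["content"]["parts"][0]["text"] -> generated text

Anything that does not match this shape falls through to "No relevant content found."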
@@ -160,22 +117,16 @@ def classify_sign_language(image, api_key):
     output = output + 1 if output > 7 else output
     pred = uppercase_alphabet[output]

-    return pred, explanation
+    explanation = search_text(f"Explain the American Sign Language letter '{pred}'.", api_key)
+    explanation_audio, _ = await tts_interface(explanation, None, 0, 0)
+
+    return pred, explanation, explanation_audio

-# Gradio interface

-def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None, tts_voice=None, tts_rate=0, tts_pitch=0):
+async def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None, tts_voice=None, tts_rate=0, tts_pitch=0):
     if input_type == "Audio":
-        return inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch)
+        return await inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch)
     elif input_type == "Image":
-        explanation_audio, _ = asyncio.run(tts_interface(
-            explanation, tts_voice, tts_rate, tts_pitch))
-        return "N/A", pred, "N/A", explanation, explanation_audio
+        return await classify_sign_language(image, api_key)

 async def main():
     voices = await get_voices()
@@ -183,74 +134,34 @@ async def main():
     with gr.Blocks() as demo:
         gr.Markdown("# Speak & Sign AI Assistant")

-        # Layout: Split user input and bot response sides
         with gr.Row():
-            # User Input Side
             with gr.Column():
                 gr.Markdown("### User Input")
-                # Audio input
-                audio_input = gr.Audio(label="Upload or Record Audio", type="filepath", visible=True)
-                sentiment_option = gr.Radio(choices=["Sentiment Only", "Sentiment + Score"], label="Sentiment Output", value="Sentiment Only", visible=True)
-                # Image input
-                image_input = gr.Image(label="Upload Image", type="pil", visible=False)
-                # TTS settings for explanation
-                tts_voice = gr.Dropdown(label="Select Voice", choices=[] + list(voices.keys()), value="")
-                tts_rate = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
-                tts_pitch = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
-                # Change input visibility based on selection
-                def update_visibility(input_type):
-                    if input_type == "Audio":
-                        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
-                    else:
-                        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
+                input_type = gr.Radio(label="Choose Input Type", choices=["Audio", "Image"], value="Audio")
+                api_key_input = gr.Textbox(label="API Key", placeholder="Your API key here", type="password")
+                audio_input = gr.Audio(label="Upload or Record Audio", type="filepath")
+                sentiment_option = gr.Radio(choices=["Sentiment Only", "Sentiment + Score"], label="Sentiment Output", value="Sentiment Only")
+                image_input = gr.Image(label="Upload Image", type="pil", visible=False)
+                tts_voice = gr.Dropdown(label="Select Voice", choices=[""] + list(voices.keys()), value="")
+                tts_rate = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
+                tts_pitch = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
+
+                def update_visibility(input_type):
+                    return gr.update(visible=input_type == "Audio"), gr.update(visible=input_type == "Image")
+
+                input_type.change(update_visibility, inputs=[input_type], outputs=[audio_input, image_input])
                 submit_btn = gr.Button("Submit")

-            # Bot Response Side
             with gr.Column():
                 gr.Markdown("### Bot Response")
-                text = gr.Textbox(label="Transcription or Prediction", interactive=False)
-                sentiment_output = gr.Textbox(label="Sentiment Analysis Results", interactive=False)
-                search_results = gr.Textbox(label="Explanation or Search Results", interactive=False)
-                audio_output = gr.Audio(label="Generated Explanation Audio", type="filepath", interactive=False)
-                # Submit button action
-                submit_btn.click(
-                    process_input,
-                    inputs=[input_type, audio_input, image_input, sentiment_option,
-                            api_key_input, tts_voice, tts_rate, tts_pitch],
-                    outputs=[lang_str, text, sentiment_output,
-                             search_results, audio_output]
-                )
+                lang_str = gr.Textbox(label="Detected Language", interactive=False)
+                text = gr.Textbox(label="Transcription or Prediction", interactive=False)
+                sentiment_output = gr.Textbox(label="Sentiment Analysis Results", interactive=False)
+                search_results = gr.Textbox(label="Explanation", interactive=False)
+                audio_output = gr.Audio(label="Generated Explanation Audio", type="filepath", interactive=False)
+
+                submit_btn.click(process_input, inputs=[input_type, audio_input, image_input, sentiment_option, api_key_input, tts_voice, tts_rate, tts_pitch], outputs=[lang_str, text, sentiment_output, search_results, audio_output])

     demo.launch(share=True)

+asyncio.create_task(main())