jobsm committed on
Commit 1d38dc5 · verified · 1 Parent(s): feebf81

Update app.py

Files changed (1)
  1. app.py +75 -15
app.py CHANGED
@@ -6,6 +6,9 @@ import cv2
 import string
 import numpy as np
 import tensorflow as tf
+import edge_tts
+import asyncio
+import tempfile
 
 # Load models
 whisper_model = whisper.load_model("base")
@@ -19,6 +22,13 @@ def load_sign_language_model():
 
 sign_language_model = load_sign_language_model()
 
+# Get all available voices
+
+
+async def get_voices():
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+
 # Audio-based functions
 
 
@@ -58,9 +68,31 @@ def search_text(text, api_key):
         return {"error": str(e)}
 
 
-def inference_audio(audio, sentiment_option, api_key):
+async def text_to_speech(text, voice, rate, pitch):
+    if not text.strip():
+        return None, gr.Warning("Please enter text to convert.")
+    if not voice:
+        return None, gr.Warning("Please select a voice.")
+
+    voice_short_name = voice.split(" - ")[0]
+    rate_str = f"{rate:+d}%"
+    pitch_str = f"{pitch:+d}Hz"
+    communicate = edge_tts.Communicate(
+        text, voice_short_name, rate=rate_str, pitch=pitch_str)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+    await communicate.save(tmp_path)
+    return tmp_path, None
+
+
+async def tts_interface(text, voice, rate, pitch):
+    audio, warning = await text_to_speech(text, voice, rate, pitch)
+    return audio, warning
+
+
+def inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch):
     if audio is None:
-        return "No audio file provided.", "", "", ""
+        return "No audio file provided.", "", "", "", None
 
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
@@ -78,7 +110,11 @@ def inference_audio(audio, sentiment_option, api_key):
 
     search_results = search_text(result.text, api_key)
 
-    return lang.upper(), result.text, sentiment_output, search_results
+    # Generate audio for explanation
+    explanation_audio, _ = asyncio.run(tts_interface(
+        search_results, tts_voice, tts_rate, tts_pitch))
+
+    return lang.upper(), result.text, sentiment_output, search_results, explanation_audio
 
 # Image-based functions
 
@@ -100,6 +136,12 @@ def get_explanation(letter, api_key):
         response_data = response.json()
         explanation = response_data.get("contents", [{}])[0].get("parts", [{}])[
             0].get("text", "No explanation available.")
+        # Remove unnecessary symbols and formatting
+        explanation = explanation.replace(
+            "*", "").replace("#", "").replace("$", "").replace("\n", " ").strip()
+        # Remove additional special characters, if needed
+        explanation = explanation.translate(
+            str.maketrans('', '', string.punctuation))
         return explanation
     except requests.RequestException as e:
         return f"Error fetching explanation: {e}"
@@ -125,17 +167,21 @@ def classify_sign_language(image, api_key):
 # Gradio interface
 
 
-def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None):
+def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None, tts_voice=None, tts_rate=0, tts_pitch=0):
     if input_type == "Audio":
-        return inference_audio(audio, sentiment_option, api_key)
+        return inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch)
     elif input_type == "Image":
         pred, explanation = classify_sign_language(image, api_key)
-        return "N/A", pred, "N/A", explanation
+        explanation_audio, _ = asyncio.run(tts_interface(
+            explanation, tts_voice, tts_rate, tts_pitch))
+        return "N/A", pred, "N/A", explanation, explanation_audio
 
 
-def main():
+async def main():
+    voices = await get_voices()
+
     with gr.Blocks() as demo:
-        gr.Markdown("# 🤖 Speak & Sign AI Assistant")
+        gr.Markdown("# Speak & Sign AI Assistant")
 
         # Layout: Split user input and bot response sides
         with gr.Row():
@@ -160,6 +206,14 @@ def main():
                 image_input = gr.Image(
                     label="Upload Image", type="pil", visible=False)
 
+                # TTS settings for explanation
+                tts_voice = gr.Dropdown(label="Select Voice", choices=[
+                ] + list(voices.keys()), value="")
+                tts_rate = gr.Slider(
+                    minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
+                tts_pitch = gr.Slider(
+                    minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
+
                 # Change input visibility based on selection
                 def update_visibility(input_type):
                     if input_type == "Audio":
@@ -184,13 +238,19 @@ def main():
                 sentiment_output = gr.Textbox(
                     label="Sentiment Analysis Results", interactive=False)
                 search_results = gr.Textbox(
-                    label="Explanation or API Search Results", interactive=False)
-
-        submit_btn.click(process_input, inputs=[input_type, audio_input, image_input, sentiment_option, api_key_input], outputs=[
-                         lang_str, text, sentiment_output, search_results])
+                    label="Explanation or Search Results", interactive=False)
+                audio_output = gr.Audio(
+                    label="Generated Explanation Audio", type="filepath", interactive=False)
+
+        # Submit button action
+        submit_btn.click(
+            process_input,
+            inputs=[input_type, audio_input, image_input, sentiment_option,
+                    api_key_input, tts_voice, tts_rate, tts_pitch],
+            outputs=[lang_str, text, sentiment_output,
+                     search_results, audio_output]
+        )
 
     demo.launch(share=True)
 
-
-if __name__ == "__main__":
-    main()
+asyncio.run(main())
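
The edge-tts helpers introduced above can be exercised outside the Gradio app. Below is a minimal standalone sketch (not part of this commit) that mirrors what the new text_to_speech() helper does, assuming the edge-tts package is installed and network access is available; it only uses calls already present in the diff (edge_tts.list_voices(), edge_tts.Communicate(...), communicate.save(...)), and the sample text is arbitrary.

# Standalone sketch: mirrors text_to_speech() so voice selection and the
# rate/pitch formatting can be checked in isolation (assumption: edge-tts installed).
import asyncio
import tempfile

import edge_tts


async def demo_tts():
    # Same voice listing that get_voices() uses in app.py
    voices = await edge_tts.list_voices()
    voice_short_name = voices[0]["ShortName"]

    rate, pitch = 0, 0  # the UI sliders supply integers; formatted as "+0%" / "+0Hz"
    communicate = edge_tts.Communicate(
        "Hello from the Speak & Sign AI Assistant.",  # arbitrary sample text
        voice_short_name,
        rate=f"{rate:+d}%",
        pitch=f"{pitch:+d}Hz")

    # Save to a temporary .mp3, as text_to_speech() does for the Gradio audio output
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    print("Saved explanation audio to", tmp_path)


asyncio.run(demo_tts())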