Update app.py
app.py (CHANGED)
@@ -6,6 +6,9 @@ import cv2
 import string
 import numpy as np
 import tensorflow as tf
+import edge_tts
+import asyncio
+import tempfile
 
 # Load models
 whisper_model = whisper.load_model("base")
@@ -19,6 +22,13 @@ def load_sign_language_model():
 
 sign_language_model = load_sign_language_model()
 
+# Get all available voices
+
+
+async def get_voices():
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+
 # Audio-based functions
 
 
@@ -58,9 +68,31 @@ def search_text(text, api_key):
         return {"error": str(e)}
 
 
-def inference_audio(audio, sentiment_option, api_key):
+async def text_to_speech(text, voice, rate, pitch):
+    if not text.strip():
+        return None, gr.Warning("Please enter text to convert.")
+    if not voice:
+        return None, gr.Warning("Please select a voice.")
+
+    voice_short_name = voice.split(" - ")[0]
+    rate_str = f"{rate:+d}%"
+    pitch_str = f"{pitch:+d}Hz"
+    communicate = edge_tts.Communicate(
+        text, voice_short_name, rate=rate_str, pitch=pitch_str)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+        await communicate.save(tmp_path)
+    return tmp_path, None
+
+
+async def tts_interface(text, voice, rate, pitch):
+    audio, warning = await text_to_speech(text, voice, rate, pitch)
+    return audio, warning
+
+
+def inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch):
     if audio is None:
-        return "No audio file provided.", "", "", ""
+        return "No audio file provided.", "", "", "", None
 
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
@@ -78,7 +110,11 @@ def inference_audio(audio, sentiment_option, api_key):
 
     search_results = search_text(result.text, api_key)
 
-    return lang.upper(), result.text, sentiment_output, search_results
+    # Generate audio for explanation
+    explanation_audio, _ = asyncio.run(tts_interface(
+        search_results, tts_voice, tts_rate, tts_pitch))
+
+    return lang.upper(), result.text, sentiment_output, search_results, explanation_audio
 
 # Image-based functions
 
@@ -100,6 +136,12 @@ def get_explanation(letter, api_key):
         response_data = response.json()
         explanation = response_data.get("contents", [{}])[0].get("parts", [{}])[
             0].get("text", "No explanation available.")
+        # Remove unnecessary symbols and formatting
+        explanation = explanation.replace(
+            "*", "").replace("#", "").replace("$", "").replace("\n", " ").strip()
+        # Remove additional special characters, if needed
+        explanation = explanation.translate(
+            str.maketrans('', '', string.punctuation))
         return explanation
     except requests.RequestException as e:
         return f"Error fetching explanation: {e}"
@@ -125,17 +167,21 @@ def classify_sign_language(image, api_key):
 # Gradio interface
 
 
-def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None):
+def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None, tts_voice=None, tts_rate=0, tts_pitch=0):
     if input_type == "Audio":
-        return inference_audio(audio, sentiment_option, api_key)
+        return inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch)
     elif input_type == "Image":
         pred, explanation = classify_sign_language(image, api_key)
-        return "N/A", pred, "N/A", explanation
+        explanation_audio, _ = asyncio.run(tts_interface(
+            explanation, tts_voice, tts_rate, tts_pitch))
+        return "N/A", pred, "N/A", explanation, explanation_audio
+
 
+async def main():
+    voices = await get_voices()
 
-def main():
     with gr.Blocks() as demo:
-        gr.Markdown("#
+        gr.Markdown("# Speak & Sign AI Assistant")
 
         # Layout: Split user input and bot response sides
         with gr.Row():
@@ -160,6 +206,14 @@ def main():
                 image_input = gr.Image(
                     label="Upload Image", type="pil", visible=False)
 
+                # TTS settings for explanation
+                tts_voice = gr.Dropdown(label="Select Voice", choices=[
+                ] + list(voices.keys()), value="")
+                tts_rate = gr.Slider(
+                    minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
+                tts_pitch = gr.Slider(
+                    minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
+
         # Change input visibility based on selection
         def update_visibility(input_type):
             if input_type == "Audio":
@@ -184,13 +238,19 @@ def main():
                 sentiment_output = gr.Textbox(
                     label="Sentiment Analysis Results", interactive=False)
                 search_results = gr.Textbox(
-                    label="Explanation or
-
-
-
+                    label="Explanation or Search Results", interactive=False)
+                audio_output = gr.Audio(
+                    label="Generated Explanation Audio", type="filepath", interactive=False)
+
+        # Submit button action
+        submit_btn.click(
+            process_input,
+            inputs=[input_type, audio_input, image_input, sentiment_option,
+                    api_key_input, tts_voice, tts_rate, tts_pitch],
+            outputs=[lang_str, text, sentiment_output,
+                     search_results, audio_output]
+        )
 
     demo.launch(share=True)
 
-
-if __name__ == "__main__":
-    main()
+asyncio.run(main())
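A quick way to sanity-check the new edge-tts path outside of the Gradio app is the minimal sketch below. It assumes edge-tts is installed (pip install edge-tts); "en-US-AriaNeural" and the smoke_test name are only examples and are not part of this commit.

# Standalone smoke test for the TTS helpers added in this commit (not part of app.py).
# Assumes `pip install edge-tts`; "en-US-AriaNeural" is only an example voice.
import asyncio
import tempfile

import edge_tts


async def smoke_test():
    # Same call that get_voices() wraps: a list of dicts with ShortName/Locale/Gender keys.
    voices = await edge_tts.list_voices()
    print(f"{len(voices)} voices available, e.g. {voices[0]['ShortName']}")

    # Mirrors text_to_speech(): rate and pitch are passed as signed strings.
    communicate = edge_tts.Communicate(
        "Hello from the Speak and Sign AI Assistant.",
        "en-US-AriaNeural",  # example voice; any ShortName from the list works
        rate="+0%",
        pitch="+0Hz",
    )
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        await communicate.save(tmp_file.name)
        print("Sample audio written to", tmp_file.name)


asyncio.run(smoke_test())

If this prints a voice list and writes an .mp3, the Dropdown population in main() and the explanation-audio path in inference_audio()/process_input() should behave the same way inside the Space.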