import gradio as gr
import whisper
from transformers import pipeline
import requests
import cv2
import string
import numpy as np
import tensorflow as tf
import edge_tts
import asyncio
import tempfile

# Load models
whisper_model = whisper.load_model("base")
sentiment_analysis = pipeline(
    "sentiment-analysis",
    framework="pt",
    model="SamLowe/roberta-base-go_emotions")


def load_sign_language_model():
    return tf.keras.models.load_model('best_model.h5')


sign_language_model = load_sign_language_model()


# Get all available voices
async def get_voices():
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}


# Audio-based functions
def analyze_sentiment(text):
    results = sentiment_analysis(text)
    sentiment_results = {result['label']: result['score'] for result in results}
    return sentiment_results


def display_sentiment_results(sentiment_results, option):
    sentiment_text = ""
    for sentiment, score in sentiment_results.items():
        if option == "Sentiment Only":
            sentiment_text += f"{sentiment}\n"
        elif option == "Sentiment + Score":
            sentiment_text += f"{sentiment}: {score}\n"
    return sentiment_text


def search_text(text, api_key):
    api_endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
    headers = {"Content-Type": "application/json"}
    payload = {"contents": [{"parts": [{"text": text}]}]}
    try:
        response = requests.post(
            api_endpoint, headers=headers, json=payload, params={"key": api_key})
        response.raise_for_status()
        response_json = response.json()
        if 'candidates' in response_json and len(response_json['candidates']) > 0:
            content_parts = response_json['candidates'][0]['content']['parts']
            if len(content_parts) > 0:
                return content_parts[0]['text'].strip()
        return "No relevant content found."
    except requests.exceptions.RequestException as e:
        # Return a plain string so the error renders cleanly in the output textbox
        return f"Error: {e}"


async def text_to_speech(text, voice, rate, pitch):
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
    voice_short_name = voice.split(" - ")[0]
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    return tmp_path, None


async def tts_interface(text, voice, rate, pitch):
    audio, warning = await text_to_speech(text, voice, rate, pitch)
    return audio, warning


def inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch):
    if audio is None:
        return "No audio file provided.", "", "", "", None

    # Transcribe and detect language with Whisper
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)

    sentiment_results = analyze_sentiment(result.text)
    sentiment_output = display_sentiment_results(
        sentiment_results, sentiment_option)
    search_results = search_text(result.text, api_key)

    # Generate audio for the explanation
    explanation_audio, _ = asyncio.run(tts_interface(
        search_results, tts_voice, tts_rate, tts_pitch))

    return lang.upper(), result.text, sentiment_output, search_results, explanation_audio


# Image-based functions
def get_explanation(letter, api_key):
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [
            {"parts": [{"text": f"Explain how the American Sign Language letter '{letter}' is shown, its significance, and why it is represented this way."}]}
        ]
    }
    params = {"key": api_key}
    try:
        response = requests.post(url, headers=headers, json=data, params=params)
        response.raise_for_status()
        response_data = response.json()
        # The generated text lives under candidates -> content -> parts,
        # the same response shape parsed in search_text above
        candidates = response_data.get("candidates", [])
        if candidates:
            explanation = candidates[0].get("content", {}).get(
                "parts", [{}])[0].get("text", "No explanation available.")
        else:
            explanation = "No explanation available."
        # Remove unnecessary symbols and formatting
        explanation = explanation.replace(
            "*", "").replace("#", "").replace("$", "").replace("\n", " ").strip()
        # Remove additional special characters, if needed
        explanation = explanation.translate(
            str.maketrans('', '', string.punctuation))
        return explanation
    except requests.RequestException as e:
        return f"Error fetching explanation: {e}"


def classify_sign_language(image, api_key):
    # Gradio supplies a PIL image in RGB order
    img = np.array(image)
    gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    gray_img = cv2.resize(gray_img, (28, 28))
    normalized_img = gray_img / 255.0
    input_img = np.expand_dims(normalized_img, axis=0)

    output = sign_language_model.predict(input_img)
    output = np.argmax(output, axis=1).item()

    # Map the class index onto alphabet positions; indices past 7 are shifted
    # by one because the label set skips a letter
    uppercase_alphabet = string.ascii_uppercase
    output = output + 1 if output > 7 else output
    pred = uppercase_alphabet[output]

    explanation = get_explanation(pred, api_key)
    return pred, explanation


# Gradio interface
def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None, tts_voice=None, tts_rate=0, tts_pitch=0):
    if input_type == "Audio":
        return inference_audio(audio, sentiment_option, api_key, tts_voice, tts_rate, tts_pitch)
    elif input_type == "Image":
        pred, explanation = classify_sign_language(image, api_key)
        explanation_audio, _ = asyncio.run(tts_interface(
            explanation, tts_voice, tts_rate, tts_pitch))
        return "N/A", pred, "N/A", explanation, explanation_audio


async def main():
    voices = await get_voices()

    with gr.Blocks() as demo:
        gr.Markdown("# Speak & Sign AI Assistant")

        # Layout: split user input and bot response sides
        with gr.Row():
            # User input side
            with gr.Column():
                gr.Markdown("### User Input")

                # Input selection
                input_type = gr.Radio(label="Choose Input Type", choices=[
                    "Audio", "Image"], value="Audio")

                # API key input
                api_key_input = gr.Textbox(
                    label="API Key", placeholder="Your API key here", type="password")

                # Audio input
                audio_input = gr.Audio(
                    label="Upload or Record Audio", type="filepath", visible=True)
                sentiment_option = gr.Radio(choices=[
                    "Sentiment Only", "Sentiment + Score"], label="Sentiment Output", value="Sentiment Only", visible=True)

                # Image input
                image_input = gr.Image(
                    label="Upload Image", type="pil", visible=False)

                # TTS settings for the generated explanation
                tts_voice = gr.Dropdown(
                    label="Select Voice", choices=list(voices.keys()), value="")
                tts_rate = gr.Slider(
                    minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
                tts_pitch = gr.Slider(
                    minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)

                # Change input visibility based on selection
                def update_visibility(input_type):
                    if input_type == "Audio":
                        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
                    else:
                        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)

                input_type.change(update_visibility, inputs=input_type, outputs=[
                    audio_input, sentiment_option, image_input])

                # Submit button
                submit_btn = gr.Button("Submit")

            # Bot response side
            with gr.Column():
                gr.Markdown("### Bot Response")
                lang_str = gr.Textbox(
                    label="Detected Language", interactive=False)
                text = gr.Textbox(
                    label="Transcription or Prediction", interactive=False)
                sentiment_output = gr.Textbox(
                    label="Sentiment Analysis Results", interactive=False)
                search_results = gr.Textbox(
                    label="Explanation or Search Results", interactive=False)
                audio_output = gr.Audio(
                    label="Generated Explanation Audio", type="filepath", interactive=False)

        # Submit button action
        submit_btn.click(
            process_input,
            inputs=[input_type, audio_input, image_input, sentiment_option,
                    api_key_input, tts_voice, tts_rate, tts_pitch],
            outputs=[lang_str, text, sentiment_output,
                     search_results, audio_output]
        )

    demo.launch(share=True)


asyncio.run(main())