import gradio as gr
import whisper
from transformers import pipeline
import requests
import cv2
import string
import numpy as np
import tensorflow as tf
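
# Runtime dependencies (inferred from the imports): gradio, openai-whisper,
# transformers (PyTorch backend), requests, opencv-python, numpy, tensorflow;
# whisper.load_audio also expects ffmpeg to be available on the PATH.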

# Load models
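# Whisper "base" handles transcription and language detection; the go_emotions
# RoBERTa checkpoint scores the transcript against a set of emotion labels.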
whisper_model = whisper.load_model("base")
sentiment_analysis = pipeline(
    "sentiment-analysis", framework="pt", model="SamLowe/roberta-base-go_emotions")


def load_sign_language_model():
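    # Expects best_model.h5 in the working directory: a Keras classifier trained
    # on 28x28 grayscale hand-sign images (see classify_sign_language below).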
    return tf.keras.models.load_model('best_model.h5')


sign_language_model = load_sign_language_model()

# Audio-based functions


def analyze_sentiment(text):
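    # The transformers pipeline returns a list of {'label', 'score'} dicts;
    # collapse it into a {label: score} mapping for display.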
    results = sentiment_analysis(text)
    sentiment_results = {result['label']: result['score']
                         for result in results}
    return sentiment_results


def display_sentiment_results(sentiment_results, option):
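    # Emit one line per detected emotion, with or without its score,
    # depending on the radio-button option.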
    sentiment_text = ""
    for sentiment, score in sentiment_results.items():
        if option == "Sentiment Only":
            sentiment_text += f"{sentiment}\n"
        elif option == "Sentiment + Score":
            sentiment_text += f"{sentiment}: {score}\n"
    return sentiment_text


def search_text(text, api_key):
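    # Send the transcript to Gemini's generateContent endpoint; the user-supplied
    # API key is passed as the "key" query parameter.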
    api_endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
    headers = {"Content-Type": "application/json"}
    payload = {"contents": [{"parts": [{"text": text}]}]}

    try:
        response = requests.post(
            api_endpoint, headers=headers, json=payload, params={"key": api_key})
        response.raise_for_status()
        response_json = response.json()
        if 'candidates' in response_json and len(response_json['candidates']) > 0:
            content_parts = response_json['candidates'][0]['content']['parts']
            if len(content_parts) > 0:
                return content_parts[0]['text'].strip()
        return "No relevant content found."
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}


def inference_audio(audio, sentiment_option, api_key):
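    # Full audio pipeline: Whisper language detection and transcription,
    # then sentiment analysis and a Gemini lookup on the transcript.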
    if audio is None:
        return "No audio file provided.", "", "", ""

    # Whisper front end: load the waveform, pad or trim it to the model's
    # 30-second window, and compute the log-Mel spectrogram on the model's device.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language from the spectrogram.
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # fp16=False keeps decoding working on CPU-only machines.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)

    sentiment_results = analyze_sentiment(result.text)
    sentiment_output = display_sentiment_results(
        sentiment_results, sentiment_option)

    search_results = search_text(result.text, api_key)

    return lang.upper(), result.text, sentiment_output, search_results

# Image-based functions


def get_explanation(letter, api_key):
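    # Ask Gemini for a short explanation of how the predicted ASL letter is signed.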
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [
            {"parts": [{"text": f"Explain how the American Sign Language letter '{letter}' is shown, its significance, and why it is represented this way."}]}
        ]
    }
    params = {"key": api_key}

    try:
        response = requests.post(url, headers=headers,
                                 json=data, params=params)
        response.raise_for_status()
        response_data = response.json()
        # The generated text is nested under candidates -> content -> parts.
        candidates = response_data.get("candidates") or [{}]
        parts = candidates[0].get("content", {}).get("parts") or [{}]
        return parts[0].get("text", "No explanation available.").strip()
    except requests.RequestException as e:
        return f"Error fetching explanation: {e}"


def classify_sign_language(image, api_key):
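    # Match the classifier's expected input: grayscale, 28x28, scaled to [0, 1].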
    img = np.array(image)
    gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)  # PIL images are RGB, not BGR
    gray_img = cv2.resize(gray_img, (28, 28))
    normalized_img = gray_img / 255.0
    input_img = np.expand_dims(normalized_img, axis=0)

    output = sign_language_model.predict(input_img)
    output = np.argmax(output, axis=1).item()
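    # Map the class index to a letter; indices above 7 are shifted by one,
    # presumably to skip a letter absent from the static-sign label set.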
    uppercase_alphabet = string.ascii_uppercase
    output = output + 1 if output > 7 else output
    pred = uppercase_alphabet[output]

    explanation = get_explanation(pred, api_key)

    return pred, explanation

# Gradio interface


def process_input(input_type, audio=None, image=None, sentiment_option=None, api_key=None):
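    # Route the request to the audio or image pipeline; image results are padded
    # with "N/A" so both paths return four values for the four output textboxes.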
    if input_type == "Audio":
        return inference_audio(audio, sentiment_option, api_key)
    elif input_type == "Image":
        pred, explanation = classify_sign_language(image, api_key)
        return "N/A", pred, "N/A", explanation


def main():
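    # Two-column Gradio Blocks layout: user inputs on the left, bot responses on the right.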
    with gr.Blocks() as demo:
        gr.Markdown("# 🤖 Speak & Sign AI Assistant")

        # Layout: Split user input and bot response sides
        with gr.Row():
            # User Input Side
            with gr.Column():
                gr.Markdown("### User Input")
                # Input selection
                input_type = gr.Radio(label="Choose Input Type", choices=[
                                      "Audio", "Image"], value="Audio")

                # API key input
                api_key_input = gr.Textbox(
                    label="API Key", placeholder="Your API key here", type="password")

                # Audio input
                audio_input = gr.Audio(
                    label="Upload or Record Audio", type="filepath", visible=True)
                sentiment_option = gr.Radio(choices=[
                                            "Sentiment Only", "Sentiment + Score"], label="Sentiment Output", value="Sentiment Only", visible=True)

                # Image input
                image_input = gr.Image(
                    label="Upload Image", type="pil", visible=False)

                # Change input visibility based on selection
                def update_visibility(input_type):
                    if input_type == "Audio":
                        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
                    else:
                        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)

                input_type.change(update_visibility, inputs=input_type, outputs=[
                                  audio_input, sentiment_option, image_input])

                # Submit button
                submit_btn = gr.Button("Submit")

            # Bot Response Side
            with gr.Column():
                gr.Markdown("### Bot Response")

                lang_str = gr.Textbox(
                    label="Detected Language", interactive=False)
                text = gr.Textbox(
                    label="Transcription or Prediction", interactive=False)
                sentiment_output = gr.Textbox(
                    label="Sentiment Analysis Results", interactive=False)
                search_results = gr.Textbox(
                    label="Explanation or API Search Results", interactive=False)

                submit_btn.click(process_input, inputs=[input_type, audio_input, image_input, sentiment_option, api_key_input], outputs=[
                                 lang_str, text, sentiment_output, search_results])

    demo.launch(share=True)


if __name__ == "__main__":
    main()