abdullahmubeen10 committed · verified
Commit e6f3c3d · 1 Parent(s): 3c8faec

Upload 15 files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [theme]
+ base="light"
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,133 @@
+ import streamlit as st
+ import sparknlp
+ import os
+ import librosa
+
+ from sparknlp.base import *
+ from sparknlp.common import *
+ from sparknlp.annotator import *
+ from pyspark.ml import Pipeline
+
+ # Page configuration
+ st.set_page_config(
+     layout="wide",
+     initial_sidebar_state="auto"
+ )
+
+ # Custom CSS for styling
+ st.markdown("""
+     <style>
+         .main-title {
+             font-size: 36px;
+             color: #4A90E2;
+             font-weight: bold;
+             text-align: center;
+         }
+         .section {
+             background-color: #f9f9f9;
+             padding: 10px;
+             border-radius: 10px;
+             margin-top: 10px;
+         }
+         .section p, .section ul {
+             color: #666666;
+         }
+     </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource
+ def init_spark():
+     """Initialize Spark NLP and return the Spark session."""
+     return sparknlp.start()
+
+ @st.cache_resource
+ def create_pipeline(model):
+     """Create a Spark NLP pipeline for audio processing."""
+     audio_assembler = AudioAssembler() \
+         .setInputCol("audio_content") \
+         .setOutputCol("audio_assembler")
+
+     speech_to_text = WhisperForCTC \
+         .pretrained(model) \
+         .setInputCols("audio_assembler") \
+         .setOutputCol("text")
+
+     pipeline = Pipeline(stages=[
+         audio_assembler,
+         speech_to_text
+     ])
+     return pipeline
+
+ def fit_data(pipeline, fed_data):
+     """Run the pipeline on an audio file and return the annotation result."""
+     # Whisper expects 16 kHz input; librosa resamples on load.
+     data, sampling_rate = librosa.load(fed_data, sr=16000)
+     data = data.tolist()
+     # Uses the global Spark session created by init_spark() below.
+     spark_df = spark.createDataFrame([[data]], ["audio_content"])
+
+     model = pipeline.fit(spark_df)
+     lp = LightPipeline(model)
+     # LightPipeline annotates the raw float samples directly.
+     lp_result = lp.fullAnnotate(data)[0]
+     return lp_result
+
+ def save_uploadedfile(uploadedfile, path):
+     """Save the uploaded file to the specified path."""
+     filepath = os.path.join(path, uploadedfile.name)
+     with open(filepath, "wb") as f:
+         if hasattr(uploadedfile, 'getbuffer'):
+             f.write(uploadedfile.getbuffer())
+         else:
+             f.write(uploadedfile.read())
+
+ # Sidebar content
+ model_list = ["asr_whisper_small_english"]
+ model = st.sidebar.selectbox(
+     "Choose the pretrained model",
+     model_list,
+     help="For more info about the models visit: https://sparknlp.org/models"
+ )
+
+ # Main content
+ st.markdown('<div class="main-title">Speech Recognition With WhisperForCTC</div>', unsafe_allow_html=True)
+ st.markdown('<div class="section"><p>This demo transcribes audio files into text using the <code>WhisperForCTC</code> annotator and advanced speech recognition models.</p></div>', unsafe_allow_html=True)
+
+ # Reference notebook link in sidebar
+ st.sidebar.markdown('Reference notebook:')
+ st.sidebar.markdown("""
+     <a href="https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/audio/whisper/Automatic_Speech_Recognition_Whisper_(WhisperForCTC).ipynb">
+         <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
+     </a>
+ """, unsafe_allow_html=True)
+
+ # Load bundled example audio files
+ AUDIO_FILE_PATH = "inputs"
+ audio_files = sorted(os.listdir(AUDIO_FILE_PATH))
+
+ selected_audio = st.selectbox("Select an audio", audio_files)
+
+ # Audio file types accepted by the uploader
+ audio_file_types = ["mp3", "flac", "wav", "aac", "ogg", "aiff", "wma", "m4a", "ape", "dsf", "dff", "midi", "mid", "opus", "amr"]
+ uploadedfile = st.file_uploader("Try it for yourself!", type=audio_file_types)
+
+ if uploadedfile:
+     selected_audio = f"{AUDIO_FILE_PATH}/{uploadedfile.name}"
+     save_uploadedfile(uploadedfile, AUDIO_FILE_PATH)
+ elif selected_audio:
+     selected_audio = f"{AUDIO_FILE_PATH}/{selected_audio}"
+
+ # Audio playback and transcription
+ st.subheader("Play Audio")
+
+ with open(selected_audio, 'rb') as audio_file:
+     audio_bytes = audio_file.read()
+     st.audio(audio_bytes)
+
+ spark = init_spark()
+ pipeline = create_pipeline(model)
+ output = fit_data(pipeline, selected_audio)
+
+ st.subheader("Transcription:")
+ st.markdown(output['text'][0].result.title())
Dockerfile ADDED
@@ -0,0 +1,70 @@
+ # Download base image ubuntu 18.04
+ FROM ubuntu:18.04
+
+ # Set environment variables
+ ENV NB_USER jovyan
+ ENV NB_UID 1000
+ ENV HOME /home/${NB_USER}
+
+ # Install required packages
+ RUN apt-get update && apt-get install -y \
+     tar \
+     wget \
+     bash \
+     rsync \
+     gcc \
+     libfreetype6-dev \
+     libhdf5-serial-dev \
+     libpng-dev \
+     libzmq3-dev \
+     python3 \
+     python3-dev \
+     python3-pip \
+     unzip \
+     pkg-config \
+     software-properties-common \
+     graphviz \
+     openjdk-8-jdk \
+     ant \
+     ca-certificates-java \
+     && apt-get clean \
+     && update-ca-certificates -f;
+
+ # Install Python 3.8 and pip
+ RUN add-apt-repository ppa:deadsnakes/ppa \
+     && apt-get update \
+     && apt-get install -y python3.8 python3-pip \
+     && apt-get clean;
+
+ # Set up JAVA_HOME
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+ RUN mkdir -p ${HOME} \
+     && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
+     && chown -R ${NB_UID}:${NB_UID} ${HOME}
+
+ # Create a new user named "jovyan" with user ID 1000
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
+
+ # Switch to the "jovyan" user
+ USER ${NB_USER}
+
+ # Set home and path variables for the user
+ ENV HOME=/home/${NB_USER} \
+     PATH=/home/${NB_USER}/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR ${HOME}
+
+ # Upgrade pip and install Python dependencies
+ RUN python3.8 -m pip install --upgrade pip
+ COPY requirements.txt /tmp/requirements.txt
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
+
+ # Copy the application code into the container at /home/jovyan
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
+
+ # Expose port for Streamlit
+ EXPOSE 7860
+
+ # Define the entry point for the container
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
inputs/audio-1.flac ADDED
Binary file (112 kB)
inputs/audio-10.flac ADDED
Binary file (76 kB)
inputs/audio-2.flac ADDED
Binary file (49 kB)
inputs/audio-3.flac ADDED
Binary file (74 kB)
inputs/audio-4.flac ADDED
Binary file (113 kB)
inputs/audio-5.flac ADDED
Binary file (138 kB)
inputs/audio-6.flac ADDED
Binary file (36.5 kB)
inputs/audio-7.flac ADDED
Binary file (177 kB)
inputs/audio-8.flac ADDED
Binary file (94.3 kB)
inputs/audio-9.flac ADDED
Binary file (129 kB)
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,169 @@
+ import streamlit as st
+
+ # Custom CSS for better styling
+ st.markdown("""
+     <style>
+         .main-title {
+             font-size: 36px;
+             color: #4A90E2;
+             font-weight: bold;
+             text-align: center;
+         }
+         .sub-title {
+             font-size: 24px;
+             color: #4A90E2;
+             margin-top: 20px;
+         }
+         .section {
+             background-color: #f9f9f9;
+             padding: 15px;
+             border-radius: 10px;
+             margin-top: 20px;
+         }
+         .section p, .section ul {
+             color: #666666;
+         }
+         .link {
+             color: #4A90E2;
+             text-decoration: none;
+         }
+         .benchmark-table {
+             width: 100%;
+             border-collapse: collapse;
+             margin-top: 20px;
+         }
+         .benchmark-table th, .benchmark-table td {
+             border: 1px solid #ddd;
+             padding: 8px;
+             text-align: left;
+         }
+         .benchmark-table th {
+             background-color: #4A90E2;
+             color: white;
+         }
+         .benchmark-table td {
+             background-color: #f2f2f2;
+         }
+     </style>
+ """, unsafe_allow_html=True)
+
+ # Main Title
+ st.markdown('<div class="main-title">Whisper: Advanced Speech Recognition</div>', unsafe_allow_html=True)
+
+ # Overview Section
+ st.markdown("""
+ <div class="section">
+     <p>The <strong>Whisper</strong> model, developed by OpenAI, was introduced in the paper <em>Robust Speech Recognition via Large-Scale Weak Supervision</em>. Whisper is a cutting-edge speech recognition model trained on an extensive dataset of 680,000 hours of multilingual and multitask supervised audio data, enabling it to handle a wide range of tasks.</p>
+     <p>Whisper's robust architecture allows it to perform well across different speech processing tasks without the need for fine-tuning. Its zero-shot transfer capabilities enable it to generalize effectively, making it a versatile tool for developers and researchers alike.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Use Cases Section
+ st.markdown('<div class="sub-title">Use Cases</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><strong>Transcription Services:</strong> Automate transcription of audio files in English for media, legal, and academic purposes.</li>
+         <li><strong>Voice-Activated Assistants:</strong> Enhance voice command recognition in smart devices and applications.</li>
+         <li><strong>Broadcast Media:</strong> Provide real-time transcription and subtitling for live broadcasts.</li>
+         <li><strong>Multilingual Translation:</strong> Use as a base for developing multilingual speech-to-text and translation services.</li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # How to Use Section
+ st.markdown('<div class="sub-title">How to Use Whisper</div>', unsafe_allow_html=True)
+ st.code('''
+ audioAssembler = AudioAssembler() \\
+     .setInputCol("audio_content") \\
+     .setOutputCol("audio_assembler")
+
+ speechToText = WhisperForCTC \\
+     .pretrained("asr_whisper_small_english") \\
+     .setInputCols("audio_assembler") \\
+     .setOutputCol("text")
+
+ pipeline = Pipeline().setStages([audioAssembler, speechToText])
+
+ pipelineModel = pipeline.fit(data)
+
+ pipelineDF = pipelineModel.transform(data)
+ ''', language='python')
+
+ st.markdown("""
+ <div class="section">
+     <p>This example demonstrates how to use Whisper in a Spark NLP pipeline to convert raw audio content into text. The model processes input audio sampled at 16 kHz and outputs the corresponding text transcription, making it ideal for tasks like transcription, voice command recognition, and more.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Model Information Section
+ st.markdown('<div class="sub-title">Model Information</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <table class="benchmark-table">
+         <tr>
+             <th>Attribute</th>
+             <th>Description</th>
+         </tr>
+         <tr>
+             <td><strong>Model Name</strong></td>
+             <td>asr_whisper_small_english</td>
+         </tr>
+         <tr>
+             <td><strong>Compatibility</strong></td>
+             <td>Spark NLP 5.1.4+, PySpark 3.4+</td>
+         </tr>
+         <tr>
+             <td><strong>License</strong></td>
+             <td>Open Source</td>
+         </tr>
+         <tr>
+             <td><strong>Edition</strong></td>
+             <td>Official</td>
+         </tr>
+         <tr>
+             <td><strong>Input Labels</strong></td>
+             <td>[audio_assembler]</td>
+         </tr>
+         <tr>
+             <td><strong>Output Labels</strong></td>
+             <td>[text]</td>
+         </tr>
+         <tr>
+             <td><strong>Language</strong></td>
+             <td>en</td>
+         </tr>
+         <tr>
+             <td><strong>Model Size</strong></td>
+             <td>1.1 GB</td>
+         </tr>
+     </table>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # References Section
+ st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><a class="link" href="https://sparknlp.org/2023/10/17/asr_whisper_small_english_en.html" target="_blank">Whisper Model on Spark NLP</a></li>
+         <li><a class="link" href="https://huggingface.co/openai/whisper-small.en" target="_blank">Whisper Model on Hugging Face</a></li>
+         <li><a class="link" href="https://arxiv.org/abs/2212.04356" target="_blank">Whisper Paper</a></li>
+         <li><a class="link" href="https://github.com/openai/whisper" target="_blank">Whisper GitHub Repository</a></li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Community & Support
+ st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
+         <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
+         <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
+         <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
+         <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ spark-nlp
+ pyspark
+ librosa
+ pandas