UnarineLeo
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,67 +1,52 @@
|
|
1 |
import streamlit as st
|
2 |
from transformers import pipeline
|
3 |
|
4 |
-
# Initialize the pipeline for the fill-mask task
|
5 |
unmasker = pipeline('fill-mask', model='dsfsi/zabantu-bantu-250m')
|
6 |
|
7 |
-
# Sample sentences for different languages with placeholders for the masked word
|
8 |
sample_sentences = {
|
9 |
-
'Zulu': "Le ndoda ithi izo____ ukudla.",
|
10 |
-
'Tshivenda': "Mufana uyo____ vhukuma.",
|
11 |
-
'Sepedi': "Mosadi o ____ pheka.",
|
12 |
-
'Tswana': "Monna o ____ tsamaya.",
|
13 |
-
'Tsonga': "N'wana wa xisati u ____ ku tsaka."
|
14 |
}
|
15 |
|
16 |
-
# Function to perform fill-mask on sentences with the token '__' replaced
|
17 |
def fill_mask_for_languages(sentences):
|
18 |
results = {}
|
19 |
for language, sentence in sentences.items():
|
20 |
-
# Replace the '____' placeholder with the model's mask token
|
21 |
masked_sentence = sentence.replace('____', unmasker.tokenizer.mask_token)
|
22 |
|
23 |
-
# Get predictions for the masked sentence
|
24 |
unmasked = unmasker(masked_sentence)
|
25 |
|
26 |
-
# Store the result for each language
|
27 |
results[language] = unmasked
|
28 |
return results
|
29 |
|
30 |
-
|
|
|
|
|
31 |
st.title("Fill Mask for Multiple Languages | Zabantu-Bantu-250m")
|
32 |
st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")
|
33 |
|
34 |
-
|
35 |
-
st.write("### Sample sentences:")
|
36 |
-
for language, sentence in sample_sentences.items():
|
37 |
-
st.write(f"**{language}**: {sentence}")
|
38 |
-
|
39 |
-
# User input for custom sentences
|
40 |
-
user_sentence = st.text_input("Enter your own sentence with a masked word (use '____'):", "Enter sentence here...")
|
41 |
|
42 |
-
# Add a submit button
|
43 |
if st.button("Submit"):
|
44 |
-
# Prepare user input for prediction
|
45 |
user_masked_sentence = user_sentence.replace('____', unmasker.tokenizer.mask_token)
|
46 |
|
47 |
-
# Get predictions for the user input sentence
|
48 |
user_predictions = unmasker(user_masked_sentence)
|
49 |
|
50 |
-
# Display results for user input
|
51 |
st.write("### Your Input:")
|
52 |
st.write(f"Original sentence: {user_sentence}")
|
53 |
st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")
|
54 |
|
55 |
-
# Display results for sample sentences
|
56 |
st.write("### Predictions for Sample Sentences:")
|
57 |
for language, predictions in fill_mask_for_languages(sample_sentences).items():
|
58 |
original_sentence = sample_sentences[language]
|
59 |
-
predicted_sentence = predictions[0]['sequence']
|
|
|
60 |
st.write(f"Original sentence ({language}): {original_sentence}")
|
61 |
st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
|
62 |
st.write("=" * 80)
|
63 |
|
64 |
-
# Custom CSS styling for Streamlit elements
|
65 |
css = """
|
66 |
<style>
|
67 |
footer {display:none !important}
|
@@ -109,4 +94,5 @@ div[data-testid="stMarkdownContainer"] p {
|
|
109 |
}
|
110 |
</style>
|
111 |
"""
|
112 |
-
|
|
|
|
1 |
import streamlit as st
from transformers import pipeline

# Load the fill-mask pipeline once at module import (runs on every Streamlit
# rerun of the script; downloads the model from the Hub on first use).
unmasker = pipeline('fill-mask', model='dsfsi/zabantu-bantu-250m')
|
5 |
|
|
|
6 |
# One demo sentence per supported language; '____' marks the word the model
# should predict (it is swapped for the tokenizer's mask token before inference).
sample_sentences = {
    'Zulu': "Le ndoda ithi izo____ ukudla.",
    'Tshivenda': "Mufana uyo____ vhukuma.",
    'Sepedi': "Mosadi o ____ pheka.",
    'Tswana': "Monna o ____ tsamaya.",
    'Tsonga': "N'wana wa xisati u ____ ku tsaka."
}
|
13 |
|
|
|
14 |
def fill_mask_for_languages(sentences):
    """Run the fill-mask model over each language's sentence.

    Args:
        sentences: mapping of language name -> sentence containing '____'.

    Returns:
        Mapping of language name -> raw pipeline predictions for that
        sentence (the '____' placeholder is swapped for the model's mask
        token before inference).
    """
    mask = unmasker.tokenizer.mask_token
    return {
        language: unmasker(text.replace('____', mask))
        for language, text in sentences.items()
    }
|
23 |
|
24 |
+
def replace_mask(sentence, predicted_word):
    """Return *sentence* with every '____' placeholder replaced by *predicted_word*."""
    placeholder = "____"
    return sentence.replace(placeholder, predicted_word)
|
26 |
+
|
27 |
# Page header and user input widget.
st.title("Fill Mask for Multiple Languages | Zabantu-Bantu-250m")
st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")

# NOTE(review): the default value joins the dict KEYS of sample_sentences
# (i.e. the language names, newline-separated), not an example sentence, and
# contains no '____' placeholder — confirm this default is intended.
user_sentence = st.text_input("Enter your own sentence with a masked word (use '____'):", "\n".join(sample_sentences))
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
|
|
32 |
if st.button("Submit"):
    # The fill-mask pipeline raises if the input contains no mask token
    # (and the shipped default text has none), so validate before predicting.
    if '____' not in user_sentence:
        st.error("Please include '____' in your sentence to mark the masked word.")
    else:
        user_masked_sentence = user_sentence.replace('____', unmasker.tokenizer.mask_token)
        user_predictions = unmasker(user_masked_sentence)

        st.write("### Your Input:")
        st.write(f"Original sentence: {user_sentence}")
        # 'sequence' is the complete sentence with the mask already filled in.
        st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")

    st.write("### Predictions for Sample Sentences:")
    for language, predictions in fill_mask_for_languages(sample_sentences).items():
        original_sentence = sample_sentences[language]
        # BUG FIX: the original called replace_mask(sentence, ...) where
        # `sentence` is undefined at this scope (NameError). It also substituted
        # predictions[0]['sequence'] — the ENTIRE filled sentence — into the
        # '____' placeholder; the predicted word itself is 'token_str'
        # (stripped, since some tokenizers prepend a space marker).
        predicted_word = predictions[0]['token_str'].strip()
        predicted_sentence = replace_mask(original_sentence, predicted_word)

        st.write(f"Original sentence ({language}): {original_sentence}")
        st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
        st.write("=" * 80)
|
49 |
|
|
|
50 |
css = """
|
51 |
<style>
|
52 |
footer {display:none !important}
|
|
|
94 |
}
|
95 |
</style>
|
96 |
"""
|
97 |
+
|
98 |
+
# Inject the custom CSS defined above (it hides the default Streamlit footer).
st.markdown(css, unsafe_allow_html=True)
|