Update app.py
app.py
CHANGED
@@ -3,72 +3,111 @@ from transformers import pipeline
 
 unmasker = pipeline('fill-mask', model='dsfsi/zabantu-xlm-roberta')
 
-    'zulu': "Le ndoda ithi izo <mask> ukudla.",
-    'tshivenda': "Mufana uyo <mask> vhukuma.",
-    'sepedi': "Mosadi o <mask> pheka.",
-    'tswana': "Monna o <mask> tsamaya.",
-    'tsonga': "N'wana wa xisati u <mask> ku tsaka."
-}
+st.set_page_config(layout="wide")
 
-def
+def fill_mask(sentences):
     results = {}
+    warnings = []
     for language, sentence in sentences.items():
+        if "<mask>" in sentence:
+            masked_sentence = sentence.replace('<mask>', unmasker.tokenizer.mask_token)
+            unmasked = unmasker(masked_sentence)
+            results[language] = unmasked
+        else:
+            warnings.append(f"Warning: No <mask> token found in sentence: {sentence}")
+    return results, warnings
 
 def replace_mask(sentence, predicted_word):
     return sentence.replace("<mask>", f"**{predicted_word}**")
 
-st.title("Fill Mask| Zabantu-XLM-Roberta")
+st.title("Fill Mask | Zabantu-XLM-Roberta")
 st.write(f"")
 
+st.markdown("Zabantu-XLMR refers to a fleet of models trained on different combinations of South African Bantu languages. These include: Zabantu-VEN, Zabantu-NSO, Zabantu-NSO+VEN, Zabantu-SOT+VEN, Zabantu-BANTU (from 9 South African Bantu languages)")
+
 col1, col2 = st.columns(2)
 
+if 'text_input' not in st.session_state:
+    st.session_state['text_input'] = ""
+
+if 'warnings' not in st.session_state:
+    st.session_state['warnings'] = []
+
+with col1:
+    with st.container(border=True):
+        st.markdown("Input :clipboard:")
+        sample_sentence = {
+            'zulu': "Le ndoda ithi izo <mask> ukudla.",
+            'tshivenda': "Mufana uyo <mask> vhukuma.",
+            'sepedi': "Mosadi o <mask> pheka.",
+            'tswana': "Monna o <mask> tsamaya.",
+            'tsonga': "N'wana wa xisati u <mask> ku tsaka."
+        }
+
+        text_input = st.text_area(
+            "Enter sentences with <mask> token:",
+            value=st.session_state['text_input']
+        )
+
+        input_sentences = text_input.split("\n")
+
+        button1, button2, _ = st.columns([2, 2, 4])
+        with button1:
+            if st.button("Test Example"):
+                user_sentence = "\n".join(f"'{lang}': '{sentence}'," for lang, sentence in sample_sentence.items())
+                user_masked_sentence = user_sentence.replace('<mask>', unmasker.tokenizer.mask_token)
+                # st.rerun()
+                # result, warnings = fill_mask(sample_sentence.split("\n"))
+                # st.session_state['text_input'] = sample_sentence
+
+        with button2:
+            if st.button("Submit"):
+                user_masked_sentence = text_input.replace('<mask>', unmasker.tokenizer.mask_token)
+                # result, warnings = fill_mask(input_sentences)
+                # st.session_state['warnings'] = warnings
+
+        if st.session_state['warnings']:
+            for warning in st.session_state['warnings']:
+                st.warning(warning)
 
+        st.markdown("Example")
+        st.code(sample_sentence, wrap_lines=True)
 
 with col2:
-        <div class="
-        </div>
-        """, unsafe_allow_html=True)
+    with st.container(border=True):
+        st.markdown("Output :bar_chart:")
+        if 'user_masked_sentence' in locals():
+            if user_masked_sentence:
+                user_predictions = unmasker(user_masked_sentence)
+
+                # st.write(user_predictions)
+
+                if len(user_predictions) > 0:
+                    # st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")
+
+                    predictions, _ = fill_mask(sample_sentence)
+                    for language, language_predictions in predictions.items():
+                        predicted_word = language_predictions[0]['token_str']
+                        score = language_predictions[0]['score'] * 100
+
+                        st.markdown(f"""
+                        <div class="bar">
+                            <div class="bar-fill" style="width: {score}%;"></div>
+                        </div>
+                        <div class="container">
+                            <div style="align-items: left;">{predicted_word} ({language})</div>
+                            <div style="align-items: right;">{score:.2f}%</div>
+                        </div>
+                        """, unsafe_allow_html=True)
 
     if 'predictions' in locals():
+        if predictions:
+            for language, language_predictions in predictions.items():
+                original_sentence = sample_sentence[language]
+                predicted_sentence = replace_mask(original_sentence, language_predictions[0]['token_str'])
+                # st.write(language_predictions)
+                # st.write(f"Original sentence ({language}): {original_sentence}")
+                st.write(f"{language}: {predicted_sentence}\n")
 
 css = """
 <style>
@@ -135,6 +174,7 @@ footer {display:none !important;}
     --tw-text-opacity: 1 !important;
    color:rgb(37 56 133 / var(--tw-text-opacity)) !important;
 }
+
 .container {
     display: flex;
     justify-content: space-between;
@@ -143,7 +183,7 @@ footer {display:none !important;}
     width: 100%;
 }
 .bar {
-    width: 70%;
+    /* width: 70%; */
     background-color: #e6e6e6;
     border-radius: 12px;
     overflow: hidden;
@@ -155,6 +195,8 @@ footer {display:none !important;}
     height: 100%;
     border-radius: 12px;
 }
+
 </style>
 """
+
+st.markdown(css, unsafe_allow_html=True)
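For reference, the bar rendering in the diff relies on the standard output shape of a Hugging Face fill-mask pipeline: for one sentence with one mask token it returns a list of candidate dicts (keys 'score', 'token', 'token_str', 'sequence'), highest score first, which is what the code indexes with [0]['token_str'] and [0]['score']. Below is a minimal standalone sketch of that flow; the model id and the Zulu sample sentence are taken from the diff, while running it as a plain script outside Streamlit is only for illustration.

# Minimal sketch of the fill-mask flow used in app.py; not part of the commit.
from transformers import pipeline

unmasker = pipeline('fill-mask', model='dsfsi/zabantu-xlm-roberta')

sentence = "Le ndoda ithi izo <mask> ukudla."
# The model's literal mask token may differ from "<mask>", so substitute the
# tokenizer's own token, exactly as fill_mask() does in the app.
masked = sentence.replace('<mask>', unmasker.tokenizer.mask_token)

predictions = unmasker(masked)          # list of dicts, ordered by score
top = predictions[0]
print(top['token_str'], f"{top['score'] * 100:.2f}%")             # predicted word and confidence
print(sentence.replace("<mask>", f"**{top['token_str']}**"))      # same idea as replace_mask()

With several mask tokens in one input, or a list of inputs, the pipeline returns a list of lists instead, which is why fill_mask() feeds the sentences to the model one language at a time.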