import streamlit as st
from transformers import pipeline

# Placeholder the user types to mark the word the model should predict.
PLACEHOLDER = '____'


@st.cache_resource
def _load_unmasker():
    """Build the fill-mask pipeline once and reuse it across Streamlit reruns.

    Streamlit re-executes this script on every widget interaction; caching
    the pipeline avoids reloading the model each time.
    """
    return pipeline('fill-mask', model='dsfsi/zabantu-bantu-250m')


# Initialize the pipeline for the fill-mask task
unmasker = _load_unmasker()

# Sample sentences for different languages with placeholders for the masked word
sample_sentences = {
    'Zulu': "Le ndoda ithi izo____ ukudla.",  # Masked word for Zulu
    'Tshivenda': "Mufana uyo____ vhukuma.",  # Masked word for Tshivenda
    'Sepedi': "Mosadi o ____ pheka.",  # Masked word for Sepedi
    'Tswana': "Monna o ____ tsamaya.",  # Masked word for Tswana
    'Tsonga': "N'wana wa xisati u ____ ku tsaka."  # Masked word for Tsonga
}


def fill_mask_for_languages(sentences):
    """Run the fill-mask pipeline on each sentence.

    Every '____' placeholder in a sentence is replaced with the tokenizer's
    mask token before the sentence is passed to the pipeline.

    Args:
        sentences: Mapping of language name to a sentence containing '____'.

    Returns:
        Dict mapping each language name to whatever the pipeline returned
        for that language's sentence.
    """
    mask_token = unmasker.tokenizer.mask_token
    return {
        language: unmasker(sentence.replace(PLACEHOLDER, mask_token))
        for language, sentence in sentences.items()
    }


def _top_sequences(predictions):
    """Return the first (top) predicted sequence for each mask in the input.

    With a single mask the pipeline returns a flat list of candidates; with
    several masks it returns one candidate list per mask, so both shapes are
    normalised to a list of sequences here.
    """
    if predictions and isinstance(predictions[0], list):
        return [candidates[0]['sequence'] for candidates in predictions]
    return [predictions[0]['sequence']]


# Streamlit interface
st.title("Fill Mask for Multiple Languages | Zabantu-Bantu-250m")
st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")

# Display the original sample sentences
st.write("### Sample sentences:")
for language, sentence in sample_sentences.items():
    st.write(f"**{language}**: {sentence}")

# User input for custom sentences
user_sentence = st.text_input("Enter your own sentence with a masked word (use '____'):", "Enter sentence here...")

# Add a submit button
if st.button("Submit"):
    # Prepare user input for prediction
    mask_token = unmasker.tokenizer.mask_token
    user_masked_sentence = user_sentence.replace(PLACEHOLDER, mask_token)

    # Display results for user input
    st.write("### Your Input:")
    st.write(f"Original sentence: {user_sentence}")
    if mask_token in user_masked_sentence:
        # Get predictions for the user input sentence
        user_predictions = unmasker(user_masked_sentence)
        for sequence in _top_sequences(user_predictions):
            st.write(f"Top prediction for the masked token: {sequence}")
    else:
        # Without a mask token the pipeline has nothing to predict and
        # raises, so tell the user what is missing instead of crashing.
        st.warning(f"No masked word found. Mark the word to predict with '{PLACEHOLDER}'.")

    # Display results for sample sentences
    st.write("### Predictions for Sample Sentences:")
    for language, predictions in fill_mask_for_languages(sample_sentences).items():
        original_sentence = sample_sentences[language]
        predicted_sentence = predictions[0]['sequence']
        st.write(f"Original sentence ({language}): {original_sentence}")
        st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
        st.write("=" * 80)

# Custom CSS styling for Streamlit elements
css = """
"""
st.markdown(css, unsafe_allow_html=True)