Spaces:

UnarineLeo
/

fill-mask-zabantu-xlm-roberta

Sleeping

App Files Files Community

fill-mask-zabantu-xlm-roberta / app.py

UnarineLeo

Update app.py

d104ff1 verified 5 months ago

raw

history blame

3.65 kB

	import streamlit as st
	from transformers import pipeline

	# Initialize the pipeline for the fill-mask task.
	# Streamlit re-executes this script on every widget interaction, so the
	# loader is cached to avoid rebuilding the pipeline on each rerun.
	@st.cache_resource
	def _load_unmasker():
	    """Build the fill-mask pipeline for the 'dsfsi/zabantu-bantu-250m' model."""
	    return pipeline('fill-mask', model='dsfsi/zabantu-bantu-250m')


	unmasker = _load_unmasker()

	# One demo sentence per language; '____' marks the position the model
	# is asked to fill in.
	sample_sentences = {
	    "Zulu": "Le ndoda ithi izo____ ukudla.",
	    "Tshivenda": "Mufana uyo____ vhukuma.",
	    "Sepedi": "Mosadi o ____ pheka.",
	    "Tswana": "Monna o ____ tsamaya.",
	    "Tsonga": "N'wana wa xisati u ____ ku tsaka.",
	}

	# Run fill-mask over a set of sentences whose gap is written as '____'
	def fill_mask_for_languages(sentences):
	    """Return the fill-mask output for every sentence, keyed by language.

	    Each '____' placeholder is swapped for the tokenizer's mask token
	    before the sentence is passed to the module-level ``unmasker``.

	    Args:
	        sentences: Mapping of language name to a sentence containing the
	            '____' placeholder.

	    Returns:
	        A dict mapping each language name to whatever ``unmasker`` returned
	        for that language's masked sentence.
	    """
	    return {
	        lang: unmasker(text.replace('____', unmasker.tokenizer.mask_token))
	        for lang, text in sentences.items()
	    }

	# Streamlit interface
	# The title uses a plain "|": the previous "\|" is not a valid Python escape
	# sequence and triggers an invalid-escape warning on recent interpreters.
	st.title("Fill Mask for Multiple Languages | Zabantu-Bantu-250m")
	st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")

	# Display the original sample sentences
	st.write("### Sample sentences:")
	for language, sentence in sample_sentences.items():
	    st.write(f"{language}: {sentence}")

	# User input for custom sentences.
	# The hint is passed as a placeholder rather than as the initial value, so
	# the field starts empty and the hint text is never submitted as a sentence.
	user_sentence = st.text_input(
	    "Enter your own sentence with a masked word (use '____'):",
	    placeholder="Enter sentence here...",
	)

	# Add a submit button
	# NOTE(review): indentation was lost in this copy of the file; the nesting
	# below (sample predictions shown on submit, separator printed after each
	# language) is a reconstruction — confirm against the deployed app.
	if st.button("Submit"):
	    # Prepare user input for prediction
	    mask_token = unmasker.tokenizer.mask_token
	    user_masked_sentence = user_sentence.replace('____', mask_token)

	    # Display results for user input
	    st.write("### Your Input:")
	    st.write(f"Original sentence: {user_sentence}")

	    # The fill-mask pipeline raises when the input has no mask token and
	    # returns one result list per mask when there are several, so the
	    # top-prediction lookup below is only valid for exactly one mask.
	    if user_masked_sentence.count(mask_token) == 1:
	        user_predictions = unmasker(user_masked_sentence)
	        st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")
	    else:
	        st.warning("Please enter a sentence containing exactly one '____' placeholder.")

	    # Display results for sample sentences
	    st.write("### Predictions for Sample Sentences:")
	    for language, predictions in fill_mask_for_languages(sample_sentences).items():
	        original_sentence = sample_sentences[language]
	        predicted_sentence = predictions[0]['sequence']
	        st.write(f"Original sentence ({language}): {original_sentence}")
	        st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
	        st.write("=" * 80)

	# Custom CSS styling for Streamlit elements.
	# The stylesheet below hides the page footer and restyles the buttons
	# (.stButton), text inputs (.stTextInput / .stTextArea), markdown
	# paragraphs and the top-level .stApp container.
	css = """
	<style>
	footer {display:none !important}

	.stButton > button {
	background-color: #17152e;
	color: white;
	border: none;
	padding: 0.75em 2em;
	text-align: center;
	text-decoration: none;
	display: inline-block;
	font-size: 16px;
	margin: 4px 2px;
	cursor: pointer;
	border-radius: 12px;
	transition: background-color 0.3s ease;
	}

	.stButton > button:hover {
	background-color: #3c4a6b;
	}

	.stTextInput, .stTextArea {
	border: 1px solid #e6e6e6;
	padding: 0.75em;
	border-radius: 10px;
	font-size: 16px;
	width: 100%;
	}

	.stTextInput:focus, .stTextArea:focus {
	border-color: #17152e;
	outline: none;
	box-shadow: 0px 0px 5px rgba(23, 21, 46, 0.5);
	}

	div[data-testid="stMarkdownContainer"] p {
	font-size: 16px;
	}

	.stApp {
	padding: 2em;
	font-family: 'Poppins', sans-serif;
	}
	</style>
	"""
	# Inject the stylesheet into the page; unsafe_allow_html=True is passed so
	# the <style> element is emitted as HTML rather than shown as text.
	st.markdown(css, unsafe_allow_html=True)