diff --git a/.streamlit/config.toml b/.streamlit/config.toml
new file mode 100644
index 0000000000000000000000000000000000000000..1351925fe878f35a9e31ac01757b8a4853757090
--- /dev/null
+++ b/.streamlit/config.toml
@@ -0,0 +1,3 @@
+[theme]
+base="light"
+primaryColor="#29B4E8"
diff --git a/Demo.py b/Demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..435066b33f959d878e82ed8a33232ae37404eebc
--- /dev/null
+++ b/Demo.py
@@ -0,0 +1,190 @@
+import streamlit as st
+import sparknlp
+import os
+import pandas as pd
+
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+from sparknlp.pretrained import PretrainedPipeline
+from annotated_text import annotated_text
+
+# Page configuration: wide layout, sidebar state left to Streamlit ("auto").
+# NOTE(review): Streamlit documents set_page_config as having to run before
+# any other Streamlit command, so keep this call ahead of the rest of the UI.
+st.set_page_config(
+ layout="wide",
+ initial_sidebar_state="auto"
+)
+
+# CSS for styling
+# NOTE(review): the markdown body below holds only blank lines, so no style
+# rules are injected as written - confirm whether a <style> block is missing.
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+@st.cache_resource
+def init_spark():
+    """Start Spark NLP and hand back whatever ``sparknlp.start()`` returns.
+
+    The function is wrapped in ``st.cache_resource`` so the start-up call is
+    not repeated on every script rerun.
+    """
+    session = sparknlp.start()
+    return session
+
+@st.cache_resource
+def create_pipeline(model):
+    """Assemble the (unfitted) NER pipeline for the chosen pretrained model.
+
+    Args:
+        model: Name picked in the sidebar. The XLM-RoBERTa classifier is used
+            when it equals ``'xlm_roberta_large_token_classifier_masakhaner'``;
+            any other value selects the DistilBERT classifier.
+
+    Returns:
+        A ``pyspark.ml.Pipeline`` whose stages go text -> document -> sentence
+        -> token -> ner -> ner_chunk.
+    """
+    assembler_stage = (
+        DocumentAssembler()
+        .setInputCol("text")
+        .setOutputCol("document")
+    )
+
+    sentence_stage = (
+        SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
+        .setInputCols(["document"])
+        .setOutputCol("sentence")
+    )
+
+    token_stage = (
+        Tokenizer()
+        .setInputCols(["sentence"])
+        .setOutputCol("token")
+    )
+
+    chunk_stage = (
+        NerConverter()
+        .setInputCols(["sentence", "token", "ner"])
+        .setOutputCol("ner_chunk")
+    )
+
+    # Pick the annotator class and its pretrained checkpoint together so the
+    # shared column wiring below is written only once.
+    if model == 'xlm_roberta_large_token_classifier_masakhaner':
+        classifier_cls = XlmRoBertaForTokenClassification
+        checkpoint = "xlm_roberta_large_token_classifier_masakhaner"
+    else:
+        classifier_cls = DistilBertForTokenClassification
+        checkpoint = "distilbert_base_token_classifier_masakhaner"
+
+    classifier_stage = (
+        classifier_cls.pretrained(checkpoint, "xx")
+        .setInputCols(["sentence", 'token'])
+        .setOutputCol("ner")
+    )
+
+    return Pipeline(stages=[
+        assembler_stage,
+        sentence_stage,
+        token_stage,
+        classifier_stage,
+        chunk_stage,
+    ])
+
+def fit_data(pipeline, data):
+    """Fit ``pipeline`` and annotate ``data`` through a ``LightPipeline``.
+
+    Relies on the module-level ``spark`` session, which therefore has to be
+    assigned before this function is called.
+
+    Args:
+        pipeline: Pipeline to fit (see ``create_pipeline``).
+        data: Text handed to ``LightPipeline.fullAnnotate``.
+
+    Returns:
+        Whatever ``fullAnnotate`` returns for ``data``.
+    """
+    # The fit is done against a single-row frame holding an empty string in
+    # the "text" column.
+    placeholder_df = spark.createDataFrame([['']]).toDF('text')
+    light_model = LightPipeline(pipeline.fit(placeholder_df))
+    return light_model.fullAnnotate(data)
+
+def annotate(data):
+    """Render the document with its recognised entities highlighted.
+
+    Args:
+        data: Mapping with the keys ``"Document"`` (the analysed text),
+            ``"NER Chunk"`` (entity strings, in reading order) and
+            ``"NER Label"`` (one label per chunk).
+
+    The text is cut into plain-string segments and ``(chunk, label)`` tuples
+    and passed to ``annotated_text``; nothing is returned. Each chunk is
+    searched for only in the text that follows the previous match. Chunks
+    that are empty or cannot be found there are skipped rather than aborting
+    the whole rendering.
+    """
+    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
+    annotated_words = []
+    for chunk, label in zip(chunks, labels):
+        # str.partition raises ValueError on an empty separator.
+        if not chunk:
+            continue
+        before, found, after = document.partition(chunk)
+        # An empty ``found`` means the chunk is not in the remaining text;
+        # leave ``document`` untouched so later chunks can still match.
+        if not found:
+            continue
+        if before:
+            annotated_words.append(before)
+        annotated_words.append((chunk, label))
+        document = after
+    if document:
+        annotated_words.append(document)
+    annotated_text(*annotated_words)
+
+# Set up the page layout
+st.markdown('
Recognize entities in 10 African languages
', unsafe_allow_html=True)
+st.markdown("""
+
+
This model carries out Named Entity Recognition on 10 African languages (Amharic, Hausa, Igbo, Kinyarwanda, Luganda, Nigerian, Pidgin, Swahili, Wolof, and Yorùbá).
+
+""", unsafe_allow_html=True)
+
+# Sidebar content
+model = st.sidebar.selectbox(
+    "Choose the pretrained model",
+    ["xlm_roberta_large_token_classifier_masakhaner", "distilbert_base_token_classifier_masakhaner"],
+    help="For more info about the models visit: https://sparknlp.org/models"
+)
+
+# The selected value is used verbatim as the directory name under inputs/,
+# so the option strings must keep matching the folders on disk (including the
+# "Swahilu" spelling). Only the text shown to the user is corrected.
+# NOTE(review): the sentences under inputs/Nigerian look like Luo rather than
+# a language called "Nigerian" - confirm and relabel if so.
+language = st.sidebar.selectbox(
+    "Choose the language",
+    ["Amharic", "Hausa", "Igbo", "Kinyarwanda", "Luganda", "Nigerian", "Pidgin", "Swahilu", "Wolof", "Yorùbá"],
+    format_func=lambda name: "Swahili" if name == "Swahilu" else name,
+    help="Language of the example sentences to choose from"
+)
+
+# Reserve the sidebar position of the entity-label controls now. They can
+# only be filled in once the pipeline has produced `results` further down,
+# but they should still appear above the reference link.
+labels_panel = st.sidebar.container()
+
+# Reference notebook link in sidebar
+link = """
+
+
+
+"""
+st.sidebar.markdown('Reference notebook:')
+st.sidebar.markdown(link, unsafe_allow_html=True)
+
+# Load examples: the second line of every .txt file in the language folder
+# holds the full sentence. Files are visited in sorted order so the list is
+# stable across platforms, and each file is closed right after reading.
+folder_path = f"inputs/{language}"
+examples = []
+for filename in sorted(os.listdir(folder_path)):
+    if not filename.endswith('.txt'):
+        continue
+    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as example_file:
+        lines = example_file.readlines()
+    if len(lines) >= 2:
+        examples.append(lines[1].strip())
+
+selected_text = st.selectbox("Select an example", examples)
+custom_input = st.text_input("Try it with your own Sentence!")
+
+text_to_analyze = custom_input if custom_input else selected_text
+
+# Without any example and without custom input there is nothing to annotate;
+# stop here instead of sending None through the pipeline.
+if not text_to_analyze:
+    st.info("Select an example or enter a sentence to analyze.")
+    st.stop()
+
+st.subheader('Full example text')
+HTML_WRAPPER = """{}
"""
+# NOTE(review): text_to_analyze may come straight from the text box and is
+# rendered with unsafe_allow_html=True - consider html.escape() before
+# formatting it into the wrapper.
+st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
+
+# Initialize Spark and create pipeline
+spark = init_spark()
+pipeline = create_pipeline(model)
+output = fit_data(pipeline, text_to_analyze)
+
+# Display matched sentence
+st.subheader("Processed output:")
+
+results = {
+    'Document': output[0]['document'][0].result,
+    'NER Chunk': [n.result for n in output[0]['ner_chunk']],
+    "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']]
+}
+
+# Entity-label controls, rendered into the sidebar slot reserved above.
+labels_set = sorted(set(results["NER Label"]))
+labels = labels_panel.multiselect("Entity labels", options=labels_set, default=labels_set)
+
+NER_dict = {
+    'PER': 'People, including fictional.',
+    'ORG': 'Companies, agencies, institutions, etc.',
+    'LOC': 'Countries, cities, states.',
+    'DATE': 'Date, Year',
+}
+
+if labels_panel.checkbox("Explain NER Labels", value=True):
+    # Only labels that occur in this output and have a known explanation.
+    explained = [t_lab for t_lab in labels_set if t_lab in NER_dict]
+    tdf = pd.DataFrame({"NER": explained, "Meaning": [NER_dict[t_lab] for t_lab in explained]})
+    tdf.index = [''] * len(explained)
+    labels_panel.table(tdf)
+
+# Apply the sidebar selection: keep only entities whose label is ticked.
+selected_entities = [
+    (chunk, label)
+    for chunk, label in zip(results['NER Chunk'], results['NER Label'])
+    if label in labels
+]
+results['NER Chunk'] = [chunk for chunk, _ in selected_entities]
+results['NER Label'] = [label for _, label in selected_entities]
+
+annotate(results)
+
+with st.expander("View DataFrame"):
+    df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
+    df.index += 1
+    st.dataframe(df)
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9bfdb55fe4c4a7afeedff8f137e4b25c06115433
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,70 @@
+# Download base image ubuntu 18.04
+# NOTE(review): Ubuntu 18.04 is past its standard support window - consider a
+# newer base once Java 8 / Python 3.8 availability there is confirmed.
+FROM ubuntu:18.04
+
+# Set environment variables: name, numeric UID and home directory of the
+# unprivileged user that is created further down.
+ENV NB_USER jovyan
+ENV NB_UID 1000
+ENV HOME /home/${NB_USER}
+
+# Install required packages: build tooling, the distribution's Python 3,
+# OpenJDK 8 and software-properties-common (which supplies the
+# add-apt-repository command used by the next step). The final command
+# refreshes the CA certificate store.
+RUN apt-get update && apt-get install -y \
+ tar \
+ wget \
+ bash \
+ rsync \
+ gcc \
+ libfreetype6-dev \
+ libhdf5-serial-dev \
+ libpng-dev \
+ libzmq3-dev \
+ python3 \
+ python3-dev \
+ python3-pip \
+ unzip \
+ pkg-config \
+ software-properties-common \
+ graphviz \
+ openjdk-8-jdk \
+ ant \
+ ca-certificates-java \
+ && apt-get clean \
+ && update-ca-certificates -f;
+
+# Install Python 3.8 and pip (python3.8 comes from the deadsnakes PPA added
+# here; python3-pip was already requested by the previous step).
+RUN add-apt-repository ppa:deadsnakes/ppa \
+ && apt-get update \
+ && apt-get install -y python3.8 python3-pip \
+ && apt-get clean;
+
+# Set up JAVA_HOME, both as an image-level variable and in the user's .bashrc.
+# The home directory is created here and handed over by numeric UID because
+# the user account itself is only created in the following step.
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+RUN mkdir -p ${HOME} \
+ && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
+ && chown -R ${NB_UID}:${NB_UID} ${HOME}
+
+# Create a new user named "jovyan" with user ID 1000
+RUN useradd -m -u ${NB_UID} ${NB_USER}
+
+# Switch to the "jovyan" user; every later instruction runs unprivileged.
+USER ${NB_USER}
+
+# Set home and path variables for the user. ~/.local/bin is put first on PATH,
+# presumably so console scripts from the per-user pip install below (such as
+# streamlit) are found - TODO confirm.
+ENV HOME=/home/${NB_USER} \
+ PATH=/home/${NB_USER}/.local/bin:$PATH
+
+# Set the working directory to the user's home directory
+WORKDIR ${HOME}
+
+# Upgrade pip and install Python dependencies. requirements.txt is copied on
+# its own before the application code, presumably so the dependency layer can
+# be reused when only the source changes.
+RUN python3.8 -m pip install --upgrade pip
+COPY requirements.txt /tmp/requirements.txt
+RUN python3.8 -m pip install -r /tmp/requirements.txt
+
+# Copy the application code into the container at /home/jovyan
+COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
+
+# Expose port for Streamlit (the same port is passed to --server.port below)
+EXPOSE 7860
+
+# Define the entry point for the container: serve Demo.py on all interfaces.
+ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
\ No newline at end of file
diff --git a/inputs/Amharic/Example1.txt b/inputs/Amharic/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c582ca9f0ce99a19277e1112bb56505547a2e402
--- /dev/null
+++ b/inputs/Amharic/Example1.txt
@@ -0,0 +1,2 @@
+አህመድ ቫንዳ ከ3-10-2000 ጀምሮ በአዲ...
+አህመድ ቫንዳ ከ3-10-2000 ጀምሮ በአዲስ አበባ ኖሯል።
\ No newline at end of file
diff --git a/inputs/Amharic/Example2.txt b/inputs/Amharic/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5579c85a8cdd52c34d901d87589f6a984d9d5bc5
--- /dev/null
+++ b/inputs/Amharic/Example2.txt
@@ -0,0 +1,2 @@
+ሰማያዊ ፓርቲ ዛሬ በወቅታዊ የሀገሪቱ የፖለቲካ ጉዳዮች ላይ በመኢአድ ጽህፈት ቤት የሰጠው ጋዜጣዊ መግለ...
+ሰማያዊ ፓርቲ ዛሬ በወቅታዊ የሀገሪቱ የፖለቲካ ጉዳዮች ላይ በመኢአድ ጽህፈት ቤት የሰጠው ጋዜጣዊ መግለጫ ይከተላል ፡፡
\ No newline at end of file
diff --git a/inputs/Amharic/Example3.txt b/inputs/Amharic/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0442ec7e95da834da5e818149b2106dd78343849
--- /dev/null
+++ b/inputs/Amharic/Example3.txt
@@ -0,0 +1,2 @@
+የ ዓመቱ አዲሱ የዚምባብዌ ፕሬዚደንት ኤመርሰን ምናንጋግዋ በሁለቱ ቻዎቻቸው አንፃር በዕድሜ...
+የ ዓመቱ አዲሱ የዚምባብዌ ፕሬዚደንት ኤመርሰን ምናንጋግዋ በሁለቱ ቻዎቻቸው አንፃር በዕድሜ ትንሹ ናቸው ።
\ No newline at end of file
diff --git a/inputs/Amharic/Example4.txt b/inputs/Amharic/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..27e535ca7ba8e738f79dd2874c757c5b19832afa
--- /dev/null
+++ b/inputs/Amharic/Example4.txt
@@ -0,0 +1,2 @@
+ዶይቸ ቬለ ያነጋገራቸው የመብት ተሟጋቿ ሊንዳ ማዜሪሬ በዕድሙ በተዳከሙ መሪዎች ነው የምንተዳደረው በማለት የርሳቸውን እና የአህጉሩን ወጣት ትውልድ ቅ...
+ዶይቸ ቬለ ያነጋገራቸው የመብት ተሟጋቿ ሊንዳ ማዜሪሬ በዕድሙ በተዳከሙ መሪዎች ነው የምንተዳደረው በማለት የርሳቸውን እና የአህጉሩን ወጣት ትውልድ ቅሬታ ገልጸዋል ።
\ No newline at end of file
diff --git a/inputs/Amharic/Example5.txt b/inputs/Amharic/Example5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e5ba2f3869229f99e8034c3a4a1ee364c8c9b9d
--- /dev/null
+++ b/inputs/Amharic/Example5.txt
@@ -0,0 +1,2 @@
+ሪታ ፓንክኸርስት የኢትዮጵያ ባለውለታ አዲስ አበባ ላይ ትዳር የመሰረቱት የኢትዮጵያ ታሪክ ተመራማሪዎች በትዳር ከ ዓመታት በ...
+ሪታ ፓንክኸርስት የኢትዮጵያ ባለውለታ አዲስ አበባ ላይ ትዳር የመሰረቱት የኢትዮጵያ ታሪክ ተመራማሪዎች በትዳር ከ ዓመታት በላይ ዘልቀዋል ።
\ No newline at end of file
diff --git a/inputs/Amharic/Example6.txt b/inputs/Amharic/Example6.txt
new file mode 100644
index 0000000000000000000000000000000000000000..653eb6d924c9cff50e8a9a30ff1697b473b853d8
--- /dev/null
+++ b/inputs/Amharic/Example6.txt
@@ -0,0 +1,2 @@
+በሳልስቱ እስራኤል ዉስጥ በተደረገዉ አጠቃላይ ምርጫ አክራሪዉ የጠቅላይ ሚንስትር ቤንያሚን ኔትንያሁ ፓርቲ ...
+በሳልስቱ እስራኤል ዉስጥ በተደረገዉ አጠቃላይ ምርጫ አክራሪዉ የጠቅላይ ሚንስትር ቤንያሚን ኔትንያሁ ፓርቲ ሊኩድ አሸነፈ ።
\ No newline at end of file
diff --git a/inputs/Hausa/Example1.txt b/inputs/Hausa/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..09229a7162d043cda50065424980c9e1bc5e388e
--- /dev/null
+++ b/inputs/Hausa/Example1.txt
@@ -0,0 +1,2 @@
+A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim ...
+A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz
\ No newline at end of file
diff --git a/inputs/Hausa/Example2.txt b/inputs/Hausa/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a255f6aebf89132ccace3f5aeb68dfc525af27f0
--- /dev/null
+++ b/inputs/Hausa/Example2.txt
@@ -0,0 +1,2 @@
+Najeriya : Kungiyar Ma'aikatan Jami'o'i Ta Shiga Yajin Aik...
+Najeriya : Kungiyar Ma'aikatan Jami'o'i Ta Shiga Yajin Aikin Gargadi
\ No newline at end of file
diff --git a/inputs/Hausa/Example3.txt b/inputs/Hausa/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..aa6907bd4ab26e85159f37ce068f0ba2d0d161f2
--- /dev/null
+++ b/inputs/Hausa/Example3.txt
@@ -0,0 +1,2 @@
+A ranar Juma’a mai zuwa ne wa’adin yajin aikin na gargadi zai kammala , kuma a hirar su da wakilin M...
+A ranar Juma’a mai zuwa ne wa’adin yajin aikin na gargadi zai kammala , kuma a hirar su da wakilin Muaryar Amurka , Komared Mohammed Jaji ya yi tsokaci game da mataki na gaba .
\ No newline at end of file
diff --git a/inputs/Hausa/Example4.txt b/inputs/Hausa/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..df06fb29928bd8d78c8c895dc7c33cac91d118f3
--- /dev/null
+++ b/inputs/Hausa/Example4.txt
@@ -0,0 +1,2 @@
+Kan haka Majalisar Dinkin Duniya ta zabi Aliko Dangote , da shugaban bankin raya Afirka , da wassu m...
+Kan haka Majalisar Dinkin Duniya ta zabi Aliko Dangote , da shugaban bankin raya Afirka , da wassu mutane 25 a fadin duniya su jagoranci magance matsalar tamowa , kafin shekara 2030 .
\ No newline at end of file
diff --git a/inputs/Hausa/Example5.txt b/inputs/Hausa/Example5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d7fcf1fd21dfbd61f60c8392a2a7abd2f54b4
--- /dev/null
+++ b/inputs/Hausa/Example5.txt
@@ -0,0 +1,2 @@
+Temitope Olatoye Sugar shine mai wakiltar mazabar Lagelu da Akinyele daga jihar Oyo , a majalisar wa...
+Temitope Olatoye Sugar shine mai wakiltar mazabar Lagelu da Akinyele daga jihar Oyo , a majalisar wakilan tarayyar Najeriya .
\ No newline at end of file
diff --git a/inputs/Hausa/Example6.txt b/inputs/Hausa/Example6.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fa6112b849a8e565bf7240a0ba5d5207b52b483f
--- /dev/null
+++ b/inputs/Hausa/Example6.txt
@@ -0,0 +1,2 @@
+Tsohon mataimakin shugaban Najeriya , kuma dan takarar shugaban kasa a zaben 2019 karkashin jam’iyya...
+Tsohon mataimakin shugaban Najeriya , kuma dan takarar shugaban kasa a zaben 2019 karkashin jam’iyyar adawa ta PDP , Atiku Abubakar , ya yi Allah wadai da yunkurin da wasu sojoji suka yi na kifar da “ zababbiyar gwamnatin Habasha .
\ No newline at end of file
diff --git a/inputs/Igbo/Example1.txt b/inputs/Igbo/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec0079c2c2c95d5f407d4485acbc034a4d808caa
--- /dev/null
+++ b/inputs/Igbo/Example1.txt
@@ -0,0 +1,2 @@
+Osote onye - isi ndị ome - iwu Naịjirịa bụ Ike Ekweremadu ekwuola na ike agwụla ndị Sịnatị iji otu n...
+Osote onye - isi ndị ome - iwu Naịjirịa bụ Ike Ekweremadu ekwuola na ike agwụla ndị Sịnatị iji otu nkeji darajụụ akwanyere ndị egburu n'ime oke ọgbaghara dị na Naịjirịa oge ọ bula .
\ No newline at end of file
diff --git a/inputs/Igbo/Example2.txt b/inputs/Igbo/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2f74af6bdefb0675140bd6b1873ea95ef0ce15f
--- /dev/null
+++ b/inputs/Igbo/Example2.txt
@@ -0,0 +1,2 @@
+Okwu a Buhari kwuru na isi ndọrọndọrọ ọchịchị na 2015 bu ịhe eji kpụrụ ya na ọnụ ugbua , ọkachasị ka...
+Okwu a Buhari kwuru na isi ndọrọndọrọ ọchịchị na 2015 bu ịhe eji kpụrụ ya na ọnụ ugbua , ọkachasị ka ụlọ ọrụ na - ahụ maka ọnụ ọgụgụ a na - akpọ National Bureau of Statistics ( NBS ) nwepụtara ozi n'akọwa na mmadụ ruru nde asaa na nari ise so na ndị enweghi ọrụ kemgbe afọ 2016 .
\ No newline at end of file
diff --git a/inputs/Igbo/Example3.txt b/inputs/Igbo/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..26f0cc280179d75adac40c7b3deb28d766fef229
--- /dev/null
+++ b/inputs/Igbo/Example3.txt
@@ -0,0 +1,2 @@
+Google Africa kwuru n'igwe okwu Twitter sị : Taa , anyị na - akwanyere onye egwuregwu bọọlụ a ma ama...
+Google Africa kwuru n'igwe okwu Twitter sị : Taa , anyị na - akwanyere onye egwuregwu bọọlụ a ma ama , Stephen Keshi ugwu .
\ No newline at end of file
diff --git a/inputs/Igbo/Example4.txt b/inputs/Igbo/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3c54542900fe5513b03c74cd1669686aade1ecaa
--- /dev/null
+++ b/inputs/Igbo/Example4.txt
@@ -0,0 +1,2 @@
+Keshi chịrị ndị otu egwuregwu Super Eagles kemgbe afọ 2011 ma durukwa ha gaa asọmpi dị iche iche nke...
+Keshi chịrị ndị otu egwuregwu Super Eagles kemgbe afọ 2011 ma durukwa ha gaa asọmpi dị iche iche nke gụnyere ; Iko Mba Afrika na 2013 ( nke ha bulatara Naịjirịa ) , iko mpaghara Afrịka dị iche iche na 2013 , ma nye aka wetara Naijiria ọnọdụ n'asọmpi Iko Mbaụwa niile na 2014 .
\ No newline at end of file
diff --git a/inputs/Igbo/Example5.txt b/inputs/Igbo/Example5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..812ce89dd30d37eb34dce8f4e7448b54499f1fc6
--- /dev/null
+++ b/inputs/Igbo/Example5.txt
@@ -0,0 +1,2 @@
+N' akwụkwọ ozi , ngalaba 'US Department' tinyere na websait ha , ha kwuru sị : Yunaited Steeti na - ...
+N' akwụkwọ ozi , ngalaba 'US Department' tinyere na websait ha , ha kwuru sị : Yunaited Steeti na - enwe obi mwute n' iyi ọrụ nke onye ndu ndị na - ama gọọmenti Kenya aka n'ihu bụ Raila Odinga duru onwe ya ka ọnwa Jenuwari gbara ịrị atọ .
\ No newline at end of file
diff --git a/inputs/Igbo/Example6.txt b/inputs/Igbo/Example6.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9fc0244439842c3e2ea5ba1e199da37c2c8acad3
--- /dev/null
+++ b/inputs/Igbo/Example6.txt
@@ -0,0 +1,2 @@
+Taa , otu n'ime ndị kewapụtara n'otu ndọrọndọrọ ọchịchị APC kpọrọ ndị ntaakụkọ n'isi ụlọọrụ ha maka ...
+Taa , otu n'ime ndị kewapụtara n'otu ndọrọndọrọ ọchịchị APC kpọrọ ndị ntaakụkọ n'isi ụlọọrụ ha maka ị kọwa echiche ha n'esomokwu nke di n'etiti ndị APC nke Imo steeti . N'ọnụ okwu TOE Ekechi bụ onụ na - ekwuchitere otu a , ha na - ebo gọvanọ Okorocha ebubo na o nupuru iwu ji patu ha isi ọtụtụ
\ No newline at end of file
diff --git a/inputs/Igbo/Example7.txt b/inputs/Igbo/Example7.txt
new file mode 100644
index 0000000000000000000000000000000000000000..87dcac45720e60fa7a2f9a7ec8df4efeacd56159
--- /dev/null
+++ b/inputs/Igbo/Example7.txt
@@ -0,0 +1,2 @@
+Otu kporo onweha 'The Coalition of Northern Groups' na bekee gwara onyeisiala Naịjirịa bụ Muhammadu ...
+Otu kporo onweha 'The Coalition of Northern Groups' na bekee gwara onyeisiala Naịjirịa bụ Muhammadu Buhuri na onye chiburu dịka osote onyeisiala n'oge garaaga bụ Atiku Abubakar na ọ ga - adị mma maọbụrụ na ha abụọ wepuru aka n'ime ọsọ ị banye n'ọkwa ọchịchị dịka onyeisiala n'afọ 2019 .
\ No newline at end of file
diff --git a/inputs/Kinyarwanda/Example1.txt b/inputs/Kinyarwanda/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eb0029c73fa0a16519b1a78e0070665a6598bfd4
--- /dev/null
+++ b/inputs/Kinyarwanda/Example1.txt
@@ -0,0 +1,2 @@
+Ambasaderi w’Umuryango w’Ubumwe bw’u Burayi mu Rwanda , Nicola Bellomo , aherutse gushima uko u Rwan...
+Ambasaderi w’Umuryango w’Ubumwe bw’u Burayi mu Rwanda , Nicola Bellomo , aherutse gushima uko u Rwanda rurimo guhangana n’icyorezo cya Coronavirus , yizeza ko uyu muryango uzakomeza gufatanya na rwo muri uru rugamba no mu zindi gahunda z’iterambere .
\ No newline at end of file
diff --git a/inputs/Kinyarwanda/Example2.txt b/inputs/Kinyarwanda/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d56c1aeb356a27a54a4526fb981688f9c92e627a
--- /dev/null
+++ b/inputs/Kinyarwanda/Example2.txt
@@ -0,0 +1,2 @@
+Imibare ya Banki y’Isi yo kuwa 9 Mata igaragaza ko ubukungu bwo muri Afurika yo-munsi y’Ubutayu bwa ...
+Imibare ya Banki y’Isi yo kuwa 9 Mata igaragaza ko ubukungu bwo muri Afurika yo-munsi y’Ubutayu bwa Sahara , bwagizweho ingaruka na Coronavirus ndetse ko buzamanuka ku kigero cya - 2 .
\ No newline at end of file
diff --git a/inputs/Kinyarwanda/Example3.txt b/inputs/Kinyarwanda/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7960724bce8209fed0cb6691f5aff727cd4faf36
--- /dev/null
+++ b/inputs/Kinyarwanda/Example3.txt
@@ -0,0 +1,2 @@
+Amagambo ya Mateke anahura n’ay’umudepite Ruth Nankabirwa , kuri uyu kuwa Gatatu wabwiye bagenzi be ...
+Amagambo ya Mateke anahura n’ay’umudepite Ruth Nankabirwa , kuri uyu kuwa Gatatu wabwiye bagenzi be mu Nteko Ishinga Amategeko ko Guverinoma ya Uganda ikwiye gukemura bwangu ikibazo ifitanye n’u Rwanda , ariko asa n’uca amarenga ku buryo bwakoreshwa .
\ No newline at end of file
diff --git a/inputs/Kinyarwanda/Example4.txt b/inputs/Kinyarwanda/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a7f47fc0f87217ba36193eb2461d14c6f0805da
--- /dev/null
+++ b/inputs/Kinyarwanda/Example4.txt
@@ -0,0 +1,2 @@
+Ubwo bari ku ngingo zijyanye n’uko Uganda ifasha imitwe yitwaje intwaro , Nduhungirehe yatanze uruge...
+Ubwo bari ku ngingo zijyanye n’uko Uganda ifasha imitwe yitwaje intwaro , Nduhungirehe yatanze urugero rw’igitero cyabaye mu ijoro rishyira ku itariki ya Kane Ukwakira aho abarwanyi b’umutwe wa RUD Urunana bateye mu Kinigi .
\ No newline at end of file
diff --git a/inputs/Luganda/Example1.txt b/inputs/Luganda/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8da175836b1f5cab59e5ddcb776c8ceec54ec5db
--- /dev/null
+++ b/inputs/Luganda/Example1.txt
@@ -0,0 +1,2 @@
+Phillip Wokorach , Justin Kimono ne Adrian Kisito be bamu ku baayambye Uganda , eyawangula empala zi...
+Phillip Wokorach , Justin Kimono ne Adrian Kisito be bamu ku baayambye Uganda , eyawangula empala zino omwaka oguwedde , okuva emabeganefuna obuwanguzi .
\ No newline at end of file
diff --git a/inputs/Luganda/Example2.txt b/inputs/Luganda/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4825a4bdef4eca92097007fbda37eb013067875b
--- /dev/null
+++ b/inputs/Luganda/Example2.txt
@@ -0,0 +1,2 @@
+Oluvannyuma yaddukira mu Zimbabwe ngakozesa Paasipooti eyali mu mannya ga David Mubiru , kyokka aboo...
+Oluvannyuma yaddukira mu Zimbabwe ngakozesa Paasipooti eyali mu mannya ga David Mubiru , kyokka aboobuyinza baamuyigga ne bamukomyawo mu Uganda , mu November 2016 , okumalayo ekibonerezo ekyemyaka ena nemyaka emirala ebiri , egyamwongerwako olwokutoloka mu kkomera .
\ No newline at end of file
diff --git a/inputs/Luganda/Example3.txt b/inputs/Luganda/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..85e5fe871ad3bd8bce9c910b5b60155f61c792ee
--- /dev/null
+++ b/inputs/Luganda/Example3.txt
@@ -0,0 +1,2 @@
+DPC wa Rakai , Patience Baganzi yategeezezza nti bagenda kumukwasa poliisi ye Katwe mu Kampala gye y...
+DPC wa Rakai , Patience Baganzi yategeezezza nti bagenda kumukwasa poliisi ye Katwe mu Kampala gye yaddiza omusango avunaanibwe .
\ No newline at end of file
diff --git a/inputs/Luganda/Example4.txt b/inputs/Luganda/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ffac673947519179a79490a2369b910ab1b6f230
--- /dev/null
+++ b/inputs/Luganda/Example4.txt
@@ -0,0 +1,2 @@
+OMWAMI wa Ssabasajja owessaza lya Mawokota afudde kibwatukira nalekabanna Mawokota mu kiyongobero . ...
+OMWAMI wa Ssabasajja owessaza lya Mawokota afudde kibwatukira nalekabanna Mawokota mu kiyongobero . Kayima David Ssekyeru afudde mu ngeri yentiisa bwaseredde nagwa mu kinaabiro nga egenze okunaaba bagenze okuyita ambulensi okumuddusa mu ddwaliro e Mmengo nafiira mu kkubo nga tebanatuuka mu ddwaliro . Ssekyeru abadde amaze wiiki emu nga mugonvugonvu kyokka abadde azeemu endasi kwekwewaliriza agende mu kinaabiro
\ No newline at end of file
diff --git a/inputs/Luganda/Example5.txt b/inputs/Luganda/Example5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..42529dbb9c673ddcfb6c17461b9792fbc187851e
--- /dev/null
+++ b/inputs/Luganda/Example5.txt
@@ -0,0 +1,2 @@
+Omwogezi wa poliisi mu Greater Masaka , ASP Paul Kangave yategeezezza Bukedde nti poliisi yatandikid...
+Omwogezi wa poliisi mu Greater Masaka , ASP Paul Kangave yategeezezza Bukedde nti poliisi yatandikiddewo okunoonyereza oluvannyuma lwokufuna amawulire gokutemulwa kwomusuubuzi ono .
\ No newline at end of file
diff --git a/inputs/Nigerian/Example1.txt b/inputs/Nigerian/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f801cea02109772aaa02cfdff117e7c9da6128fb
--- /dev/null
+++ b/inputs/Nigerian/Example1.txt
@@ -0,0 +1,2 @@
+Jii 2 go mane gin ja apiko moro ma ja higni 20 mane oyang nyinge kaka Kevin Omondi kod achiel kuom j...
+Jii 2 go mane gin ja apiko moro ma ja higni 20 mane oyang nyinge kaka Kevin Omondi kod achiel kuom jowuoth mage mane oting' o mane iluongo ni Shopie Anyango ma ja higni 23 ne jotho mana kanyo gi kanyo e masirano mane ojuko lori moro mar kambi jo China kod apiko yoo Ringa
\ No newline at end of file
diff --git a/inputs/Nigerian/Example2.txt b/inputs/Nigerian/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..31d9d72bffada2d66504653d4862e8f57dae42ee
--- /dev/null
+++ b/inputs/Nigerian/Example2.txt
@@ -0,0 +1,2 @@
+Japuonjreno ma wuoi ma jahigni 15 ochopo e nyim jayal bura Joseph Karanja kama odonjne kod ketho mar...
+Japuonjreno ma wuoi ma jahigni 15 ochopo e nyim jayal bura Joseph Karanja kama odonjne kod ketho mar nego Noel Adhiambo midenyo ma jahigni 11 ; mane en japuonjre e skul ma Kosele Community Christian Center e kar chung' od bura ma Kasipul dwee mokalo .
\ No newline at end of file
diff --git a/inputs/Nigerian/Example3.txt b/inputs/Nigerian/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2b25c595c87b5152cc5d2e1ddf63d00b587211a8
--- /dev/null
+++ b/inputs/Nigerian/Example3.txt
@@ -0,0 +1,2 @@
+Magi oyangi gi jawach eloo State House nyadendi Kanze Dena mane owacho ni jogo nyocha opim ne tuono ...
+Magi oyangi gi jawach eloo State House nyadendi Kanze Dena mane owacho ni jogo nyocha opim ne tuono e pimo manyocha otim chieng tich 4
\ No newline at end of file
diff --git a/inputs/Nigerian/Example4.txt b/inputs/Nigerian/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..76cec4711fa2934a651daabc14e1ca7d88d7a7c5
--- /dev/null
+++ b/inputs/Nigerian/Example4.txt
@@ -0,0 +1,2 @@
+Kanomedochiwo ler ewii wachno Kanze nowacho ni jotich duto mag State House ipimoga moting' o e kinde...
+Kanomedochiwo ler ewii wachno Kanze nowacho ni jotich duto mag State House ipimoga moting' o e kinde ka kinde moting' o nyaka jatend piny Kenya migosi Uhuru Kenyata gi familia mare mar ng' eyo chal margi ne tuo mar Covid - 19no kowacho ni jii 4 mane oyudi ni kod tuono sani jonie kar thieth ma Kenyatta University Teaching , Referal and Research Hospital ma gidhiyoe nyime gi yudo thieth
\ No newline at end of file
diff --git a/inputs/Pidgin/Example1.txt b/inputs/Pidgin/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0fb1fd22f8c4288623c91d8e60a293bec19749ff
--- /dev/null
+++ b/inputs/Pidgin/Example1.txt
@@ -0,0 +1,2 @@
+Popular cable satellite broadcaster DsTV , no get right to Bundesliga live matches for di 2019 / 202...
+Popular cable satellite broadcaster DsTV , no get right to Bundesliga live matches for di 2019 / 2020 season so na pipo wey get StarTimes dey in luck because na dem get broadcast rights for Sub - Saharan Africa .
\ No newline at end of file
diff --git a/inputs/Pidgin/Example2.txt b/inputs/Pidgin/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3551a681c1a962e4fbd310587fb0079b50102990
--- /dev/null
+++ b/inputs/Pidgin/Example2.txt
@@ -0,0 +1,2 @@
+Whichever way wey you watch just know say you dey part of one billion pipo wey Bayern CEO Karl - Hei...
+Whichever way wey you watch just know say you dey part of one billion pipo wey Bayern CEO Karl - Heinz Rummenigge don gauge say go watch dis weekend live matches See Saturday games .
\ No newline at end of file
diff --git a/inputs/Pidgin/Example3.txt b/inputs/Pidgin/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bd9a562180bdc6992b21b008f0dfc99700ae9f56
--- /dev/null
+++ b/inputs/Pidgin/Example3.txt
@@ -0,0 +1,2 @@
+Conditions Spain top league and working place of Lionel Messi dey torchlight June 12 as di date when...
+Conditions Spain top league and working place of Lionel Messi dey torchlight June 12 as di date when dem go resume di season .
\ No newline at end of file
diff --git a/inputs/Pidgin/Example4.txt b/inputs/Pidgin/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6caa1bc4dddca03b3d57a2d168f43e823ba50393
--- /dev/null
+++ b/inputs/Pidgin/Example4.txt
@@ -0,0 +1,2 @@
+LA Lakers legend Kobe Bryant and im daughter Gianna plus seven oda die for helicopter crash for di c...
+LA Lakers legend Kobe Bryant and im daughter Gianna plus seven oda die for helicopter crash for di city of Calabasa , California on Sunday 26 January .
\ No newline at end of file
diff --git a/inputs/Pidgin/Example5.txt b/inputs/Pidgin/Example5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fcbba2a8525d4777842e13fc71624190bfe95444
--- /dev/null
+++ b/inputs/Pidgin/Example5.txt
@@ -0,0 +1,2 @@
+Ighalo move go Chinese Super League for 2017 , first with Changch...
+Ighalo move go Chinese Super League for 2017 , first with Changchun Yatai .
\ No newline at end of file
diff --git a/inputs/Pidgin/Example6.txt b/inputs/Pidgin/Example6.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1698f4fb612b3ecde41bdb1c1ece140dff283218
--- /dev/null
+++ b/inputs/Pidgin/Example6.txt
@@ -0,0 +1,2 @@
+Senegal and Liverpool forward Mane beat both Egypt player Mohammed Salah and Algeria winger Riyad Ma...
+Senegal and Liverpool forward Mane beat both Egypt player Mohammed Salah and Algeria winger Riyad Mahrez to win di award wey dem do for Egypt on Tuesday .
\ No newline at end of file
diff --git a/inputs/Swahilu/Example1.txt b/inputs/Swahilu/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3574dbd12325aac9b192cce13924d5fc4d4c4bc1
--- /dev/null
+++ b/inputs/Swahilu/Example1.txt
@@ -0,0 +1,2 @@
+Wanamgambo wa ADF Mauaji ya Alhamisi katika mkoa wa Mbau kaskazini mwa Beni yanashukiwa kufanya na k...
+Wanamgambo wa ADF Mauaji ya Alhamisi katika mkoa wa Mbau kaskazini mwa Beni yanashukiwa kufanya na kundi la waasi la Allied Democratic Force , ADF , ambalo linahusika na mfululizo wa mauaji tangu kuanza kwa ghasia mwezi November .
\ No newline at end of file
diff --git a/inputs/Swahilu/Example2.txt b/inputs/Swahilu/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f14b8f2aa85ae0cbfc3c8484c7b149d024562ed6
--- /dev/null
+++ b/inputs/Swahilu/Example2.txt
@@ -0,0 +1,2 @@
+Jeshi la Congo limegundua ‘kiwanda cha kutengeneza mabomu ya kienyeji’ katika kambi moja ya ADF wali...
+Jeshi la Congo limegundua ‘kiwanda cha kutengeneza mabomu ya kienyeji’ katika kambi moja ya ADF waliyoiteka , msemaji wa jeshi jenerali Leon Richard Kasonga amesema Jumatano .
\ No newline at end of file
diff --git a/inputs/Swahilu/Example3.txt b/inputs/Swahilu/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c35adc69f65da1898f60cc0ca037b339ffe34f94
--- /dev/null
+++ b/inputs/Swahilu/Example3.txt
@@ -0,0 +1,2 @@
+Wajumbe wa kikosi kazi cha virusi vya corona cha White House wamepangiwa kutoa ushuhuda mbele ya kam...
+Wajumbe wa kikosi kazi cha virusi vya corona cha White House wamepangiwa kutoa ushuhuda mbele ya kamati ya Nishati na Biashara ya Baraza la Wawakilishi Jumanne , na Spika wa Baraza la Wawakilishi Nancy Pelosi amesema , “ Wananchi wa Marekani wanahitaji majibu kwa nini Rais Trump anataka upimaji upunguzwe kasi wakati wataalam wanasema upimaji zaidi unahitajika .
\ No newline at end of file
diff --git a/inputs/Swahilu/Example4.txt b/inputs/Swahilu/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cf11c4af4bac0ae7f70511d9643fcfd57e7eb4d7
--- /dev/null
+++ b/inputs/Swahilu/Example4.txt
@@ -0,0 +1,2 @@
+Siku Jumatano maafisa wawili wa Umoja wa Mataifa watawasilisha ripoti inayoeleza kwamba kuna ushahid...
+Siku Jumatano maafisa wawili wa Umoja wa Mataifa watawasilisha ripoti inayoeleza kwamba kuna ushahidi wa kutosha unaodhihirisha kwamba Saudi Arabia ilidukua simu ya Bezos .
\ No newline at end of file
diff --git a/inputs/Swahilu/Example5.txt b/inputs/Swahilu/Example5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..749fea6afa3ca7d2993dcecc528b7aa8341cff84
--- /dev/null
+++ b/inputs/Swahilu/Example5.txt
@@ -0,0 +1,2 @@
+Mahakama ya Juu ya Korea Kusini imeamrisha mahakama ya chini ifikirie tena moja ya mashtaka ya jinai...
+Mahakama ya Juu ya Korea Kusini imeamrisha mahakama ya chini ifikirie tena moja ya mashtaka ya jinai dhidi ya Rais wa zamani Park Geun - hye ambaye alilazimishwa kuondoka madarakani mwaka 2017 kutokana na kashfa ya ufisadi .
\ No newline at end of file
diff --git a/inputs/Swahilu/Example6.txt b/inputs/Swahilu/Example6.txt
new file mode 100644
index 0000000000000000000000000000000000000000..489c7a9a873f3960afbded9b0bcb38cd3ddbba07
--- /dev/null
+++ b/inputs/Swahilu/Example6.txt
@@ -0,0 +1,2 @@
+Waziri Mkuu wa Uingereza Boris Johnson amesema ataheshimu utaratibu wa sheria lakini Uingereza itaji...
+Waziri Mkuu wa Uingereza Boris Johnson amesema ataheshimu utaratibu wa sheria lakini Uingereza itajiondowa kutoka Umoja wa Ulaya ( EU ) ifikapo Oktoba 31 .
\ No newline at end of file
diff --git a/inputs/Wolof/Example1.txt b/inputs/Wolof/Example1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36d6177f8db721c142557343d56e79ecfad5c206
--- /dev/null
+++ b/inputs/Wolof/Example1.txt
@@ -0,0 +1,2 @@
+Dafa di sax , ni mu ame woon noonu fit moo taxoon ñu dàq ko , moom ak benn doomu Farãs bu daan wuyoo...
+Dafa di sax , ni mu ame woon noonu fit moo taxoon ñu dàq ko , moom ak benn doomu Farãs bu daan wuyoo ci turu Daniel Cohn - Bendit , ca daara ju mag jooju , ci atum 1969 .
\ No newline at end of file
diff --git a/inputs/Wolof/Example2.txt b/inputs/Wolof/Example2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..28b2f08e3621941ac55591a5903ab7446cd8c5e5
--- /dev/null
+++ b/inputs/Wolof/Example2.txt
@@ -0,0 +1,2 @@
+Usmaan Sonkoo ngi juddoo Cees ci a...
+Usmaan Sonkoo ngi juddoo Cees ci atum 1974 .
\ No newline at end of file
diff --git a/inputs/Wolof/Example3.txt b/inputs/Wolof/Example3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2bedb40dfb6dfc06c8a1cb72b84f806a63e8ba9c
--- /dev/null
+++ b/inputs/Wolof/Example3.txt
@@ -0,0 +1,2 @@
+Waaw , Isaa Sàll nekkoon na fi Njiitu ndajem diiwaanu Fatig ci njeexitalu a...
+Waaw , Isaa Sàll nekkoon na fi Njiitu ndajem diiwaanu Fatig ci njeexitalu atiy 1990 .
\ No newline at end of file
diff --git a/inputs/Wolof/Example4.txt b/inputs/Wolof/Example4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4f90fe2afa12e413f35196c77c000e161034ddae
--- /dev/null
+++ b/inputs/Wolof/Example4.txt
@@ -0,0 +1,2 @@
+IRÃ NDAW : Komisaariya bu Ndaakaaru woolu na waaraatekatu ...
+IRÃ NDAW : Komisaariya bu Ndaakaaru woolu na waaraatekatu Sentv bi .
\ No newline at end of file
diff --git a/inputs/Wolof/Example5.txt b/inputs/Wolof/Example5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ccae3edd758757aee1619202fbadc9716b5ba867
--- /dev/null
+++ b/inputs/Wolof/Example5.txt
@@ -0,0 +1,2 @@
+Ciy ati 60 , bokkoon na ci ” Groupe de Grenoble ” kurél gu doon jéem a suqali làmmiñi réew mi mook ñ...
+Ciy ati 60 , bokkoon na ci ” Groupe de Grenoble ” kurél gu doon jéem a suqali làmmiñi réew mi mook ñoomin Asan Silla , Masàmba Sare ak Saaliyu Kànji ak it Ablaay Wàdd mi fi doonoon njiitu réewum Senegaal .
\ No newline at end of file
diff --git "a/inputs/Yor\303\271b\303\241/Example1.txt" "b/inputs/Yor\303\271b\303\241/Example1.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..d8f5785a9b5b8bd66423638e3fcc542ca7eb655f
--- /dev/null
+++ "b/inputs/Yor\303\271b\303\241/Example1.txt"
@@ -0,0 +1,2 @@
+Ẹgbẹ́ Ohùn Àgbáyé dúró ṣinṣin pẹ̀lú Luis Carlos , ẹbíi rẹ̀ , àti oníròyìn aládàáṣiṣẹ́ gbogbo àwọn tí...
+Ẹgbẹ́ Ohùn Àgbáyé dúró ṣinṣin pẹ̀lú Luis Carlos , ẹbíi rẹ̀ , àti oníròyìn aládàáṣiṣẹ́ gbogbo àwọn tí ó ń mú ìjọba ṣe bí ó ti yẹ ní Venezuela .
\ No newline at end of file
diff --git "a/inputs/Yor\303\271b\303\241/Example2.txt" "b/inputs/Yor\303\271b\303\241/Example2.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..68d9dd627ed38f33e5254af7a2cfd1224cb3f44e
--- /dev/null
+++ "b/inputs/Yor\303\271b\303\241/Example2.txt"
@@ -0,0 +1,2 @@
+Ilé - iṣẹ́ẹ Mohammed Sani Musa , Activate Technologies Limited , ni ó kó ẹ̀rọ Ìwé - pélébé Ìdìbò Alá...
+Ilé - iṣẹ́ẹ Mohammed Sani Musa , Activate Technologies Limited , ni ó kó ẹ̀rọ Ìwé - pélébé Ìdìbò Alálòpẹ́ ( PVCs ) tí a lò fún ọdún - un 2019 , nígbà tí ó jẹ́ òǹdíjedupò lábẹ́ ẹgbẹ́ olóṣèlúu tí ó ń tukọ̀ ètò ìṣèlú lọ́wọ́ All Progressives Congress ( APC ) fún Aṣojú Ìlà - Oòrùn Niger , ìyẹn gẹ́gẹ́ bí ilé iṣẹ́ aṣèwádìí , Premium Times ṣe tẹ̀ ẹ́
\ No newline at end of file
diff --git "a/inputs/Yor\303\271b\303\241/Example3.txt" "b/inputs/Yor\303\271b\303\241/Example3.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..2bdaacf326b3e92dc9fa519c9af13fb80bfc6093
--- /dev/null
+++ "b/inputs/Yor\303\271b\303\241/Example3.txt"
@@ -0,0 +1,2 @@
+Nínú oṣù Agẹmọ 2019 , ní ìṣojú ọlọ́pàá , Abbo ṣe àṣemáṣe pẹ̀lú òṣìṣẹ́bìnrin kan nínú ìsọ̀ ohun ìbálò...
+Nínú oṣù Agẹmọ 2019 , ní ìṣojú ọlọ́pàá , Abbo ṣe àṣemáṣe pẹ̀lú òṣìṣẹ́bìnrin kan nínú ìsọ̀ ohun ìbálòpọ̀ ní olú - ìlú Nàìjíríà ní Abuja .
\ No newline at end of file
diff --git "a/inputs/Yor\303\271b\303\241/Example4.txt" "b/inputs/Yor\303\271b\303\241/Example4.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..2d4db858921b970c3fd85e07316c2779e3b36723
--- /dev/null
+++ "b/inputs/Yor\303\271b\303\241/Example4.txt"
@@ -0,0 +1,2 @@
+Abba Moro , tí í ṣe ọmọ ẹgbẹ́ẹ PDP , ni aṣojú fún ẹ̀ka Gúúsù Benue , àárín gbùngbùn àríwá ...
+Abba Moro , tí í ṣe ọmọ ẹgbẹ́ẹ PDP , ni aṣojú fún ẹ̀ka Gúúsù Benue , àárín gbùngbùn àríwá Nàìjíríà .
\ No newline at end of file
diff --git a/pages/Workflow & Model Overview.py b/pages/Workflow & Model Overview.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a42d507b5688f0a5a0238f8ecc125623a1ebbf
--- /dev/null
+++ b/pages/Workflow & Model Overview.py
@@ -0,0 +1,246 @@
+import streamlit as st
+
+# Custom CSS for better styling
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+# Main Title
+st.markdown('NER Model for 10 African Languages
', unsafe_allow_html=True)
+
+# What is Named Entity Recognition (NER)?
+st.markdown('What is Named Entity Recognition (NER)?
', unsafe_allow_html=True)
+st.markdown("""
+
+
Named Entity Recognition (NER) is a crucial task in Natural Language Processing (NLP) that involves identifying and classifying entities within a text into predefined categories such as names of people, organizations, locations, dates, and more. NER helps in structuring unstructured text, making it easier to analyze and extract meaningful information.
+
For example, in the sentence "Barack Obama was born in Hawaii," NER would identify "Barack Obama" as a person (PER) and "Hawaii" as a location (LOC).
+
+""", unsafe_allow_html=True)
+
+# Importance of NER
+st.markdown('Importance of NER
', unsafe_allow_html=True)
+st.markdown("""
+
+
NER is essential for various applications, including:
+
+ - Information Extraction: Extracting structured information from unstructured text, such as news articles, social media posts, and legal documents.
+ - Search and Retrieval: Enhancing search engines and information retrieval systems by indexing and retrieving documents based on named entities.
+ - Content Recommendation: Improving content recommendation systems by understanding the context and entities mentioned in the text.
+ - Question Answering: Enhancing question-answering systems by accurately identifying entities within the text to provide precise answers.
+
+
+""", unsafe_allow_html=True)
+
+# Description
+st.markdown('Description
', unsafe_allow_html=True)
+st.markdown("""
+
+
This model is imported from Hugging Face. It’s been trained using xlm_roberta_large
fine-tuned model on 10 African languages: Amharic, Hausa, Igbo, Kinyarwanda, Luganda, Luo, Nigerian Pidgin, Swahili, Wolof, and Yorùbá.
+
+""", unsafe_allow_html=True)
+
+# Predicted Entities
+st.markdown('Predicted Entities
', unsafe_allow_html=True)
+st.markdown("""
+
+
+ - DATE: Refers to temporal expressions such as specific dates and periods (e.g., "3-10-2000").
+ - LOC: Refers to geographical locations, including cities, countries, and other landmarks (e.g., "Addis Ababa").
+ - PER: Refers to names of people or characters (e.g., "Ahmed Vanda").
+ - ORG: Refers to names of organizations, companies, and institutions (e.g., "United Nations").
+
+
+""", unsafe_allow_html=True)
+
+# How to use
+st.markdown('How to use
', unsafe_allow_html=True)
+st.markdown("""
+
+
To use this model, follow these steps in Python:
+
+""", unsafe_allow_html=True)
+st.code("""
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+
+# Define the components of the pipeline
+documentAssembler = DocumentAssembler() \\
+ .setInputCol("text") \\
+ .setOutputCol("document")
+
+sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") \\
+ .setInputCols(["document"]) \\
+ .setOutputCol("sentence")
+
+tokenizer = Tokenizer() \\
+ .setInputCols(["sentence"]) \\
+ .setOutputCol("token")
+
+tokenClassifier = XlmRoBertaForTokenClassification.pretrained("xlm_roberta_large_token_classifier_masakhaner", "xx") \\
+ .setInputCols(["sentence",'token']) \\
+ .setOutputCol("ner")
+
+ner_converter = NerConverter() \\
+ .setInputCols(["sentence", "token", "ner"]) \\
+ .setOutputCol("ner_chunk")
+
+nlpPipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, tokenClassifier, ner_converter])
+
+empty_data = spark.createDataFrame([[""]]).toDF("text")
+
+model = nlpPipeline.fit(empty_data)
+text = '''አህመድ ቫንዳ ከ3-10-2000 ጀምሮ በአዲስ አበባ ኖሯል።'''
+result = model.transform(spark.createDataFrame([[text]]).toDF("text"))
+
+# Display the results
+result.selectExpr("explode(arrays_zip(ner_chunk.result, ner_chunk.metadata)) as entity") \\
+ .selectExpr("entity['0'] as chunk", "entity['1'].entity as ner_label") \\
+ .show(truncate=False)
+""", language="python")
+
+# Results
+import pandas as pd
+
+# Create the data for the DataFrame
+data = {
+ "chunk": ["አህመድ ቫንዳ", "ከ3-10-2000 ጀምሮ", "በአዲስ አበባ"],
+ "ner_label": ["PER", "DATE", "LOC"]
+}
+
+# Creating the DataFrame
+df = pd.DataFrame(data)
+df.index += 1
+st.dataframe(df)
+
+# What Can We Do with This Model?
+st.markdown('What Can We Do with This Model?
', unsafe_allow_html=True)
+st.markdown("""
+
+
This NER model for 10 African languages enables various applications:
+
+ - Multilingual Information Extraction: Efficiently extract information from texts written in diverse African languages, which is valuable for local news, social media analysis, and governmental reports.
+ - Cross-Language Search and Retrieval: Enhance search engines to support multiple African languages, making it easier to find relevant information across different linguistic contexts.
+ - Content Recommendation: Provide more accurate content recommendations by understanding the entities mentioned in the user's preferred language.
+ - Research and Analytics: Support linguistic research and analytics by processing texts in multiple African languages to identify trends and patterns.
+
+
+""", unsafe_allow_html=True)
+
+# Model Information
+st.markdown('Model Information
', unsafe_allow_html=True)
+st.markdown("""
+
+
+ - Model Name: xlm_roberta_large_token_classifier_masakhaner
+ - Compatibility: Spark NLP 3.3.2+
+ - License: Open Source
+ - Edition: Official
+ - Input Labels: [sentence, token]
+ - Output Labels: [ner]
+ - Language: xx
+ - Case sensitive: true
+ - Max sentence length: 256
+
+
+""", unsafe_allow_html=True)
+
+# Data Source
+st.markdown('Data Source
', unsafe_allow_html=True)
+st.markdown("""
+
+
The model was trained using the dataset available at Hugging Face.
+
+""", unsafe_allow_html=True)
+
+# Benchmarking
+st.markdown('Benchmarking
', unsafe_allow_html=True)
+st.markdown("""
+
+
Evaluating the performance of NER models is crucial to understanding their effectiveness in real-world applications. Below are the benchmark results for the xlm_roberta_large_token_classifier_masakhaner
model, reported per language across the 10 African languages it covers. The metric used is the F1-score, a standard measure for evaluating classification models.
+
+""", unsafe_allow_html=True)
+st.markdown("""
+---
+| language | F1-score |
+|----------|----------|
+| amh | 75.76 |
+| hau | 91.75 |
+| ibo | 86.26 |
+| kin | 76.38 |
+| lug | 84.64 |
+| luo | 80.65 |
+| pcm | 89.55 |
+| swa | 89.48 |
+| wol | 70.70 |
+| yor | 82.05 |
+---
+""", unsafe_allow_html=True)
+
+st.markdown("""
+
+
These results demonstrate the model's ability to accurately identify and classify named entities in multiple African languages. The F1-scores indicate the balance between precision and recall for each language, reflecting the model's robustness across diverse linguistic contexts.
+
+""", unsafe_allow_html=True)
+
+# Conclusion/Summary
+st.markdown('Conclusion
', unsafe_allow_html=True)
+st.markdown("""
+
+
The xlm_roberta_large_token_classifier_masakhaner
model delivers strong performance in recognizing named entities in 10 African languages. This model leverages xlm_roberta_large
embeddings to enhance its understanding and accuracy in identifying entities such as persons, locations, dates, and organizations. Its integration into Spark NLP provides an efficient and scalable solution for processing multilingual text data, making it an invaluable tool for researchers and developers working with African languages.
+
+""", unsafe_allow_html=True)
+
+# References
+st.markdown('References
', unsafe_allow_html=True)
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+# Community & Support
+st.markdown('Community & Support
', unsafe_allow_html=True)
+st.markdown("""
+
+""", unsafe_allow_html=True)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..327c0a2a3aac36b6f2416cf191e5c28edab4a0b4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+streamlit
+st-annotated-text
+pandas
+numpy
+spark-nlp
+pyspark
\ No newline at end of file