# e2eqa-wiki / src/features/text_utils.py
# foxxy-hm's picture
# Update src/features/text_utils.py
import json
from glob import glob
import re
import nltk
from nltk import word_tokenize as lib_tokenizer
import string
def preprocess(x, max_length=-1, remove_puncts=False):
    """Tokenize ``x`` and optionally drop punctuation and truncate it.

    Args:
        x: Raw text to normalize.
        max_length: When positive, keep only the first ``max_length``
            whitespace-separated tokens. Zero or negative disables truncation.
        remove_puncts: When true, delete every character listed in
            ``string.punctuation`` (ASCII punctuation only).

    Returns:
        The processed text as a single string.
    """
    x = nltk_tokenize(x)
    x = x.replace("\n", " ")
    if remove_puncts:
        # Single pass over the string instead of a per-character filter.
        # Gaps left by removed punctuation are only collapsed by the
        # split/join in the truncation branch below.
        x = x.translate(str.maketrans("", "", string.punctuation))
    if max_length > 0:
        x = " ".join(x.split()[:max_length])
    return x
def nltk_tokenize(x):
    """Return ``x`` cleaned by ``strip_context`` and re-joined token by token.

    The cleaned text is passed to this module's ``word_tokenize`` and the
    resulting pieces are joined with single spaces.

    Args:
        x: Raw text.

    Returns:
        The space-joined tokenized text, without leading or trailing spaces.
    """
    cleaned = strip_context(x)
    return " ".join(word_tokenize(cleaned)).strip()
def post_process_answer(x, entity_dict):
    """Normalize a predicted answer string.

    Args:
        x: The predicted answer. Anything that is not a string is returned
            unchanged.
        entity_dict: Mapping from a lowercased answer to its replacement,
            consulted only for answers of at most two words that are not
            numbers or dates.

    Returns:
        * ``x`` itself when it is not a string;
        * ``"NaN"`` when nothing is left after stripping;
        * the leading number when the cleaned answer starts with a numeric
          token and does not contain "tháng" (month);
        * the ``entity_dict`` replacement (or the cleaned answer) for short
          non-date answers, the cleaned answer for longer ones;
        * the lowercased cleaned answer for numbers and date expressions.
    """
    if not isinstance(x, str):
        return x

    # An answer with no content left after stripping is reported as "NaN".
    # IndexError is also caught so that a stripping helper which indexes
    # into an empty string is handled the same way.
    try:
        x = strip_answer_string(x)
    except IndexError:
        return "NaN"
    if not x:
        return "NaN"

    x = "".join([c for c in x if c not in string.punctuation])
    x = " ".join(x.split())
    y = x.lower()

    # NOTE(review): the "tháng" test here is case-sensitive (it inspects x,
    # not y) -- confirm that this is intended.
    if len(y) > 1 and y.split()[0].isnumeric() and ("tháng" not in x):
        return y.split()[0]

    # "ngày" / "tháng" / "năm" are the Vietnamese words for day/month/year.
    if not (x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x):
        if len(x.split()) <= 2:
            return entity_dict.get(x.lower(), x)
        return x
    return y
# Cache: whitespace-delimited word -> its tokenized, space-joined form.
dict_map = {}


def word_tokenize(text):
    """Tokenize ``text`` word by word, caching the result for each word.

    ``text`` is split on whitespace. Each distinct word is passed through
    ``lib_tokenizer`` once, its pieces are joined with spaces, and the
    tokenizer's quote markers (two backticks or two apostrophes) are turned
    back into a plain double quote. Later occurrences reuse the cached value
    in ``dict_map``.

    Args:
        text: The text to tokenize.

    Returns:
        A list with one normalized string per whitespace-delimited word of
        ``text``, in the original order. Empty for empty or blank text.
    """
    words_norm = []
    for w in text.split():
        if w not in dict_map:
            dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '"').replace("''", '"')
        # Collect the normalized form; without this the result is always [].
        words_norm.append(dict_map[w])
    return words_norm
def strip_answer_string(text):
    """Strip surrounding whitespace and punctuation from an answer string.

    Characters are removed one at a time from the end and then from the
    start while they belong to a fixed set of punctuation characters.
    Removal stops early when the character appears to close or open a pair
    that lives inside the text:

    * a trailing ``)`` is kept when the text contains ``(`` but does not
      start with it;
    * a trailing ``"`` is kept when the text does not start with ``"`` and
      contains more than one ``"``;
    * a leading ``"`` is kept when the text does not end with ``"`` and
      contains more than one ``"``.

    Args:
        text: The string to clean.

    Returns:
        The stripped string. Empty or punctuation-only input yields ``""``.
    """
    strippable = '.,/><;:\'"[]{}+=-_)(*&^!~`'
    text = text.strip()

    # The emptiness checks keep the indexing below from raising IndexError.
    while text and text[-1] in strippable:
        if text[0] != '(' and text[-1] == ')' and '(' in text:
            break
        if text[-1] == '"' and text[0] != '"' and text.count('"') > 1:
            break
        text = text[:-1].strip()

    while text and text[0] in strippable:
        if text[0] == '"' and text[-1] != '"' and text.count('"') > 1:
            break
        text = text[1:].strip()

    return text
def strip_context(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Newlines count as whitespace, so they are replaced as well. Leading and
    trailing whitespace is removed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned, single-line text.
    """
    return re.sub(r'\s+', ' ', text).strip()
def check_number(x):
    """Tell whether ``x`` looks like a number or a date expression.

    ``x`` is converted with ``str`` and lowercased first. It qualifies when
    the whole string is numeric or when it contains one of the Vietnamese
    words "ngày" (day), "tháng" (month) or "năm" (year).

    Args:
        x: Any value.

    Returns:
        ``True`` if the value qualifies, ``False`` otherwise.
    """
    x = str(x).lower()
    return x.isnumeric() or any(word in x for word in ("ngày", "tháng", "năm"))