# src/synonyms_preprocess.py
import pickle

import spacy

# NOTE: the WordNet corpus must be available, e.g. via nltk.download("wordnet")
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy="dict_spacy_object.pkl"):
    '''
    Load a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to load,
        e.g. "en_core_web_md" or a custom model path.
    filepath_docs_spacy : str
        Path to a pickle file containing a dictionary whose keys are tokens
        (strings) and whose values are the corresponding serialized spaCy
        Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.
    dict_docs_spacy : dict
        A dictionary mapping tokens (strings) to spaCy Doc objects
        reconstructed from the serialized bytes stored in the pickle file.
    '''
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)

    # ---- Load the pickle file and rebuild the dictionary: token -> spaCy Doc
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }

    return nlp, dict_docs_spacy
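
# Usage sketch (assumes the default pickle 'dict_spacy_object.pkl' is present
# next to this module; see the __main__ block at the bottom for a fuller example):
#
#   nlp, dict_docs_spacy = load_spacy_values()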


def find_antonyms(word):
    '''
    Generate the set of all antonyms of a given word.

    Parameters
    ----------
    word : str
        The word whose antonyms we want to find.

    Returns
    -------
    antonyms : set of str
        The set of all antonyms detected using NLTK and WordNet.
    '''
    antonyms = set()

    # ---- Collect every WordNet synset (set of synonyms) for the word
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:
        # ---- Loop over each lemma (synonym) in the synset
        #
        for lemma in syn.lemmas():
            # ---- Record the lemma's first antonym, if it has one
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms
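
# Example (the exact result depends on the installed WordNet version):
#
#   find_antonyms("good")   # typically includes "bad" and "evil"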


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Map a word to the closest token in list_2000_tokens.

    Proper nouns and words already in the list are returned unchanged, basic
    verbs are mapped to canonical forms, and antonyms are filtered out of the
    candidates before the most similar token is selected.
    '''
    # ---- Preserve proper nouns as-is
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Map basic verb forms to their canonical tokens
    #
    basic_verbs = {
        "is": "IS",
        "am": "IS",
        "are": "IS",
        "was": "IS",
        "were": "IS",
        "be": "IS",
        "have": "HAVE",
        "has": "HAVE",
        "had": "HAVE",
    }
    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return words that are already in the list unchanged
    #
    if word in list_2000_tokens:
        return word

    # ---- Keep candidates with the same part of speech, excluding antonyms
    #
    word_pos = doc[0].pos_
    antonyms = find_antonyms(word)
    filtered_tokens = [
        token for token in list_2000_tokens
        if token not in antonyms
        and model(token)[0].pos_ == word_pos
    ]

    # ---- Score each candidate by embedding similarity to the word
    #
    similarities = []
    for token in filtered_tokens:
        token_doc = dict_embedding.get(token)
        if token_doc is not None:  # skip tokens missing from the embedding dict
            similarities.append((token, token_doc.similarity(doc)))

    # ---- Fall back to the original word when no candidate survives the filters
    #
    if not similarities:
        return word

    # ---- Return the candidate with the highest similarity score
    #
    most_similar_token = max(similarities, key=lambda item: item[1])[0]
    return most_similar_token
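

if __name__ == "__main__":
    # A minimal end-to-end sketch, assuming 'dict_spacy_object.pkl' (the
    # default above) exists and that its keys double as the project's
    # 2000-token vocabulary; both assumptions are illustrative, not
    # guaranteed by this file.
    nlp, dict_docs_spacy = load_spacy_values()
    list_2000_tokens = list(dict_docs_spacy.keys())
    print(find_synonyms("joyful", nlp, dict_docs_spacy, list_2000_tokens))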