import spacy
import pickle

from nltk.corpus import wordnet
def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to be loaded for processing text.
        For example, "en_core_web_sm" or a custom model path.
    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary where the keys are tokens
        (strings) and the values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.
    dict_docs_spacy : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)

    # ---- Load the pickle file and reconstruct the dictionary with tokens as keys
    #      and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }

    return nlp, dict_docs_spacy
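
# ---- Example usage of load_spacy_values (a minimal sketch; assumes the
#      "en_core_web_md" model has been downloaded and that the pickle file
#      'dict_spacy_object.pkl' exists on disk):
#
#     nlp, dict_docs_spacy = load_spacy_values()
#     print(len(dict_docs_spacy))   # number of precomputed Doc objects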
def find_antonyms(word):
    '''
    Generate a set of all the antonyms of a given word.

    Parameters
    ----------
    word : str
        The word for which we want to find antonyms.

    Returns
    -------
    antonyms : set of str
        A set of all the antonyms detected using NLTK and WordNet.
    '''
    antonyms = set()

    # ---- Load all the synsets of the word recorded in WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:

        # ---- Loop over each lemma (synonym) in the synset
        #
        for lemma in syn.lemmas():

            # ---- Add the antonym of the synonym, if any, to the antonyms set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms
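
# ---- Example usage of find_antonyms (assumes the NLTK WordNet corpus has been
#      downloaded, e.g. via nltk.download('wordnet')):
#
#     print(find_antonyms('good'))   # e.g. {'bad', 'evil', ...}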
def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Find the most similar token to a given word among a list of candidate tokens,
    keeping only candidates with the same part of speech and excluding antonyms.

    Parameters
    ----------
    word : str
        The word to replace with a similar token.
    model : spacy.language.Language
        The loaded spaCy language model used for POS tagging and similarity.
    dict_embedding : dict
        A dictionary where the keys are tokens (strings) and the values are the
        corresponding spaCy Doc objects.
    list_2000_tokens : list of str
        The list of candidate tokens to choose the replacement from.

    Returns
    -------
    str
        The most similar token, or the original word if no candidate is found.
    '''
    # ---- Preserve proper nouns as-is
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Map basic verbs to their canonical forms
    #
    basic_verbs = {
        "is": "IS",
        "am": "IS",
        "are": "IS",
        "was": "IS",
        "were": "IS",
        "be": "IS",
        "have": "HAVE",
        "has": "HAVE",
        "had": "HAVE",
    }
    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return the word unchanged if it is already in the candidate list
    #
    if word in list_2000_tokens:
        return word

    # ---- Keep only candidates with the same part of speech, excluding antonyms
    #
    word_pos = doc[0].pos_
    antonyms = find_antonyms(word)
    filtered_tokens = [
        token for token in list_2000_tokens
        if token not in antonyms
        and model(token)[0].pos_ == word_pos
    ]

    # ---- Compute the similarity between the word and each remaining candidate,
    #      skipping candidates with no precomputed embedding
    #
    similarities = []
    for token in filtered_tokens:
        token_doc = dict_embedding.get(token)
        if token_doc is not None:
            similarities.append((token, token_doc.similarity(doc)))

    # ---- If no similar candidate was found, return the original word unchanged
    #
    if not similarities:
        return word

    # ---- Return the candidate with the highest similarity score
    #
    most_similar_token = max(similarities, key=lambda item: item[1])[0]
    return most_similar_token
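
# ---- Example usage of find_synonyms (a minimal sketch; deriving the candidate
#      list from the pickled dictionary's keys is an assumption made here for
#      illustration, not necessarily how the caller builds 'list_2000_tokens'):
#
#     nlp, dict_docs_spacy = load_spacy_values()
#     list_2000_tokens = list(dict_docs_spacy.keys())
#     print(find_synonyms('happy', nlp, dict_docs_spacy, list_2000_tokens))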