import spacy
import pickle
from nltk.corpus import wordnet

def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to be loaded for processing text.
        For example, "en_core_web_sm" or a custom model path.
    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary where the keys are tokens
        (strings) and the values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.
    dict_docs_spacy : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)

    # ---- Load the pickle file and reconstruct the dictionary with tokens as keys and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes) for key, doc_bytes in dict_docs_spacy_bytes.items()}

    return nlp, dict_docs_spacy
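
# ---- Minimal sketch (not part of the original module) of how a compatible pickle file
#      could be produced. The helper name and the token list passed by the caller are
#      assumptions; the output format matches what load_spacy_values expects
#      (token -> serialized spaCy Doc bytes).
#
def build_spacy_docs_pickle(tokens, model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    # ---- Load the pipeline, serialize one Doc per token, and pickle the resulting dictionary
    nlp = spacy.load(model)
    dict_docs_spacy_bytes = {token: nlp(token).to_bytes() for token in tokens}
    with open(filepath_docs_spacy, 'wb') as file:
        pickle.dump(dict_docs_spacy_bytes, file)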

def find_antonyms(word):
    '''
    Generate a set of all the antonyms of a given word.

    Parameters
    ----------
    word : str
        The word for which we want to find the antonyms.

    Returns
    -------
    antonyms : set of str
        A set of all the antonyms detected using NLTK and WordNet.
    '''
    antonyms = set()

    # ---- Get all the synsets (sets of synonyms) of the word recorded in WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:

        # ---- Loop over each synonym (lemma)
        #
        for lemma in syn.lemmas():

            # ---- Add the antonyms of the synonym to the antonyms set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms
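
# ---- Hedged example of the expected behaviour (the exact set depends on the
#      WordNet data installed with NLTK):
#
#      >>> find_antonyms("hot")
#      {'cold'}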

def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Map a word to the closest token in the allowed vocabulary.

    Parameters
    ----------
    word : str
        The word to map.
    model : spacy.language.Language
        The loaded spaCy language model used for POS tagging and embeddings.
    dict_embedding : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc
        objects providing the pre-computed embeddings of the allowed vocabulary.
    list_2000_tokens : list of str
        The list of allowed tokens to which the word should be mapped.

    Returns
    -------
    str
        The most similar allowed token, or the word itself when no mapping is needed
        or no candidate is available.
    '''
    # ---- Preserve proper nouns as-is
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Map basic verbs to their canonical form
    #
    basic_verbs = {
        "is": "IS",
        "am": "IS",
        "are": "IS",
        "was": "IS",
        "were": "IS",
        "be": "IS",
        "have": "HAVE",
        "has": "HAVE",
        "had": "HAVE"
    }
    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return the word unchanged if it is already in the allowed list
    #
    if word in list_2000_tokens:
        return word

    # ---- Look for similar words with the same part of speech
    #
    word_pos = doc[0].pos_
    antonyms = find_antonyms(word)
    filtered_tokens = [
        token for token in list_2000_tokens
        if token not in antonyms
        and model(token)[0].pos_ == word_pos
    ]

    # ---- Compute the similarity between the word and each candidate token,
    #      reusing the Doc computed above as the word embedding
    #
    similarities = []
    word_embedding = doc
    for token in filtered_tokens:
        similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))

    # ====== Fix: check whether the similarities list is empty ======
    if not similarities:
        # ---- If there is no similar candidate, return the original word unchanged
        return word
    # ===============================================================

    most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]

    return most_similar_token
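
# ---- Minimal usage sketch, assuming 'dict_spacy_object.pkl' exists and was built
#      with the same pipeline ("en_core_web_md"); the allowed vocabulary is stood in
#      for here by the keys of the loaded dictionary.
#
if __name__ == "__main__":
    nlp, dict_docs_spacy = load_spacy_values()
    list_2000_tokens = list(dict_docs_spacy.keys())

    print(find_antonyms("happy"))
    print(find_synonyms("joyful", nlp, dict_docs_spacy, list_2000_tokens))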