import spacy
import pickle
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to be loaded for processing text. 
        For example, "en_core_web_sm" or a custom model path.

    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary where the keys are tokens 
        (strings) and the values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.

    dict_docs_spacy : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc 
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)
    
    # ---- Load pickle file and reconstruct the dictionary with tokens as keys and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)
    
    dict_docs_spacy = {key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes) for key, doc_bytes in dict_docs_spacy_bytes.items()}
    
    return nlp, dict_docs_spacy
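

# Illustrative sketch (an assumption, not part of the original pipeline): load_spacy_values
# expects the pickle file to map each token (str) to the bytes produced by Doc.to_bytes().
# A compatible file could be generated roughly as follows, given some token list; the
# function name and defaults here are purely illustrative.
#
def build_docs_pickle_example(tokens, model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    nlp = spacy.load(model)

    # ---- Serialize one Doc per token so it can later be restored with Doc.from_bytes
    #
    dict_docs_spacy_bytes = {token: nlp(token).to_bytes() for token in tokens}

    with open(filepath_docs_spacy, 'wb') as file:
        pickle.dump(dict_docs_spacy_bytes, file)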


def find_antonyms(word):
    '''
    Generate a set of all the antonyms of a given word.

    Parameters
    ----------
    word : str
        The word for which we want to find antonyms.

    Returns
    -------
    antonyms : set of str
        A set of all the antonyms detected using NLTK and WordNet.
    '''
    
    antonyms = set()

    # ---- Load all the synonym sets (synsets) of the word from WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each set of synonyms
    #
    for syn in syn_set:
        # ---- Loop over each lemma (synonym) in the synset
        #
        for lemma in syn.lemmas():
            # ---- Add the lemma's first antonym, if any, to the antonyms set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms
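

# Illustrative usage (the exact result depends on the installed WordNet data, so the
# output shown below is an assumption rather than a guaranteed value):
#
#     >>> find_antonyms("happy")
#     {'unhappy'}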


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Find the token from a restricted list that is most similar to a given word.

    Parameters
    ----------
    word : str
        The word to map onto the restricted vocabulary.

    model : spacy.language.Language
        The loaded spaCy language model used for POS tagging and embeddings.

    dict_embedding : dict
        A dictionary where the keys are tokens (strings) and the values are the
        corresponding spaCy Doc objects.

    list_2000_tokens : list of str
        The restricted vocabulary of allowed tokens.

    Returns
    -------
    str
        The selected token, or the original word if no suitable candidate is found.
    '''

    # ---- Preserve proper nouns as-is
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Map basic forms of "be" and "have" to canonical placeholders
    #
    basic_verbs = {
        "is": "IS",
        "am": "IS", 
        "are": "IS",
        "was": "IS",
        "were": "IS",
        "be": "IS",
        "have": "HAVE",
        "has": "HAVE",
        "had": "HAVE"
    }
   
    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return the word as-is if it is already in the restricted list
    #
    if word in list_2000_tokens:
        return word
       
    # ---- Keep only candidate tokens that share the word's part of speech and are not antonyms
    #
    word_pos = doc[0].pos_

    antonyms = find_antonyms(word)
    filtered_tokens = [
        token for token in list_2000_tokens
        if token not in antonyms
        and model(token)[0].pos_ == word_pos
    ]

    # ---- Compute the embedding similarity between the word and each candidate token
    #
    similarities = []
    for token in filtered_tokens:
        token_doc = dict_embedding.get(token)
        if token_doc is not None:
            similarities.append((token, token_doc.similarity(doc)))

    # ---- If no candidate remains, return the original word as-is
    #
    if not similarities:
        return word
    
    most_similar_token = max(similarities, key=lambda item: item[1])[0]
    return most_similar_token
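

# ---- Minimal usage sketch. Assumptions (not defined in this module): the pickle file
#      'dict_spacy_object.pkl' exists alongside this script, and its keys double as the
#      restricted vocabulary passed as list_2000_tokens.
#
if __name__ == "__main__":
    nlp, dict_docs_spacy = load_spacy_values()

    # ---- Assumption: use the pickled tokens as the allowed vocabulary
    #
    list_2000_tokens = list(dict_docs_spacy.keys())

    print(find_antonyms("cold"))
    print(find_synonyms("automobile", nlp, dict_docs_spacy, list_2000_tokens))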