# src/synonyms_preprocess.py
import pickle

import spacy

# NOTE: the WordNet corpus must be available, e.g. via nltk.download("wordnet")
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy="dict_spacy_object.pkl"):
    '''
    Load a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to load,
        e.g. "en_core_web_md" or a custom model path.
    filepath_docs_spacy : str
        Path to a pickle file containing a dictionary whose keys are tokens
        (strings) and whose values are the corresponding serialized spaCy
        Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.
    dict_docs_spacy : dict
        A dictionary mapping tokens (strings) to spaCy Doc objects
        reconstructed from the serialized bytes stored in the pickle file.
    '''
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)

    # ---- Load the pickle file and rebuild the dictionary: token -> spaCy Doc
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }

    return nlp, dict_docs_spacy
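
# Usage sketch (assumes the default pickle 'dict_spacy_object.pkl' is present
# next to this module; see the __main__ block at the bottom for a fuller example):
#
#   nlp, dict_docs_spacy = load_spacy_values()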


def find_antonyms(word):
    '''
    Generate the set of all antonyms of a given word.

    Parameters
    ----------
    word : str
        The word whose antonyms we want to find.

    Returns
    -------
    antonyms : set of str
        The set of all antonyms detected using NLTK and WordNet.
    '''
    antonyms = set()

    # ---- Collect every WordNet synset (set of synonyms) for the word
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:
        # ---- Loop over each lemma (synonym) in the synset
        #
        for lemma in syn.lemmas():
            # ---- Record the lemma's first antonym, if it has one
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms
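
# Example (the exact result depends on the installed WordNet version):
#
#   find_antonyms("good")   # typically includes "bad" and "evil"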


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Map a word to the closest token in list_2000_tokens.

    Proper nouns and words already in the list are returned unchanged, basic
    verbs are mapped to canonical forms, and antonyms are filtered out of the
    candidates before the most similar token is selected.
    '''
    # ---- Preserve proper nouns as-is
    #
    doc = model(word)
    if doc[0].pos_ == "PROPN":
        return word

    # ---- Map basic verb forms to their canonical tokens
    #
    basic_verbs = {
        "is": "IS",
        "am": "IS",
        "are": "IS",
        "was": "IS",
        "were": "IS",
        "be": "IS",
        "have": "HAVE",
        "has": "HAVE",
        "had": "HAVE",
    }
    if word.lower() in basic_verbs:
        return basic_verbs[word.lower()]

    # ---- Return words that are already in the list unchanged
    #
    if word in list_2000_tokens:
        return word

    # ---- Keep candidates with the same part of speech, excluding antonyms
    #
    word_pos = doc[0].pos_
    antonyms = find_antonyms(word)
    filtered_tokens = [
        token for token in list_2000_tokens
        if token not in antonyms
        and model(token)[0].pos_ == word_pos
    ]

    # ---- Score each candidate by embedding similarity to the word
    #
    similarities = []
    for token in filtered_tokens:
        token_doc = dict_embedding.get(token)
        if token_doc is not None:  # skip tokens missing from the embedding dict
            similarities.append((token, token_doc.similarity(doc)))

    # ---- Fall back to the original word when no candidate survives the filters
    #
    if not similarities:
        return word

    # ---- Return the candidate with the highest similarity score
    #
    most_similar_token = max(similarities, key=lambda item: item[1])[0]
    return most_similar_token
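

if __name__ == "__main__":
    # A minimal end-to-end sketch, assuming 'dict_spacy_object.pkl' (the
    # default above) exists and that its keys double as the project's
    # 2000-token vocabulary; both assumptions are illustrative, not
    # guaranteed by this file.
    nlp, dict_docs_spacy = load_spacy_values()
    list_2000_tokens = list(dict_docs_spacy.keys())
    print(find_synonyms("joyful", nlp, dict_docs_spacy, list_2000_tokens))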