ginipick committed on
Commit f562d6f · verified · 1 Parent(s): 7e24471

Update src/synonyms_preprocess.py

Files changed (1)
  src/synonyms_preprocess.py +43 -46
src/synonyms_preprocess.py CHANGED
@@ -77,49 +77,46 @@ def find_antonyms(word):
 
 
 def find_synonyms(word, model, dict_embedding, list_2000_tokens):
-    '''
-    Finds the most similar token to a given word.
-
-    Parameters
-    ----------
-    word : str
-        The word for which we want to find the most similar token
-
-    model : spacy.language.Language
-        spaCy language model to use for the detection of the synonym
-
-    dict_embedding : dict
-        A dictionary where the keys are tokens (str) and the values are spaCy Doc objects
-
-    list_2000_tokens : list of str
-        A list of 2000 tokens against which the gloss will be checked.
-
-    Returns
-    -------
-    most_similar_token : str
-        The most similar token to the given word
-    '''
-
-    # ---- Skip synonym detection if the word is already in the list_2000_tokens
-    #
-    if word in list_2000_tokens:
-        return word
-    else:
-        # ---- Remove antonyms of the given word from the list_2000_tokens (a word and an antonym might be similar in embedding representation)
-        #
-        antonyms = find_antonyms(word)
-        list_2000_tokens_less_antonyms = [token for token in list_2000_tokens if token not in antonyms]
-
-        # ---- Generate a list of tuples (token, similarity between the embedding of the given word and the embedding of each token of the list_2000_tokens)
-        #
-        word_embedding = model(word)
-        similarities = []
-
-        for token in list_2000_tokens_less_antonyms:
-            similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))
-
-        # ---- Extract the most similar token of the list
-        #
-        most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]
-
-        return most_similar_token
+    # Preserve proper nouns as-is
+    doc = model(word)
+    if doc[0].pos_ == "PROPN":
+        return word
+
+    # Basic verb mapping
+    basic_verbs = {
+        "is": "IS",
+        "am": "IS",
+        "are": "IS",
+        "was": "IS",
+        "were": "IS",
+        "be": "IS",
+        "have": "HAVE",
+        "has": "HAVE",
+        "had": "HAVE"
+    }
+
+    if word.lower() in basic_verbs:
+        return basic_verbs[word.lower()]
+
+    # Return words that are already in the list unchanged
+    if word in list_2000_tokens:
+        return word
+
+    # Find the most similar word with the same part of speech
+    word_doc = model(word)
+    word_pos = word_doc[0].pos_
+
+    antonyms = find_antonyms(word)
+    filtered_tokens = [token for token in list_2000_tokens
+                       if token not in antonyms
+                       and model(token)[0].pos_ == word_pos]
+
+    similarities = []
+    word_embedding = model(word)
+
+    for token in filtered_tokens:
+        similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))
+
+    most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]
+
+    return most_similar_token
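
A minimal usage sketch of the updated function, assuming it is imported from src/synonyms_preprocess.py and that a spaCy model with word vectors (e.g. en_core_web_md) is installed; the four-token list and dict_embedding below are stand-ins for the real 2000-token gloss list and embedding dictionary built elsewhere in the repo:

    import spacy
    from src.synonyms_preprocess import find_synonyms

    nlp = spacy.load("en_core_web_md")

    # Stand-ins for the real 2000-token list and its precomputed Doc embeddings
    tokens = ["house", "car", "happy", "sad"]
    dict_embedding = {token: nlp(token) for token in tokens}

    print(find_synonyms("was", nlp, dict_embedding, tokens))     # "IS", via the basic-verb mapping
    print(find_synonyms("Paris", nlp, dict_embedding, tokens))   # "Paris", preserved if tagged PROPN
    print(find_synonyms("joyful", nlp, dict_embedding, tokens))  # nearest same-POS token, e.g. "happy"

One trade-off worth noting: the new POS filter calls model(token) on every candidate for every lookup, so each call re-tags the whole 2000-token list; precomputing each token's POS alongside dict_embedding would avoid the repeated work.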