ginipick committed on
Commit f562d6f · verified · 1 Parent(s): 7e24471

Update src/synonyms_preprocess.py

Files changed (1)
  src/synonyms_preprocess.py +43 -46
src/synonyms_preprocess.py CHANGED
@@ -77,49 +77,46 @@ def find_antonyms(word):
 
 
 def find_synonyms(word, model, dict_embedding, list_2000_tokens):
-    '''
-    Finds the most similar token to a given word.
-
-    Parameters
-    ----------
-    word : str
-        The word for which we want to find the most similar token
-
-    model : spacy.language.Language
-        spaCy language model to use for the detection of the synonym
-
-    dict_embedding : dict
-        A dictionary where the keys are tokens (str) and the values are spaCy Doc objects
-
-    list_2000_tokens : list of str
-        A list of 2000 tokens against which the gloss will be checked.
-
-    Returns
-    -------
-    most_similar_token : str
-        The most similar token to the given word
-    '''
-
-    # ---- Skip synonym detection if the word is already in the list_2000_tokens
-    #
-    if word in list_2000_tokens:
-        return word
-    else:
-        # ---- Remove antonyms of the given word from the list_2000_tokens (a word and an antonym might be similar in embedding representation)
-        #
-        antonyms = find_antonyms(word)
-        list_2000_tokens_less_antonyms = [token for token in list_2000_tokens if token not in antonyms]
-
-        # ---- Generate a list of tuples (token, similarity between the embedding of the given word and the embedding of each token of the list_2000_tokens)
-        #
-        word_embedding = model(word)
-        similarities = []
-
-        for token in list_2000_tokens_less_antonyms:
-            similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))
-
-        # ---- Extract the most similar token of the list
-        #
-        most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]
-
-        return most_similar_token
+    # Preserve proper nouns as-is
+    doc = model(word)
+    if doc[0].pos_ == "PROPN":
+        return word
+
+    # Basic verb mapping
+    basic_verbs = {
+        "is": "IS",
+        "am": "IS",
+        "are": "IS",
+        "was": "IS",
+        "were": "IS",
+        "be": "IS",
+        "have": "HAVE",
+        "has": "HAVE",
+        "had": "HAVE"
+    }
+
+    if word.lower() in basic_verbs:
+        return basic_verbs[word.lower()]
+
+    # Return words that are already in the list unchanged
+    if word in list_2000_tokens:
+        return word
+
+    # Find the most similar word with the same part of speech
+    word_doc = model(word)
+    word_pos = word_doc[0].pos_
+
+    antonyms = find_antonyms(word)
+    filtered_tokens = [token for token in list_2000_tokens
+                       if token not in antonyms
+                       and model(token)[0].pos_ == word_pos]
+
+    similarities = []
+    word_embedding = model(word)
+
+    for token in filtered_tokens:
+        similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))
+
+    most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]
+
+    return most_similar_token
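
A minimal usage sketch of the updated function, assuming it is imported from src/synonyms_preprocess.py and that a spaCy model with word vectors (e.g. en_core_web_md) is installed; the four-token list and dict_embedding below are stand-ins for the real 2000-token gloss list and embedding dictionary built elsewhere in the repo:

    import spacy
    from src.synonyms_preprocess import find_synonyms

    nlp = spacy.load("en_core_web_md")

    # Stand-ins for the real 2000-token list and its precomputed Doc embeddings
    tokens = ["house", "car", "happy", "sad"]
    dict_embedding = {token: nlp(token) for token in tokens}

    print(find_synonyms("was", nlp, dict_embedding, tokens))     # "IS", via the basic-verb mapping
    print(find_synonyms("Paris", nlp, dict_embedding, tokens))   # "Paris", preserved if tagged PROPN
    print(find_synonyms("joyful", nlp, dict_embedding, tokens))  # nearest same-POS token, e.g. "happy"

One trade-off worth noting: the new POS filter calls model(token) on every candidate for every lookup, so each call re-tags the whole 2000-token list; precomputing each token's POS alongside dict_embedding would avoid the repeated work.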