RobPruzan committed on
Commit
8a9f69a
1 Parent(s): b327954

Updating diversity scoring function

Browse files
Files changed (1) hide show
  1. app.py +55 -48
app.py CHANGED
@@ -42,58 +42,65 @@ for idx, key in enumerate(glove_vectors.key_to_index.keys()):
42
 
43
 
44
  def calculate_diversity(text):
45
- stop_words = set(stopwords.words('english'))
46
- for i in string.punctuation:
47
- stop_words.add(i)
48
 
49
- tokenized_text = word_tokenize(text)
50
- tokenized_text = list(map(lambda word: word.lower(), tokenized_text))
51
- sim_words = {}
52
- if len(tokenized_text) <= 1:
53
- return 1, "More Text Required"
54
-
55
- for idx, anc_word in enumerate(tokenized_text):
56
- if anc_word in stop_words:
57
- continue
58
- if idx in sim_words:
59
- sim_words[idx] = sim_words[idx]
60
- continue
61
-
62
- vocab = [anc_word]
63
-
64
- for pos, comp_word in enumerate(tokenized_text):
65
-
66
- try:
67
- if not comp_word in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1),
68
- w2v[comp_word].reshape(1, -1)) > .75:
69
- vocab.append(comp_word)
70
-
71
- sim_words[idx] = vocab
72
-
73
- except KeyError:
74
- continue
75
-
76
- scores = {}
77
- for k, value in sim_words.items():
78
- if len(value) == 1:
79
- scores[k] = 1
80
- continue
81
-
82
- t_sim = len(value) - 1
83
- t_rep = (len(value) - 1) - (len(set(value)))
84
-
85
- score = ((t_sim - t_rep) / t_sim) ** 2
86
-
87
- scores[key] = score
 
 
 
 
 
 
88
 
89
- mean_score = 0
90
- total = 0
91
 
92
- for value in scores.values():
93
- mean_score += value
94
- total += 1
95
 
96
- return scores, mean_score / total
 
 
 
 
 
 
97
 
98
 
99
  def dict_to_list(dictionary, max_size=10):
 
42
 
43
 
44
  def calculate_diversity(text):
 
 
 
45
 
46
+ stop_words = set(stopwords.words('english'))
47
+ for i in string.punctuation:
48
+ stop_words.add(i)
49
+
50
+ tokenized_text = word_tokenize(text)
51
+ tokenized_text = list(map(lambda word: word.lower(), tokenized_text))
52
+ sim_words = {}
53
+ if len(tokenized_text) <= 1:
54
+ return 1,"More Text Required"
55
+
56
+
57
+
58
+
59
+ for idx, anc_word in enumerate(tokenized_text):
60
+ if anc_word in stop_words:
61
+ continue
62
+
63
+ vocab = [anc_word]
64
+
65
+ for pos, comp_word in enumerate(tokenized_text):
66
+ if anc_word in sim_words.get(pos, []):
67
+ if anc_word == sim_words[pos][0]:
68
+ sim_words[idx] = sim_words[pos]
69
+ continue
70
+ try:
71
+ if not comp_word in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1), w2v[comp_word].reshape(1, -1)) > .75:
72
+ vocab.append(comp_word)
73
+ sim_words[idx] = vocab
74
+
75
+ except KeyError:
76
+ continue
77
+
78
+
79
+ scores = {}
80
+ for key, value in sim_words.items():
81
+ if len(value) == 1:
82
+ scores[key] = 1
83
+ continue
84
+ if len(value) == 2:
85
+ scores[key] = -1
86
+ continue
87
+ t_sim = len(value) - 1
88
+ t_rep = (len(value) - 1) - (len(set(value[1:])))
89
+
90
+ score = ((t_sim - t_rep)/t_sim)**2
91
 
92
+ scores[key] = score
 
93
 
94
+ mean_score = 0
95
+ total = 0
 
96
 
97
+ for value in scores.values():
98
+ if value == -1:
99
+ continue
100
+ mean_score += value
101
+ total += 1
102
+
103
+ return scores, mean_score/total
104
 
105
 
106
  def dict_to_list(dictionary, max_size=10):