Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -13,7 +13,6 @@ import string
|
|
13 |
import unicodedata
|
14 |
import time
|
15 |
|
16 |
-
|
17 |
nltk.download('punkt')
|
18 |
POST_ID = 0
|
19 |
REFERENDUM_TYPE = "referendums_v2"
|
@@ -99,7 +98,7 @@ def get_proposals():
|
|
99 |
def get_embeddings():
|
100 |
global df_emb
|
101 |
for i in range(len(df)):
|
102 |
-
df_emb.loc[i] =
|
103 |
|
104 |
|
105 |
def update_proposals():
|
@@ -117,6 +116,7 @@ def update_proposals():
|
|
117 |
POST_ID += 1
|
118 |
else:
|
119 |
print('proposals updated at {t}'.format(t=time.strftime("%H:%M:%S", time.localtime())))
|
|
|
120 |
event.set()
|
121 |
flag = False
|
122 |
|
@@ -125,15 +125,17 @@ def update_embeddings():
|
|
125 |
global df_emb
|
126 |
while True:
|
127 |
event.wait()
|
128 |
-
print(POST_ID)
|
129 |
-
print(len(df))
|
130 |
|
131 |
if len(df) != len(df_emb):
|
132 |
id_to_add = [x + len(df_emb) for x in range(len(df) - len(df_emb))]
|
133 |
for i in id_to_add:
|
134 |
-
|
|
|
|
|
135 |
else:
|
136 |
event.clear()
|
|
|
|
|
137 |
|
138 |
|
139 |
def run_periodically():
|
@@ -144,7 +146,7 @@ def run_periodically():
|
|
144 |
def compare_proposals(prop, count):
|
145 |
query_emb = model.encode(markdn_2_str(prop))
|
146 |
new_df = pd.DataFrame(columns=['sim1'])
|
147 |
-
new_df['sim1'] = df_emb.apply(lambda row: dot_product(row
|
148 |
best_match = np.argsort(-new_df['sim1'])[0:count]
|
149 |
res = [df.iloc[x]['content'] for x in best_match]
|
150 |
stat = [df.iloc[x]['status'] for x in best_match]
|
@@ -163,12 +165,22 @@ if __name__ == '__main__':
|
|
163 |
print('model downloaded')
|
164 |
|
165 |
df = pd.DataFrame(columns=['content', 'status', 'ksm'])
|
166 |
-
df_emb = pd.DataFrame(columns=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
-
print('proposal collection start')
|
169 |
-
get_proposals()
|
170 |
-
print('proposals collected, embeddings calculation start')
|
171 |
-
get_embeddings()
|
172 |
|
173 |
POST_ID = len(df)
|
174 |
|
|
|
13 |
import unicodedata
|
14 |
import time
|
15 |
|
|
|
16 |
nltk.download('punkt')
|
17 |
POST_ID = 0
|
18 |
REFERENDUM_TYPE = "referendums_v2"
|
|
|
98 |
def get_embeddings():
|
99 |
global df_emb
|
100 |
for i in range(len(df)):
|
101 |
+
df_emb.loc[i] = model.encode(markdn_2_str(df.iloc[i]['content']))
|
102 |
|
103 |
|
104 |
def update_proposals():
|
|
|
116 |
POST_ID += 1
|
117 |
else:
|
118 |
print('proposals updated at {t}'.format(t=time.strftime("%H:%M:%S", time.localtime())))
|
119 |
+
df.to_excel('df.xlsx', index=False)
|
120 |
event.set()
|
121 |
flag = False
|
122 |
|
|
|
125 |
global df_emb
|
126 |
while True:
|
127 |
event.wait()
|
|
|
|
|
128 |
|
129 |
if len(df) != len(df_emb):
|
130 |
id_to_add = [x + len(df_emb) for x in range(len(df) - len(df_emb))]
|
131 |
for i in id_to_add:
|
132 |
+
print(model.encode(markdn_2_str(df.iloc[i])))
|
133 |
+
print(len(model.encode(markdn_2_str(df.iloc[i]))))
|
134 |
+
df_emb.loc[i] = model.encode(markdn_2_str(df.iloc[i]))
|
135 |
else:
|
136 |
event.clear()
|
137 |
+
df_emb.to_csv('df_emb.csv', index=False)
|
138 |
+
|
139 |
|
140 |
|
141 |
def run_periodically():
|
|
|
146 |
def compare_proposals(prop, count):
|
147 |
query_emb = model.encode(markdn_2_str(prop))
|
148 |
new_df = pd.DataFrame(columns=['sim1'])
|
149 |
+
new_df['sim1'] = df_emb.apply(lambda row: dot_product(row, query_emb), axis=1)
|
150 |
best_match = np.argsort(-new_df['sim1'])[0:count]
|
151 |
res = [df.iloc[x]['content'] for x in best_match]
|
152 |
stat = [df.iloc[x]['status'] for x in best_match]
|
|
|
165 |
print('model downloaded')
|
166 |
|
167 |
df = pd.DataFrame(columns=['content', 'status', 'ksm'])
|
168 |
+
df_emb = pd.DataFrame(columns=range(768))
|
169 |
+
print(df_emb)
|
170 |
+
|
171 |
+
|
172 |
+
df = pd.read_excel('df.xlsx')
|
173 |
+
df_emb = pd.read_csv('df_emb.csv')
|
174 |
+
df = df.loc[:, ['content', 'status', 'ksm']]
|
175 |
+
|
176 |
+
# get_proposals()
|
177 |
+
# get_embeddings()
|
178 |
+
|
179 |
+
# df.to_excel('df.xlsx', index=False)
|
180 |
+
# df_emb.to_csv('df_emb.csv', index=False)
|
181 |
+
|
182 |
+
|
183 |
|
|
|
|
|
|
|
|
|
184 |
|
185 |
POST_ID = len(df)
|
186 |
|