Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files- main.py +186 -0
- requirements.txt +90 -0
main.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import pandas as pd
|
3 |
+
from sentence_transformers import SentenceTransformer, util
|
4 |
+
import numpy as np
|
5 |
+
import threading
|
6 |
+
import gradio as gr
|
7 |
+
import re
|
8 |
+
from bs4 import BeautifulSoup
|
9 |
+
from markdown import markdown
|
10 |
+
import nltk
|
11 |
+
from nltk.tokenize import sent_tokenize
|
12 |
+
import string
|
13 |
+
import unicodedata
|
14 |
+
|
15 |
+
# Make sure the Punkt tokenizer data used by `sent_tokenize` is available.
# Look for a local copy first so that a restart does not contact the download
# server (or print download progress) when the data is already installed.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

POST_ID = 0  # id of the next Polkassembly post to request
REFERENDUM_TYPE = "referendums_v2"  # sent as the `proposalType` query parameter
VOTE_TYPE = "ReferendumV2"  # "Motion", "Fellowship", "Referendum", "ReferendumV2", "DemocracyProposal"
UPDATE_INTERVAL = 1800  # delay in seconds handed to threading.Timer between refreshes
|
20 |
+
|
21 |
+
|
22 |
+
def _as_ndarray(vec):
    """Return ``vec`` as a NumPy array, unwrapping tensor-like objects first."""
    # NumPy cannot consume a torch tensor that tracks gradients or lives on an
    # accelerator. Duck-typing on `detach` keeps this helper free of a hard
    # torch dependency while still handling such tensors.
    if hasattr(vec, "detach"):
        vec = vec.detach().cpu().numpy()
    return np.asarray(vec)


def dot_product(u, v):
    """Return the dot product of ``u`` and ``v``.

    Args:
        u: First vector: a sequence, a NumPy array, or a tensor-like object
            exposing ``detach().cpu().numpy()``.
        v: Second vector, same kinds as ``u``.

    Returns:
        The result of ``numpy.dot`` on the two inputs.
    """
    return np.dot(_as_ndarray(u), _as_ndarray(v))
|
25 |
+
|
26 |
+
|
27 |
+
# Compiled once at import time instead of on every call.
_MARKDOWN_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def markdn_2_str(text):
    """Convert Markdown ``text`` into a single line of plain text.

    The Markdown is rendered to HTML, the text nodes of that HTML are joined
    with spaces, and then any remaining Markdown-style links, http(s) URLs and
    newlines are each replaced by a space.

    Args:
        text: Markdown source. ``None`` and the empty string are accepted.

    Returns:
        The cleaned text, or ``""`` when ``text`` is ``None`` or empty.
    """
    # A post may come without content; there is nothing to render then.
    if not text:
        return ""

    html = markdown(text)
    soup = BeautifulSoup(html, features="html.parser")
    # `find_all` is the current spelling of the deprecated `findAll` alias.
    clean_text = ' '.join(soup.find_all(string=True))
    clean_text = _MARKDOWN_LINK_RE.sub(' ', clean_text)  # remove markdown style links
    clean_text = _URL_RE.sub(' ', clean_text)  # remove regular links
    return clean_text.replace('\n', ' ')  # remove \n
|
36 |
+
|
37 |
+
|
38 |
+
# Example phrasings of a funding request. Their joint embedding is the
# reference that candidate sentences are scored against.
_KSM_SEARCH_PHRASES = (
    "requests a total of 1500 KSM",
    "requests a total of 7450 $ (17 KSM)",
    "Requested amount: 3,333 KSM",
    "Requested funding 78,804 USD // 2770 KSM",
    "Requested KSM: 598",
)
_KSM_REFERENCE_CACHE = {}
_DIGIT_RE = re.compile(r'(\d)')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _ksm_reference_embedding():
    """Return the embedding of the example phrases, computing it only once."""
    if "ref" not in _KSM_REFERENCE_CACHE:
        # Join with a space so neighbouring phrases do not fuse into tokens
        # such as "KSMrequests".
        _KSM_REFERENCE_CACHE["ref"] = model.encode(" ".join(_KSM_SEARCH_PHRASES))
    return _KSM_REFERENCE_CACHE["ref"]


def _amount_near_keyword(sentence, key_word):
    """Return the first digit-bearing token next to ``key_word``, or ``None``."""
    # Tokens holding a digit keep their punctuation (e.g. "3,333"); all other
    # tokens are stripped of punctuation, and tokens left empty are dropped.
    tokens = [
        tok if _DIGIT_RE.search(tok) else tok.translate(_PUNCTUATION_TABLE)
        for tok in sentence.split()
    ]
    tokens = [tok for tok in tokens if tok]

    for idx, tok in enumerate(tokens):
        if key_word not in tok:
            continue
        # Clamp the window start: a negative start would wrap around to the
        # end of the list and skip the keyword's real neighbours.
        for candidate in tokens[max(idx - 1, 0):idx + 2]:
            if _DIGIT_RE.search(candidate):
                return candidate
    return None


def get_sum(prop):
    """Extract the requested KSM amount from a proposal text.

    The text is Unicode-normalised (NFKD), converted from Markdown to plain
    text and split into sentences. Among the sentences that mention "KSM", the
    one whose embedding has the largest dot product with the embedding of the
    example request phrases is selected. The first token containing a digit
    within one position of a "KSM" token in that sentence is returned.

    Relies on the module-level ``model`` having been created before the call.

    Args:
        prop: Proposal content as Markdown. ``None`` or ``""`` is accepted.

    Returns:
        The matching token as it appears in the text (a string such as
        ``"3,333"``), or ``None`` when the text is empty, no sentence mentions
        "KSM", or no number sits next to the keyword.
    """
    key_word = "KSM"
    if not prop:
        return None

    prop = unicodedata.normalize("NFKD", prop)
    prop = markdn_2_str(prop)

    # Only sentences mentioning the keyword can be selected, so only those
    # need to be embedded.
    candidates = [s for s in sent_tokenize(prop) if key_word in s]
    if not candidates:
        return None

    ref = _ksm_reference_embedding()
    scores = [dot_product(model.encode(s), ref) for s in candidates]
    best_sentence = candidates[int(np.argmax(scores))]
    return _amount_near_keyword(best_sentence, key_word)
|
76 |
+
|
77 |
+
|
78 |
+
# Seconds to wait for the Polkassembly API before giving up on a request.
_REQUEST_TIMEOUT = 30


def get_proposals():
    """Download proposals from Polkassembly, starting at ``POST_ID``.

    Posts are requested one id at a time. Each successful response is stored
    in the global ``df`` as ``[content, status, ksm]`` under the label
    ``POST_ID``, which is then incremented. The walk stops at the first
    request that fails (non-OK status, network error, timeout or undecodable
    body). ``event`` is set afterwards so the embedding worker wakes up.
    """
    global POST_ID
    while True:
        try:
            rn = requests.post(
                f"https://api.polkassembly.io/api/v1/posts/on-chain-post?proposalType={REFERENDUM_TYPE}&postId={POST_ID}",
                headers={"x-network": "kusama"},
                timeout=_REQUEST_TIMEOUT)
            proposal_data = rn.json() if rn.ok else None
        except requests.RequestException:
            # Treat an unreachable API like a failed request: stop for now and
            # let the next scheduled run resume from the same POST_ID.
            break
        if proposal_data is None:
            break

        # A post without content is stored as an empty string so later text
        # processing never receives None.
        content = proposal_data.get("content") or ""
        ksm = get_sum(content) if content else None
        df.loc[POST_ID] = [content, proposal_data.get("status"), ksm]
        POST_ID += 1

    event.set()
|
94 |
+
|
95 |
+
|
96 |
+
def get_embeddings():
    """Fill ``df_emb`` with one embedding per row of ``df``.

    For every row position in ``df`` the ``content`` value is converted to
    plain text, encoded with ``model``, and written to ``df_emb`` under that
    position as a one-element row.
    """
    global df_emb
    for position, raw_text in enumerate(df['content']):
        plain_text = markdn_2_str(raw_text)
        df_emb.loc[position] = [model.encode(plain_text)]
|
100 |
+
|
101 |
+
|
102 |
+
def update_proposals():
    """Fetch the proposals published since the previous run.

    A refresh is the same walk ``get_proposals`` performs: continue from the
    current ``POST_ID`` until a request fails, then set ``event``. This
    function therefore delegates to it instead of keeping a second copy of
    the loop.
    """
    get_proposals()
|
118 |
+
|
119 |
+
|
120 |
+
def update_embeddings():
    """Background worker that embeds proposals missing from ``df_emb``.

    Blocks until ``event`` is set, prints ``POST_ID`` and ``len(df)``, then
    either encodes the rows of ``df`` that ``df_emb`` does not hold yet or,
    when both frames already have the same length, clears ``event`` so the
    next wait blocks again. The loop never returns.
    """
    global df_emb
    while True:
        event.wait()
        print(POST_ID)
        print(len(df))

        embedded = len(df_emb)
        if len(df) == embedded:
            # Nothing left to embed: go back to sleep until the next refresh.
            event.clear()
            continue

        for position in range(embedded, len(df)):
            plain_text = markdn_2_str(df.iloc[position]['content'])
            df_emb.loc[position] = [model.encode(plain_text)]
|
133 |
+
|
134 |
+
|
135 |
+
def run_periodically():
    """Refresh the proposals now and schedule the next refresh.

    Runs ``update_proposals`` and then starts a ``threading.Timer`` that calls
    this function again after ``UPDATE_INTERVAL`` seconds.
    """
    try:
        update_proposals()
    finally:
        # Re-arm even if this refresh raised, so one failure does not end the
        # schedule. The timer passes no arguments because this function takes
        # none; passing any would raise TypeError on the first scheduled run.
        threading.Timer(UPDATE_INTERVAL, run_periodically).start()
|
138 |
+
|
139 |
+
|
140 |
+
def compare_proposals(prop, count):
    """Return the stored proposals most similar to ``prop`` as Markdown.

    ``prop`` is converted to plain text and encoded with ``model``. Every
    embedding in ``df_emb`` is scored by its dot product with that query
    embedding, and the ``count`` highest-scoring proposals are rendered.

    Args:
        prop: Proposal text (Markdown) to compare against the stored ones.
        count: Number of proposals to return; coerced with ``int``.

    Returns:
        One section per match: a heading with the requested KSM, status and
        row id, followed by the proposal content. Sections are joined with
        ``"\\n "``. An empty string is returned when no embeddings exist yet
        or ``count`` is not positive.
    """
    count = int(count)  # a slice bound must be an integer
    # Snapshot the embeddings: the background worker may append concurrently.
    embeddings = list(df_emb['content'])
    if count <= 0 or not embeddings:
        return ""

    query_emb = model.encode(markdn_2_str(prop or ""))
    # Scoring the column directly avoids positional `row[0]` access on a
    # label-indexed row, which pandas has deprecated.
    scores = np.array([dot_product(emb, query_emb) for emb in embeddings])
    best_match = np.argsort(-scores)[:count]

    sections = []
    for position in best_match:
        row = df.iloc[position]
        title = '''<span style="color:blue"><h2>Total KSM requested: {sum}, status: {status}, ID: {id}</h2></span> \n '''.format(
            sum=row['ksm'], status=row['status'], id=position)
        sections.append(title + (row['content'] or ""))
    return "\n ".join(sections)
|
154 |
+
|
155 |
+
|
156 |
+
if __name__ == '__main__':
    # Module-level state read by the functions above.
    event = threading.Event()
    model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')

    df = pd.DataFrame(columns=['content', 'status', 'ksm'])
    df_emb = pd.DataFrame(columns=['content'])

    # Initial load: download every available proposal, then embed all of them.
    get_proposals()
    get_embeddings()

    POST_ID = len(df)

    # Daemon threads: `update_embeddings` loops forever, and a non-daemon
    # thread would keep the interpreter alive after the UI has stopped.
    # Timers started from `run_periodically` inherit the daemon flag.
    update_thread = threading.Thread(target=run_periodically, daemon=True)  # background proposals update
    upd_emb_thread = threading.Thread(target=update_embeddings, daemon=True)  # background embeddings update

    update_thread.start()
    upd_emb_thread.start()

    with gr.Blocks() as demo:
        gr.Markdown("<h1>Compare proposals</h1>")
        inpt = gr.Textbox(label="Input Proposal", lines=5, max_lines=12)
        # NOTE(review): this dropdown is shown but not passed to
        # `compare_proposals` below.
        dr = gr.Dropdown(label="Vote type",
                         choices=["Motion", "Fellowship", "Referendum", "ReferendumV2", "DemocracyProposal"],
                         value="ReferendumV2", interactive=True)
        slider = gr.Slider(label="Number of proposals to output", minimum=1, maximum=20, step=1, value=5,
                           interactive=True)
        btn = gr.Button("Find similar proposals")
        otpt = gr.Markdown("")
        btn.click(fn=compare_proposals, inputs=[inpt, slider], outputs=otpt)

    demo.launch(show_error=True)
|
requirements.txt
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.2.1
|
2 |
+
altair==5.2.0
|
3 |
+
annotated-types==0.6.0
|
4 |
+
anyio==4.3.0
|
5 |
+
attrs==23.2.0
|
6 |
+
beautifulsoup4==4.12.3
|
7 |
+
certifi==2024.2.2
|
8 |
+
charset-normalizer==3.3.2
|
9 |
+
click==8.1.7
|
10 |
+
colorama==0.4.6
|
11 |
+
contourpy==1.2.0
|
12 |
+
cycler==0.12.1
|
13 |
+
DateTime==5.4
|
14 |
+
et-xmlfile==1.1.0
|
15 |
+
exceptiongroup==1.2.0
|
16 |
+
fastapi==0.110.0
|
17 |
+
ffmpy==0.3.2
|
18 |
+
filelock==3.13.1
|
19 |
+
fonttools==4.49.0
|
20 |
+
fsspec==2024.2.0
|
21 |
+
gradio==4.19.2
|
22 |
+
gradio_client==0.10.1
|
23 |
+
h11==0.14.0
|
24 |
+
httpcore==1.0.4
|
25 |
+
httpx==0.27.0
|
26 |
+
huggingface-hub==0.21.3
|
27 |
+
idna==3.6
|
28 |
+
importlib_metadata==7.0.2
|
29 |
+
importlib_resources==6.1.2
|
30 |
+
Jinja2==3.1.3
|
31 |
+
joblib==1.3.2
|
32 |
+
jsonschema==4.21.1
|
33 |
+
jsonschema-specifications==2023.12.1
|
34 |
+
kiwisolver==1.4.5
|
35 |
+
Markdown==3.5.2
|
36 |
+
markdown-it-py==3.0.0
|
37 |
+
MarkupSafe==2.1.5
|
38 |
+
matplotlib==3.8.3
|
39 |
+
mdurl==0.1.2
|
40 |
+
mpmath==1.3.0
|
41 |
+
networkx==3.2.1
|
42 |
+
nltk==3.8.1
|
43 |
+
numpy==1.26.4
|
44 |
+
openpyxl==3.1.2
|
45 |
+
orjson==3.9.15
|
46 |
+
packaging==23.2
|
47 |
+
pandas==2.2.1
|
48 |
+
pillow==10.2.0
|
49 |
+
pydantic==2.6.3
|
50 |
+
pydantic_core==2.16.3
|
51 |
+
pydub==0.25.1
|
52 |
+
Pygments==2.17.2
|
53 |
+
pyparsing==3.1.1
|
54 |
+
python-dateutil==2.8.2
|
55 |
+
python-multipart==0.0.9
|
56 |
+
pytz==2024.1
|
57 |
+
PyYAML==6.0.1
|
58 |
+
referencing==0.33.0
|
59 |
+
regex==2023.12.25
|
60 |
+
requests==2.31.0
|
61 |
+
rich==13.7.1
|
62 |
+
rpds-py==0.18.0
|
63 |
+
ruff==0.3.0
|
64 |
+
safetensors==0.4.2
|
65 |
+
schedule==1.2.1
|
66 |
+
scikit-learn==1.4.1.post1
|
67 |
+
scipy==1.12.0
|
68 |
+
semantic-version==2.10.0
|
69 |
+
sentence-transformers==2.5.0
|
70 |
+
shellingham==1.5.4
|
71 |
+
six==1.16.0
|
72 |
+
sniffio==1.3.1
|
73 |
+
soupsieve==2.5
|
74 |
+
starlette==0.36.3
|
75 |
+
sympy==1.12
|
76 |
+
threadpoolctl==3.3.0
|
77 |
+
tokenizers==0.15.2
|
78 |
+
tomlkit==0.12.0
|
79 |
+
toolz==0.12.1
|
80 |
+
torch==2.2.1
|
81 |
+
tqdm==4.66.2
|
82 |
+
transformers==4.38.1
|
83 |
+
typer==0.9.0
|
84 |
+
typing_extensions==4.10.0
|
85 |
+
tzdata==2024.1
|
86 |
+
urllib3==2.2.1
|
87 |
+
uvicorn==0.27.1
|
88 |
+
websockets==11.0.3
|
89 |
+
zipp==3.17.0
|
90 |
+
zope.interface==6.2
|