DanichOne committed on
Commit
834975a
1 Parent(s): b1d0773

Upload 2 files

Browse files
Files changed (2) hide show
  1. main.py +186 -0
  2. requirements.txt +90 -0
main.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ from sentence_transformers import SentenceTransformer, util
4
+ import numpy as np
5
+ import threading
6
+ import gradio as gr
7
+ import re
8
+ from bs4 import BeautifulSoup
9
+ from markdown import markdown
10
+ import nltk
11
+ from nltk.tokenize import sent_tokenize
12
+ import string
13
+ import unicodedata
14
+
15
# Download the NLTK "punkt" data used by sent_tokenize below.
nltk.download('punkt')

# ID of the next on-chain post to request; incremented after each stored post.
POST_ID = 0

# Value sent as the `proposalType` query parameter of the Polkassembly request.
REFERENDUM_TYPE = "referendums_v2"

# One of: "Motion", "Fellowship", "Referendum", "ReferendumV2", "DemocracyProposal".
# NOTE(review): not referenced by any function visible in this file.
VOTE_TYPE = "ReferendumV2"

# Delay handed to threading.Timer between background proposal refreshes.
UPDATE_INTERVAL = 1800
20
+
21
+
22
def dot_product(u, v):
    """Return the dot product of ``u`` and ``v`` as computed by ``numpy.dot``."""
    return np.dot(u, v)
25
+
26
+
27
# Compiled once at import time instead of on every call.
_MARKDOWN_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def markdn_2_str(text):
    """Convert Markdown text to plain text with links and newlines removed.

    The text is rendered to HTML, the HTML text nodes are joined with single
    spaces, then any remaining Markdown-style links and bare URLs are replaced
    by a space and newlines are turned into spaces.

    Args:
        text: Markdown source. ``None`` or an empty string is accepted.

    Returns:
        The cleaned plain-text string; ``''`` when ``text`` is empty or None.
    """
    if not text:
        # Callers pass `proposal_data.get("content")`, which is None when the
        # key is absent; markdown() cannot handle None.
        return ''

    html = markdown(text)
    soup = BeautifulSoup(html, features="html.parser")
    clean_text = ' '.join(soup.find_all(string=True))
    clean_text = _MARKDOWN_LINK_RE.sub(' ', clean_text)  # remove markdown style links
    clean_text = _URL_RE.sub(' ', clean_text)  # remove regular links
    return clean_text.replace('\n', ' ')  # remove \n
36
+
37
+
38
def _extract_ksm_amount(sentence, key_word="KSM"):
    """Return the first digit-bearing token next to a ``key_word`` token, or None."""
    has_digit = re.compile(r'\d')
    drop_punctuation = str.maketrans('', '', string.punctuation)

    # Tokens containing a digit keep their punctuation (e.g. "3,333"); all
    # other tokens have punctuation removed so "KSM)" or "KSM:" still match.
    tokens = [
        tok if has_digit.search(tok) else tok.translate(drop_punctuation)
        for tok in sentence.split()
    ]
    tokens = [tok for tok in tokens if tok]

    for idx, tok in enumerate(tokens):
        if key_word not in tok:
            continue
        # Look at the previous token, the keyword token itself and the next
        # one. max() keeps the slice from wrapping to the end of the list
        # when the keyword is the very first token.
        for candidate in tokens[max(idx - 1, 0):idx + 2]:
            if has_digit.search(candidate):
                # Drop wrapping punctuation such as "(17" or "598.".
                return candidate.strip(string.punctuation)
    return None


def get_sum(prop):
    """Extract the requested KSM amount from a proposal's Markdown text.

    The proposal is normalized (NFKD), converted to plain text and split into
    sentences. Among the sentences that mention "KSM", the one whose embedding
    has the highest dot product with the embedding of a set of example
    "requested amount" phrases is selected, and the first token containing a
    digit adjacent to the "KSM" token is returned.

    Args:
        prop: Proposal content as a Markdown string (may be None or empty).

    Returns:
        The amount token as a string (e.g. ``"3,333"``), or ``None`` when no
        sentence mentions KSM or no number is found next to it.
    """
    if not prop:
        return None

    key_word = "KSM"
    search_phrases = [
        "requests a total of 1500 KSM",
        "requests a total of 7450 $ (17 KSM)",
        "Requested amount: 3,333 KSM",
        "Requested funding 78,804 USD // 2770 KSM",
        "Requested KSM: 598"
    ]

    text = markdn_2_str(unicodedata.normalize("NFKD", prop))
    # Only sentences mentioning the keyword can be selected, so only those
    # need to be embedded.
    candidates = [s for s in sent_tokenize(text) if key_word in s]
    if not candidates:
        return None

    # Join with a space so the last word of one phrase is not fused with the
    # first word of the next. encode() is left at its default NumPy output so
    # the dot product below works regardless of the model's device.
    ref = model.encode(" ".join(search_phrases))
    scores = np.dot(model.encode(candidates), ref)
    best_sentence = candidates[int(np.argmax(scores))]

    return _extract_ksm_amount(best_sentence, key_word)
76
+
77
+
78
def get_proposals():
    """Fetch proposals sequentially from Polkassembly, starting at ``POST_ID``.

    Each successfully fetched post is stored in the global ``df`` at index
    ``POST_ID`` as ``[content, status, ksm]`` and ``POST_ID`` is incremented.
    Fetching stops at the first non-OK response or request failure, after
    which the global ``event`` is set so that the embeddings worker runs.
    """
    global POST_ID

    while True:
        try:
            rn = requests.post(
                f"https://api.polkassembly.io/api/v1/posts/on-chain-post?proposalType={REFERENDUM_TYPE}&postId={POST_ID}",
                headers={"x-network": "kusama"},
                timeout=30)  # without a timeout a stalled connection blocks forever
            if not rn.ok:
                break
            proposal_data = rn.json()
        except requests.RequestException as exc:
            # Network failure or undecodable body: end this round instead of
            # killing the calling thread; the next round retries this POST_ID.
            print(f"Stopping proposal fetch at post {POST_ID}: {exc}")
            break

        # Store missing content as an empty string so later text processing
        # and string concatenation do not fail on None.
        content = proposal_data.get("content") or ""
        df.loc[POST_ID] = [content, proposal_data.get("status"), get_sum(content)]
        POST_ID += 1

    event.set()
94
+
95
+
96
def get_embeddings():
    """Encode every proposal in ``df`` and store the vector in ``df_emb``.

    Row ``i`` of ``df_emb`` receives the model embedding of the plain-text
    version of the ``content`` value at position ``i`` of ``df``.
    """
    for position, content in enumerate(df['content']):
        df_emb.loc[position] = [model.encode(markdn_2_str(content))]
100
+
101
+
102
def update_proposals():
    """Fetch proposals published since the last run.

    This was a line-for-line copy of ``get_proposals``; it now delegates to it
    so the fetch logic lives in one place. Fetching resumes at the current
    ``POST_ID`` and sets ``event`` when the round ends.
    """
    get_proposals()
118
+
119
+
120
def update_embeddings():
    """Worker loop that embeds proposals not yet present in ``df_emb``.

    Blocks until ``event`` is set. Each pass prints ``POST_ID`` and the number
    of stored proposals, then either embeds the rows of ``df`` that ``df_emb``
    lacks or, when both have the same length, clears ``event`` and goes back
    to waiting. Never returns.
    """
    while True:
        event.wait()
        print(POST_ID)
        print(len(df))

        embedded = len(df_emb)
        available = len(df)
        if available == embedded:
            event.clear()
            continue

        for row in range(embedded, available):
            df_emb.loc[row] = [model.encode(markdn_2_str(df.iloc[row]['content']))]
133
+
134
+
135
def run_periodically(interval=None):
    """Refresh proposals now and schedule the next refresh.

    Args:
        interval: Seconds until the next run. Defaults to ``UPDATE_INTERVAL``.
            The parameter exists because the timer passes the interval back
            in; the previous zero-argument signature made every rescheduled
            call raise ``TypeError``, so updates never actually repeated.
    """
    if interval is None:
        interval = UPDATE_INTERVAL

    try:
        update_proposals()
    finally:
        # Reschedule even if this round failed, so one bad round does not
        # stop all future updates.
        timer = threading.Timer(interval, run_periodically, args=(interval,))
        timer.daemon = True  # a pending timer must not keep the process alive
        timer.start()
138
+
139
+
140
def compare_proposals(prop, count):
    """Return the stored proposals most similar to ``prop`` as Markdown/HTML.

    Similarity is the dot product between the embedding of ``prop`` and each
    embedding stored in ``df_emb``.

    Args:
        prop: Text of the proposal to compare (Markdown allowed).
        count: Maximum number of matches to return; coerced to ``int`` because
            UI sliders may deliver a float.

    Returns:
        A string with one heading (requested KSM, status, ID) followed by the
        proposal content for each match, best match first. An empty string is
        returned when no embeddings are stored yet.
    """
    stored = list(df_emb['content'])
    if not stored:
        return ""

    query_emb = model.encode(markdn_2_str(prop))
    # One matrix-vector product instead of a row-wise DataFrame.apply; this
    # also avoids the deprecated positional `row[0]` lookup on a Series.
    scores = np.stack(stored) @ query_emb
    best_match = np.argsort(-scores)[:int(count)]

    rows = [df.iloc[x] for x in best_match]
    title = [
        '''<span style="color:blue"><h2>Total KSM requested: {sum}, status: {status}, ID: {id}</h2></span> \n '''.format(
            sum=row['ksm'], status=row['status'], id=z) for row, z in zip(rows, best_match)]
    # `content` may be None when the API returned no text for a post.
    result = "\n ".join([a + (row['content'] or "") for a, row in zip(title, rows)])
    return result
154
+
155
+
156
if __name__ == '__main__':
    # Shared state read by the functions above as module globals.
    event = threading.Event()  # set when new proposals await embedding
    model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')

    df = pd.DataFrame(columns=['content', 'status', 'ksm'])
    df_emb = pd.DataFrame(columns=['content'])

    # Initial synchronous load so the UI has data as soon as it starts.
    get_proposals()
    get_embeddings()

    POST_ID = len(df)

    # Daemon threads: update_embeddings loops forever, so as a non-daemon
    # thread it would keep the process alive after the Gradio server stops.
    update_thread = threading.Thread(target=run_periodically, daemon=True)  # background proposals update
    upd_emb_thread = threading.Thread(target=update_embeddings, daemon=True)  # background embeddings update

    update_thread.start()
    upd_emb_thread.start()

    with gr.Blocks() as demo:
        gr.Markdown("<h1>Compare proposals</h1>")
        inpt = gr.Textbox(label="Input Proposal", lines=5, max_lines=12)
        # NOTE(review): this dropdown is not passed to any callback, so the
        # selected vote type currently has no effect.
        dr = gr.Dropdown(label="Vote type",
                         choices=["Motion", "Fellowship", "Referendum", "ReferendumV2", "DemocracyProposal"],
                         value="ReferendumV2", interactive=True)
        slider = gr.Slider(label="Number of proposals to output", minimum=1, maximum=20, step=1, value=5,
                           interactive=True)
        btn = gr.Button("Find similar proposals")
        otpt = gr.Markdown("")
        btn.click(fn=compare_proposals, inputs=[inpt, slider], outputs=otpt)

    demo.launch(show_error=True)
requirements.txt ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.2.0
3
+ annotated-types==0.6.0
4
+ anyio==4.3.0
5
+ attrs==23.2.0
6
+ beautifulsoup4==4.12.3
7
+ certifi==2024.2.2
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ colorama==0.4.6
11
+ contourpy==1.2.0
12
+ cycler==0.12.1
13
+ DateTime==5.4
14
+ et-xmlfile==1.1.0
15
+ exceptiongroup==1.2.0
16
+ fastapi==0.110.0
17
+ ffmpy==0.3.2
18
+ filelock==3.13.1
19
+ fonttools==4.49.0
20
+ fsspec==2024.2.0
21
+ gradio==4.19.2
22
+ gradio_client==0.10.1
23
+ h11==0.14.0
24
+ httpcore==1.0.4
25
+ httpx==0.27.0
26
+ huggingface-hub==0.21.3
27
+ idna==3.6
28
+ importlib_metadata==7.0.2
29
+ importlib_resources==6.1.2
30
+ Jinja2==3.1.3
31
+ joblib==1.3.2
32
+ jsonschema==4.21.1
33
+ jsonschema-specifications==2023.12.1
34
+ kiwisolver==1.4.5
35
+ Markdown==3.5.2
36
+ markdown-it-py==3.0.0
37
+ MarkupSafe==2.1.5
38
+ matplotlib==3.8.3
39
+ mdurl==0.1.2
40
+ mpmath==1.3.0
41
+ networkx==3.2.1
42
+ nltk==3.8.1
43
+ numpy==1.26.4
44
+ openpyxl==3.1.2
45
+ orjson==3.9.15
46
+ packaging==23.2
47
+ pandas==2.2.1
48
+ pillow==10.2.0
49
+ pydantic==2.6.3
50
+ pydantic_core==2.16.3
51
+ pydub==0.25.1
52
+ Pygments==2.17.2
53
+ pyparsing==3.1.1
54
+ python-dateutil==2.8.2
55
+ python-multipart==0.0.9
56
+ pytz==2024.1
57
+ PyYAML==6.0.1
58
+ referencing==0.33.0
59
+ regex==2023.12.25
60
+ requests==2.31.0
61
+ rich==13.7.1
62
+ rpds-py==0.18.0
63
+ ruff==0.3.0
64
+ safetensors==0.4.2
65
+ schedule==1.2.1
66
+ scikit-learn==1.4.1.post1
67
+ scipy==1.12.0
68
+ semantic-version==2.10.0
69
+ sentence-transformers==2.5.0
70
+ shellingham==1.5.4
71
+ six==1.16.0
72
+ sniffio==1.3.1
73
+ soupsieve==2.5
74
+ starlette==0.36.3
75
+ sympy==1.12
76
+ threadpoolctl==3.3.0
77
+ tokenizers==0.15.2
78
+ tomlkit==0.12.0
79
+ toolz==0.12.1
80
+ torch==2.2.1
81
+ tqdm==4.66.2
82
+ transformers==4.38.1
83
+ typer==0.9.0
84
+ typing_extensions==4.10.0
85
+ tzdata==2024.1
86
+ urllib3==2.2.1
87
+ uvicorn==0.27.1
88
+ websockets==11.0.3
89
+ zipp==3.17.0
90
+ zope.interface==6.2