Spaces:

DanichOne
/

Proposal_Similarity_Kusama

Runtime error

App Files Community

DanichOne committed on Mar 10, 2024

Commit

834975a

verified ·

1 Parent(s): b1d0773

Upload 2 files

Browse files

Files changed (2) hide show

main.py +186 -0
requirements.txt +90 -0

main.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import requests
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+import numpy as np
+import threading
+import gradio as gr
+import re
+from bs4 import BeautifulSoup
+from markdown import markdown
+import nltk
+from nltk.tokenize import sent_tokenize
+import string
+import unicodedata
+nltk.download('punkt')  # runs at import time; presumably the tokenizer data sent_tokenize needs - confirm
+POST_ID = 0  # id of the next post to request; incremented by get_proposals/update_proposals after each stored post
+REFERENDUM_TYPE = "referendums_v2"  # sent as the proposalType query parameter of the API request
+VOTE_TYPE = "ReferendumV2"  # "Motion", "Fellowship", "Referendum", "ReferendumV2", "DemocracyProposal" (not read anywhere in this file)
+UPDATE_INTERVAL = 1800  # delay handed to threading.Timer in run_periodically, i.e. seconds between refreshes
def _as_numpy(x):
    """Return *x* in a form ``np.dot`` accepts, unwrapping torch tensors.

    The check is duck-typed so this module does not have to import torch
    itself: anything exposing ``detach`` and ``cpu`` is treated as a tensor.
    """
    if hasattr(x, "detach") and hasattr(x, "cpu"):
        # np.dot cannot convert tensors that live on an accelerator or that
        # track gradients; move them to host memory first.
        return x.detach().cpu().numpy()
    return x


def dot_product(u, v):
    """Return the dot product of two vectors.

    Args:
        u: array-like or torch tensor.
        v: array-like or torch tensor.

    Returns:
        Whatever ``np.dot`` returns for the two (converted) operands; a NumPy
        scalar for 1-D inputs.
    """
    return np.dot(_as_numpy(u), _as_numpy(v))
# Compiled once at import time instead of on every call.
_MARKDOWN_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def markdn_2_str(text):
    """Convert Markdown source to plain text without links or newlines.

    The text is rendered to HTML, the HTML text nodes are joined with single
    spaces, and any remaining Markdown-style links, bare http(s) URLs and
    newline characters are replaced by spaces.

    Args:
        text: Markdown source. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text; ``""`` when *text* is ``None`` or empty.
    """
    if not text:
        # Posts can arrive without content; markdown() would raise on None.
        return ""
    html = markdown(text)
    # find_all is the supported spelling; findAll is a deprecated alias.
    fragments = BeautifulSoup(html, features="html.parser").find_all(string=True)
    clean_text = ' '.join(fragments)
    clean_text = _MARKDOWN_LINK_RE.sub(' ', clean_text)  # remove markdown style links
    clean_text = _URL_RE.sub(' ', clean_text)  # remove regular links
    return clean_text.replace('\n', ' ')  # remove \n
_DIGIT_RE = re.compile(r'(\d)')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)
# Example phrasings of a funding request; their joint embedding is the
# reference that candidate sentences are scored against.
_KSM_REFERENCE_PHRASES = (
    "requests a total of 1500 KSM",
    "requests a total of 7450 $ (17 KSM)",
    "Requested amount: 3,333 KSM",
    "Requested funding 78,804 USD // 2770 KSM",
    "Requested KSM: 598",
)
_ksm_reference_embedding = None


def _get_ksm_reference_embedding():
    """Return the reference-phrase embedding, encoding it on first use."""
    global _ksm_reference_embedding
    if _ksm_reference_embedding is None:
        # Joined with a space so that adjacent phrases are not fused into
        # tokens such as "KSMrequests".
        _ksm_reference_embedding = model.encode(
            " ".join(_KSM_REFERENCE_PHRASES), convert_to_tensor=True)
    return _ksm_reference_embedding


def _extract_ksm_amount(sentence):
    """Return the first digit-bearing token next to a "KSM" token, or None."""
    tokens = []
    for raw in sentence.split():
        # Tokens with digits keep their punctuation ("3,333"); all others are
        # stripped of it, and tokens that become empty ("$", "//") are dropped.
        token = raw if _DIGIT_RE.search(raw) else raw.translate(_STRIP_PUNCTUATION)
        if token:
            tokens.append(token)
    for idx, token in enumerate(tokens):
        if "KSM" not in token:
            continue
        # Clamp the lower bound: with idx == 0 a plain idx - 1 would be -1 and
        # the slice would wrap around instead of covering the neighbours.
        for candidate in tokens[max(idx - 1, 0):idx + 2]:
            if _DIGIT_RE.search(candidate):
                return candidate
    return None


def get_sum(prop):
    """Extract the requested KSM amount from a proposal's Markdown text.

    The proposal is converted to plain text and split into sentences. Among
    the sentences that mention "KSM", the one whose embedding has the highest
    dot product with the reference phrases is chosen, and the first token
    containing a digit in the window (previous token, KSM token, next token)
    is returned.

    Args:
        prop: proposal content as Markdown; may be ``None`` or empty.

    Returns:
        The matching token as a string exactly as it appears in the text
        (e.g. ``"3,333"``), or ``None`` when no amount is found.
    """
    if not prop:
        return None
    text = markdn_2_str(unicodedata.normalize("NFKD", prop))
    # Only sentences mentioning KSM can ever be selected, so only those need
    # to be embedded.
    candidates = [sentence for sentence in sent_tokenize(text) if "KSM" in sentence]
    if not candidates:
        return None
    if len(candidates) == 1:
        best = candidates[0]
    else:
        ref = _get_ksm_reference_embedding()
        best = max(
            candidates,
            key=lambda sentence: dot_product(model.encode(sentence, convert_to_tensor=True), ref),
        )
    return _extract_ksm_amount(best)
_REQUEST_TIMEOUT = 30  # seconds to wait for the API before giving up on a request


def get_proposals():
    """Download posts one id at a time, starting at POST_ID, into ``df``.

    Each successful response adds a row ``[content, status, ksm]`` to the
    global ``df`` under the label POST_ID and advances POST_ID by one. The
    loop stops at the first response that is not OK, or when the request or
    JSON decoding fails. In every case ``event`` is set afterwards so that the
    embedding thread processes whatever rows were added.
    """
    global POST_ID
    while True:
        try:
            response = requests.post(
                f"https://api.polkassembly.io/api/v1/posts/on-chain-post?proposalType={REFERENDUM_TYPE}&postId={POST_ID}",
                headers={"x-network": "kusama"},
                timeout=_REQUEST_TIMEOUT)  # without a timeout a stalled connection blocks forever
            if not response.ok:
                break
            proposal_data = response.json()
        except requests.RequestException as exc:
            # Covers connection errors, timeouts and undecodable JSON bodies.
            print(f"Stopping proposal download at post {POST_ID}: {exc}")
            break
        # A post may have no content; store an empty string so downstream text
        # processing does not receive None.
        content = proposal_data.get("content") or ""
        df.loc[POST_ID] = [content, proposal_data.get("status"), get_sum(content)]
        POST_ID += 1
    event.set()
def get_embeddings():
    """Populate ``df_emb`` with one embedding per row of ``df``.

    Row ``i`` of ``df_emb`` receives the model encoding of the plain-text
    version of the ``content`` value found at position ``i`` of ``df``.
    """
    for position, content in enumerate(df['content']):
        plain_text = markdn_2_str(content)
        df_emb.loc[position] = [model.encode(plain_text)]
def update_proposals():
    """Download any posts published since the previous download.

    The work is exactly what get_proposals() does (continue from POST_ID
    until the API stops returning posts, then set ``event``), so this
    delegates to it instead of keeping a second copy of the loop.
    """
    get_proposals()
def update_embeddings():
    """Keep ``df_emb`` in step with ``df``; intended to run in its own thread.

    Loops forever. Each pass blocks until ``event`` is set, prints POST_ID
    and the number of rows in ``df``, then either embeds the rows of ``df``
    that ``df_emb`` does not have yet or, when both frames have the same
    length, clears ``event`` so the next pass blocks again.
    """
    while True:
        event.wait()
        print(POST_ID)
        print(len(df))
        embedded, available = len(df_emb), len(df)
        if embedded == available:
            # Nothing left to embed: go back to sleep until the next signal.
            event.clear()
            continue
        for position in range(embedded, available):
            plain_text = markdn_2_str(df.iloc[position]['content'])
            df_emb.loc[position] = [model.encode(plain_text)]
def run_periodically():
    """Refresh the proposals now and schedule the next refresh.

    Calls update_proposals() and then arms a ``threading.Timer`` that calls
    this function again after UPDATE_INTERVAL seconds.
    """
    try:
        update_proposals()
    finally:
        # Reschedule even if this refresh raised, so one failure does not end
        # the periodic updates. No args are passed: this function takes none,
        # and passing any makes the timer callback fail with a TypeError.
        timer = threading.Timer(UPDATE_INTERVAL, run_periodically)
        timer.daemon = True  # a pending timer should not delay interpreter exit
        timer.start()
def compare_proposals(prop, count):
    """Return the stored proposals most similar to *prop* as one Markdown string.

    Similarity is the dot product between the embedding of *prop* and each
    embedding stored in ``df_emb``; the *count* highest-scoring proposals are
    returned in descending order of similarity.

    Args:
        prop: proposal text (Markdown) to compare against the stored ones.
        count: number of proposals to return; converted with ``int()`` so a
            float coming from the UI slider is accepted.

    Returns:
        For each match, an HTML header line with the requested KSM, status
        and position id followed by the proposal content, joined by
        ``"\\n "``. An empty string when no embeddings are stored yet.
    """
    count = int(count)
    query_emb = model.encode(markdn_2_str(prop))
    # Iterate the column directly rather than DataFrame.apply with row[0]:
    # integer keys on a label-indexed Series are deprecated in pandas 2.x.
    scores = np.array(
        [dot_product(embedding, query_emb) for embedding in df_emb['content']],
        dtype=float)
    best_match = np.argsort(-scores)[:count]
    sections = []
    for position in best_match:
        row = df.iloc[position]
        title = '''<span style="color:blue"><h2>Total KSM requested: {sum}, status: {status}, ID: {id}</h2></span> \n '''.format(
            sum=row['ksm'], status=row['status'], id=position)
        # Content can be missing; concatenating None to the title would raise.
        sections.append(title + (row['content'] or ""))
    return "\n ".join(sections)
if __name__ == '__main__':
    # event, model, df and df_emb are module globals that the functions in
    # this file read directly, so they must exist before any of them runs.
    event = threading.Event()
    model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
    df = pd.DataFrame(columns=['content', 'status', 'ksm'])
    df_emb = pd.DataFrame(columns=['content'])

    # Initial synchronous load: download every proposal, then embed them all.
    get_proposals()
    get_embeddings()
    POST_ID = len(df)

    # Daemon threads: update_embeddings loops forever, and as a non-daemon
    # thread it would keep the process alive after the web server has stopped.
    update_thread = threading.Thread(target=run_periodically, daemon=True)  # background proposals update
    upd_emb_thread = threading.Thread(target=update_embeddings, daemon=True)  # background embeddings update
    update_thread.start()
    upd_emb_thread.start()

    with gr.Blocks() as demo:
        gr.Markdown("<h1>Compare proposals</h1>")
        inpt = gr.Textbox(label="Input Proposal", lines=5, max_lines=12)
        # NOTE: this dropdown is displayed but is not an input of the click
        # handler below, so its value has no effect on the search.
        dr = gr.Dropdown(label="Vote type",
                         choices=["Motion", "Fellowship", "Referendum", "ReferendumV2", "DemocracyProposal"],
                         value="ReferendumV2", interactive=True)
        slider = gr.Slider(label="Number of proposals to output", minimum=1, maximum=20, step=1, value=5,
                           interactive=True)
        btn = gr.Button("Find similar proposals")
        otpt = gr.Markdown("")
        btn.click(fn=compare_proposals, inputs=[inpt, slider], outputs=otpt)
    demo.launch(show_error=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,90 @@

+aiofiles==23.2.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+beautifulsoup4==4.12.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+DateTime==5.4
+et-xmlfile==1.1.0
+exceptiongroup==1.2.0
+fastapi==0.110.0
+ffmpy==0.3.2
+filelock==3.13.1
+fonttools==4.49.0
+fsspec==2024.2.0
+gradio==4.19.2
+gradio_client==0.10.1
+h11==0.14.0
+httpcore==1.0.4
+httpx==0.27.0
+huggingface-hub==0.21.3
+idna==3.6
+importlib_metadata==7.0.2
+importlib_resources==6.1.2
+Jinja2==3.1.3
+joblib==1.3.2
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+Markdown==3.5.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.8.3
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.4
+openpyxl==3.1.2
+orjson==3.9.15
+packaging==23.2
+pandas==2.2.1
+pillow==10.2.0
+pydantic==2.6.3
+pydantic_core==2.16.3
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.1
+python-dateutil==2.8.2
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.33.0
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+ruff==0.3.0
+safetensors==0.4.2
+schedule==1.2.1
+scikit-learn==1.4.1.post1
+scipy==1.12.0
+semantic-version==2.10.0
+sentence-transformers==2.5.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.5
+starlette==0.36.3
+sympy==1.12
+threadpoolctl==3.3.0
+tokenizers==0.15.2
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.1
+tqdm==4.66.2
+transformers==4.38.1
+typer==0.9.0
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+uvicorn==0.27.1
+websockets==11.0.3
+zipp==3.17.0
+zope.interface==6.2