File size: 3,219 Bytes
d8a4dbe
 
eec46b2
d8a4dbe
eec46b2
 
 
 
84a9829
eec46b2
 
 
 
 
 
 
61ecf6c
eec46b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60f1365
 
eec46b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""The following program will read in 2 XL sheets of KP matches and the user will evaluate the quality of the matching"""
import streamlit as st
import ast
import pandas as pd
import random
from time import sleep
# Similarity threshold used to pick the worksheet; index 2 ("0.85") is the
# initially selected option.
threshold = st.radio(
    label="Select threshold",
    options=["0.7", "0.8", "0.85", "0.87", "0.9", "0.95"],
    index=2,
)
# Upper bound on how many key-phrases are sampled for review.
num_kp = st.slider(
    "Number of key-phrases to select",
    min_value=10,
    max_value=100,
    value=50,
    step=5,
)
# Uploaded workbook; stays None until the user picks a file.
xl1 = st.file_uploader("Choose first file", key="xl1")
#xl2 = st.file_uploader("Choose second file", key="xl2")
def merge_dicts(x):
    """Fold every dict held in *x* into one dict, skipping missing entries.

    *x* must offer ``dropna()`` (it is used as a pandas aggregation function
    below). When the same key occurs in several dicts, the value from the
    latest dict wins.
    """
    combined = {}
    for mapping in x.dropna():
        combined.update(mapping)
    return combined
def clean_dict(x):
    """Return the string *x* with each single quote swapped for a double quote."""
    pieces = x.split("'")
    return '"'.join(pieces)
def _load_grouped_matches(uploaded_file, threshold_label):
    """Read one threshold worksheet and return one row per KP with merged matches.

    Args:
        uploaded_file: Excel workbook, passed straight to ``pd.read_excel``.
        threshold_label: Threshold text; the sheet read is
            ``"<threshold_label> Threshold"``.

    Returns:
        DataFrame with columns ``KP``, ``Matched KPs`` (a dict of match to
        score) and ``dict len``, sorted by ``dict len`` descending. The frame
        is empty when no reviewable match remains.
    """
    sheet = pd.read_excel(uploaded_file, sheet_name=f"{threshold_label} Threshold")
    # Blank cells are read as NaN, which clean_dict cannot process; skip them.
    sheet.dropna(subset=["Matched KPs"], inplace=True)
    # The column stores dict literals as text: normalise the quotes, then parse.
    # NOTE(review): the quote swap presumably breaks on key-phrases that
    # themselves contain quote characters -- confirm against real data.
    parsed = sheet["Matched KPs"].apply(clean_dict).apply(ast.literal_eval)
    # Drop "null" placeholders and direct matches (score >= 0.99) in one pass.
    # The "null" test comes first so the string is never compared to a float.
    sheet["Matched KPs"] = parsed.apply(
        lambda d: {k: v for k, v in d.items() if v != "null" and v < 0.99}
    )
    # Rows with nothing left to review are discarded.
    sheet = sheet[sheet["Matched KPs"].apply(len) > 0]
    if sheet.empty:
        return pd.DataFrame(columns=["KP", "Matched KPs", "dict len"])
    # The same KP may appear on several rows; merge their dicts into one.
    grouped = sheet[["KP", "Matched KPs"]].groupby("KP").agg(merge_dicts)
    grouped["dict len"] = grouped["Matched KPs"].apply(len)
    return grouped.sort_values(by="dict len", ascending=False).reset_index()


def _render_review_form(grouped, requested_kps, per_kp_limit=5):
    """Show a checkbox form over a sample of KPs and report the ticked share.

    Args:
        grouped: Frame produced by ``_load_grouped_matches``.
        requested_kps: Number of KPs to sample; capped at the rows available.
        per_kp_limit: Maximum number of matches displayed for a single KP.
    """
    if grouped.empty:
        st.warning("No non-direct matches found in this sheet.")
        return
    with st.form("First excel file"):
        choices = []
        sample_size = min(requested_kps, len(grouped))
        # Fixed seeds keep the sampled KPs (and shuffled matches) identical
        # across Streamlit reruns, so widget keys stay stable.
        for _, row in grouped.sample(n=sample_size, random_state=42).iterrows():
            matches = row["Matched KPs"]
            if not matches:
                continue
            shown = list(matches)
            if len(shown) > per_kp_limit:
                random.Random(42).shuffle(shown)
                shown = shown[:per_kp_limit]
            kp_col, match_col = st.columns(2)
            with kp_col:
                st.write(row["KP"])
            with match_col:
                for match in shown:
                    # len(choices) is evaluated before the append, giving each
                    # checkbox a unique running integer key.
                    choices.append(
                        st.checkbox(
                            f"{match}: {matches[match]:0.2f}", key=len(choices)
                        )
                    )
                st.markdown("""---""")
        submitted = st.form_submit_button("Submit")
        if submitted:
            ticked = sum(1 for choice in choices if choice)
            total = len(choices)
            # Guard the ratio so a form without checkboxes cannot divide by zero.
            ratio = ticked / total if total else 0.0
            st.write(ticked, total, f"{ratio: 0.3f}")


if xl1 is not None:
    _render_review_form(_load_grouped_matches(xl1, threshold), num_kp)