azizalto committed on
Commit
8106e91
1 Parent(s): d0c5a8e

track data

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. app.py +107 -0
  3. data/test.csv +3 -0
  4. data/titles.csv +3 -0
  5. data/train.csv +3 -0
  6. requirements.txt +4 -0
  7. src/config.py +15 -0
  8. src/utils.py +7 -0
.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ data/*.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ from src.config import APP_PAGE_HEADER
5
+ from src.utils import search_df
6
+
7
+ APP_PAGE_HEADER()
8
+
9
+
10
@st.experimental_singleton
def _load_frames():
    """Read the CSV files and build every derived frame exactly once.

    The result is cached by Streamlit, so reruns of the script reuse the same
    frames instead of re-reading and re-merging the CSV files.

    Returns:
        dict mapping attribute name -> DataFrame, with the keys ``train``,
        ``test``, ``titles``, ``merged``, ``train_df`` and ``train_kg``.
    """
    # shuffle data (done once, so the row order stays stable between reruns)
    train = pd.read_csv("data/train.csv").sample(frac=1).reset_index(drop=True)
    test = pd.read_csv("data/test.csv")
    titles = pd.read_csv("data/titles.csv")

    # add code titles to train data
    merged = train.merge(titles, left_on="context", right_on="code")
    train_df = merged[
        ["id", "code", "anchor", "context", "target", "title", "score"]
    ].copy()

    # add relations / edges for knowledge graph
    train_kg = train_df.copy()
    train_kg["relation"] = (
        train_kg["code"]
        + " || "
        + train_kg["title"]
        + " || "
        + train_kg["score"].astype(str)
    )

    return {
        "train": train,
        "test": test,
        "titles": titles,
        "merged": merged,
        "train_df": train_df,
        "train_kg": train_kg,
    }


class LoadData:
    """Container for the data frames used by the app.

    Previously all loading happened in the class body, i.e. every time the
    module was executed, and the ``@st.cache`` decorator only wrapped the
    (trivial) instantiation. Loading now goes through ``_load_frames`` so the
    expensive work is shared by every ``LoadData()`` call.

    Attributes:
        train: shuffled contents of ``data/train.csv``.
        test: contents of ``data/test.csv``.
        titles: contents of ``data/titles.csv``.
        merged: ``train`` joined with ``titles`` on ``context == code``.
        train_df: the columns of ``merged`` used by the app.
        train_kg: ``train_df`` plus a ``relation`` column
            (``code || title || score``) used as edge label.

    NOTE: the frames are shared between instances; treat them as read-only
    and copy before mutating.
    """

    def __init__(self) -> None:
        frames = _load_frames()
        self.train: pd.DataFrame = frames["train"]
        self.test: pd.DataFrame = frames["test"]
        self.titles: pd.DataFrame = frames["titles"]
        self.merged: pd.DataFrame = frames["merged"]
        self.train_df: pd.DataFrame = frames["train_df"]
        self.train_kg: pd.DataFrame = frames["train_kg"]
25
+
26
+
27
class App:
    """Top-level page: loads the data and renders header, body and footer."""

    def __init__(self):
        # All data frames the page needs live on this object.
        self.data = LoadData()

    def run(self, debug=False):
        """Render the three page sections in order, forwarding ``debug``."""
        sections = (self.render_header, self.render_body, self.render_footer)
        for render_section in sections:
            render_section(debug)

    def render_header(self, *args, **kwargs):
        """Placeholder: the page currently has no header content."""
        return None

    def render_body(self, *args, **kwargs):
        """Show the (optionally searched) train table, then the graph view."""
        train = self.data.train
        st.write(f"> Train data `{train.shape[0]}` rows")

        query = st.text_input("search", "")
        # An empty query shows the full table; otherwise only matching rows.
        shown = search_df(train, query) if query else train
        st.write(shown)

        Helper().visualize()

    def render_footer(self, *args, **kwargs):
        """Placeholder: the page currently has no footer content."""
        return None
51
+
52
+
53
class Helper(App):
    """Knowledge-graph view of the anchor/target phrase pairs.

    Inherits ``App.__init__``, so ``self.data`` is a ``LoadData`` instance;
    only ``self.data.train_kg`` is read here.
    """

    # Upper bound on the number of rows (edges) drawn in one graph.
    MAX_EDGES = 100

    def visualize(self, *args, **kwargs):
        """Render the filter widgets and the resulting phrase network.

        Without any filter the first ``MAX_EDGES`` rows are drawn. Choosing a
        score restricts the rows to that score; a non-empty search term takes
        precedence and restricts the rows to those matching the term in any
        column. Each drawn row becomes an ``anchor -> target`` edge labelled
        with the row's ``relation`` text.
        """
        data = self.data.train_kg

        # filter data for visualization
        sample = data[: self.MAX_EDGES]

        score_col, search_col = st.columns(2)

        score_options = [""] + sorted(data["score"].dropna().unique().tolist())
        score = score_col.selectbox("visualize by score", score_options)
        # Compare against the empty placeholder explicitly: a plain truthiness
        # test would also skip a selected score of 0.0.
        if score != "":
            sample = data[data["score"] == float(score)][: self.MAX_EDGES]

        filter_ = search_col.text_input("search term to visualize matching phrases")
        if filter_:
            sample = search_df(data, filter_)[: self.MAX_EDGES]

        # create graph
        anchors = sample["anchor"].tolist()
        targets = sample["target"].tolist()
        # A phrase may be both an anchor and a target; keep it once, in
        # first-seen order.
        nodes = list(dict.fromkeys(anchors + targets))
        # Repeated (anchor, target) pairs collapse to one edge; last label wins.
        edge_labels = dict(zip(zip(anchors, targets), sample["relation"].tolist()))

        # create PyVis network from the graph data
        self.pyvis_network(nodes, edge_labels)
        st.write(f"> sample size: {sample.shape[0]}")

    def pyvis_network(self, nodes, edge_labels):
        """Build the PyVis network and embed it in the page.

        Args:
            nodes: node identifiers (phrases).
            edge_labels: mapping ``(source, target) -> label``.
        """
        # Imported lazily so the dependency is only needed when a graph is drawn.
        from stvis import pv_static

        g = self.build_network(edge_labels, nodes)

        pv_static(g)

    @staticmethod
    @st.experimental_singleton
    def build_network(edge_labels, nodes):
        """Create a PyVis ``Network`` with the given nodes and labelled edges.

        Wrapped in ``st.experimental_singleton`` so the network object is
        cached by Streamlit rather than rebuilt on every call.

        Args:
            edge_labels: mapping ``(source, target) -> label``; the label is
                attached as the edge's ``title``.
            nodes: node identifiers to add before the edges.

        Returns:
            The populated ``pyvis.network.Network``.
        """
        # src: https://stackoverflow.com/a/67279471/2839786
        from pyvis.network import Network

        g = Network(
            height="800px",
            width="1400px",
            heading="U.S. Patent Phrase/Context Network",
            # was "#bbbffz", which is not a valid hex colour ("z")
            bgcolor="#bbbfff",
        )  # notebook=True,
        for node in nodes:
            g.add_node(node)
        for (n1, n2), label in edge_labels.items():
            g.add_edge(n1, n2, title=label, show_edge_weights=True)  # weight 42
        return g
103
+
104
+
105
# Script entry point: build the page object (which loads the data) and render
# header, body and footer with debug enabled.
if __name__ == "__main__":
    app = App()
    app.run(debug=True)
data/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a21727914b70c69f48a1aec1bb7e7c4d7f01adf1ae73f0c77e7a2b62dc6a1de
3
+ size 1965
data/titles.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f138d6bdf2939ba576b96b633d81366123b5f64b9842f567360fb1f9e86a5ace
3
+ size 21605031
data/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b233317683bfab1dcc949ed4055f9ed168a26b9ef2b0a7cc0091a6bfbb5139da
3
+ size 2141136
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ pandas
2
+ streamlit
3
+ pyvis
4
+ stvis
src/config.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def APP_PAGE_HEADER():
    """Apply the page-wide Streamlit settings and hide the default chrome.

    Sets the tab title and icon, the wide layout and a collapsed sidebar, then
    injects a small stylesheet that hides the ``#MainMenu`` and ``footer``
    elements.
    """
    page_settings = {
        "page_title": "U.S. Patent",
        "page_icon": "🔬",
        "layout": "wide",
        "initial_sidebar_state": "collapsed",
    }
    st.set_page_config(**page_settings)

    # Raw HTML is required here, hence unsafe_allow_html below.
    chrome_css = (
        "\n"
        "<style>\n"
        "#MainMenu {visibility: hidden;}\n"
        "footer {visibility: hidden;}\n"
        "</style>\n"
    )
    st.markdown(chrome_css, unsafe_allow_html=True)
src/utils.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+
5
def search_df(
    df: pd.DataFrame, substring: str, case: bool = False, regex: bool = True
) -> pd.DataFrame:
    """Return the rows of ``df`` in which at least one column contains ``substring``.

    Every column is converted to ``str`` before matching, so numeric columns
    are searched by their textual form.

    Args:
        df: frame to search.
        substring: text (or regular expression when ``regex`` is true) to
            look for.
        case: match case-sensitively when true. The query is passed through
            unchanged; it used to be lower-cased first, which made
            case-sensitive searches for anything containing an uppercase
            letter fail and also altered regex escapes such as ``\\D``.
        regex: interpret ``substring`` as a regular expression (the previous,
            and still default, behaviour). Pass ``False`` to match raw user
            input literally, e.g. a query containing ``(``.

    Returns:
        The matching rows of ``df`` (original index preserved); an empty
        selection when ``df`` has no columns.
    """
    if df.shape[1] == 0:
        # Nothing to search in; np.column_stack would raise on an empty list.
        return df.iloc[0:0]

    # Iterate by position-safe items() so duplicate column labels still yield
    # one Series each.
    column_hits = [
        series.astype(str).str.contains(substring, case=case, na=False, regex=regex)
        for _, series in df.items()
    ]
    mask = np.column_stack(column_hits)
    return df.loc[mask.any(axis=1)]