Spaces:
Runtime error
Runtime error
track data
Browse files- .gitattributes +1 -0
- app.py +107 -0
- data/test.csv +3 -0
- data/titles.csv +3 -0
- data/train.csv +3 -0
- requirements.txt +4 -0
- src/config.py +15 -0
- src/utils.py +7 -0
.gitattributes
CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
data/*.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
from src.config import APP_PAGE_HEADER
|
5 |
+
from src.utils import search_df
|
6 |
+
|
7 |
+
# Apply the page configuration and the CSS that hides Streamlit's menu/footer.
APP_PAGE_HEADER()
|
8 |
+
|
9 |
+
|
10 |
+
@st.experimental_singleton
def _load_frames():
    """Read the CSV files and build the derived training tables.

    Wrapped in a Streamlit singleton so the file reads, the shuffle and the
    merge run on the first call only instead of every time the script runs.

    Returns:
        A dict mapping each ``LoadData`` attribute name to its DataFrame.
    """
    # shuffle data
    train = pd.read_csv("data/train.csv").sample(frac=1).reset_index(drop=True)
    test = pd.read_csv("data/test.csv")
    titles = pd.read_csv("data/titles.csv")

    # add code titles to train data
    merged = train.merge(titles, left_on="context", right_on="code")
    train_df = merged[["id", "code", "anchor", "context", "target", "title", "score"]].copy()

    # add relations / edges for knowledge graph
    train_kg = train_df.copy()
    train_kg["relation"] = (
        train_kg["code"] + " || " + train_kg["title"] + " || " + train_kg["score"].astype(str)
    )

    return {
        "train": train,
        "test": test,
        "titles": titles,
        "merged": merged,
        "train_df": train_df,
        "train_kg": train_kg,
    }


class LoadData:
    """Holder for the raw and derived DataFrames used by the app.

    Attributes:
        train: shuffled rows of ``data/train.csv``.
        test: rows of ``data/test.csv``.
        titles: rows of ``data/titles.csv``.
        merged: ``train`` joined to ``titles`` on ``context`` == ``code``.
        train_df: the id/code/anchor/context/target/title/score columns of
            ``merged``.
        train_kg: copy of ``train_df`` plus a ``relation`` column of the form
            ``"<code> || <title> || <score>"``.
    """

    train: pd.DataFrame
    test: pd.DataFrame
    titles: pd.DataFrame
    merged: pd.DataFrame
    train_df: pd.DataFrame
    train_kg: pd.DataFrame

    def __init__(self):
        # Every instance shares the cached frames; treat them as read-only.
        for name, frame in _load_frames().items():
            setattr(self, name, frame)
|
25 |
+
|
26 |
+
|
27 |
+
class App:
    """Streamlit page made of a header, a body and a footer section."""

    def __init__(self):
        self.data = LoadData()

    def run(self, debug=False):
        """Render the three page sections in order, forwarding ``debug``."""
        for render in (self.render_header, self.render_body, self.render_footer):
            render(debug)

    def render_header(self, *args, **kwargs):
        """Placeholder: the header currently renders nothing."""
        return None

    def render_body(self, *args, **kwargs):
        """Show the train table (optionally filtered) followed by the graph."""
        train = self.data.train
        st.write(f"> Train data `{train.shape[0]}` rows")

        query = st.text_input("search", "")
        # An empty query shows the full table; otherwise only matching rows.
        shown = search_df(train, query) if query else train
        st.write(shown)

        Helper().visualize()

    def render_footer(self, *args, **kwargs):
        """Placeholder: the footer currently renders nothing."""
        return None
|
51 |
+
|
52 |
+
|
53 |
+
class Helper(App):
    """Knowledge-graph visualisation on top of the data loaded by ``App``."""

    def visualize(self, *args, **kwargs):
        """Render the filter widgets and a PyVis network of anchor/target edges.

        The sample defaults to the first ``MAX_EDGES`` rows of ``train_kg``.
        Choosing a score narrows it to rows with that score; a non-empty
        search term then replaces the sample with the rows matching the term.
        Positional and keyword arguments are accepted but unused.
        """
        # filter data for visualization
        MAX_EDGES = 100
        data = self.data.train_kg
        sample = data[:MAX_EDGES]

        st1, st2 = st.columns(2)

        score = st1.selectbox("visualize by score", [""] + data["score"].unique().tolist())
        # Compare with the "" placeholder explicitly: a truthiness test would
        # treat a selected score of 0.0 as "nothing selected".
        if score != "":
            sample = data[data["score"] == float(score)][:MAX_EDGES]

        filter_ = st2.text_input("search term to visualize matching phrases")
        if filter_:
            sample = search_df(data, filter_)[:MAX_EDGES]

        # create graph
        nodes = list(sample["anchor"].unique()) + list(sample["target"].unique())
        edges = list(zip(sample["anchor"].tolist(), sample["target"].tolist()))
        labels = sample["relation"].tolist()
        # Repeated (anchor, target) pairs keep only the last relation label.
        edge_labels = dict(zip(edges, labels))

        # create PyVis network from the graph data
        self.pyvis_network(nodes, edge_labels)
        st.write(f"> sample size: {sample.shape[0]}")

    def pyvis_network(self, nodes, edge_labels):
        """Build the network for ``nodes``/``edge_labels`` and embed it in the page."""
        from stvis import pv_static

        g = self.build_network(edge_labels, nodes)

        pv_static(g)

    @staticmethod
    @st.experimental_singleton
    def build_network(edge_labels, nodes):
        """Return a PyVis ``Network`` with the given nodes and labelled edges.

        Args:
            edge_labels: dict mapping ``(source, target)`` pairs to the text
                stored as the edge ``title``.
            nodes: iterable of node identifiers to add before the edges.
        """
        # src: https://stackoverflow.com/a/67279471/2839786
        from pyvis.network import Network

        # NOTE(review): "#bbbffz" is not a valid hex colour ("z" is not a hex
        # digit); left unchanged because the intended colour is unknown.
        g = Network(
            height="800px",
            width="1400px",
            heading="U.S. Patent Phrase/Context Network",
            bgcolor="#bbbffz",
        )  # notebook=True,
        for node in nodes:
            g.add_node(node)
        for (n1, n2), label in edge_labels.items():
            g.add_edge(n1, n2, title=label, show_edge_weights=True)  # weight 42
        return g
|
103 |
+
|
104 |
+
|
105 |
+
if __name__ == "__main__":
    # Script entry point: build the app and render the page.
    app = App()
    app.run(debug=True)
|
data/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a21727914b70c69f48a1aec1bb7e7c4d7f01adf1ae73f0c77e7a2b62dc6a1de
|
3 |
+
size 1965
|
data/titles.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f138d6bdf2939ba576b96b633d81366123b5f64b9842f567360fb1f9e86a5ace
|
3 |
+
size 21605031
|
data/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b233317683bfab1dcc949ed4055f9ed168a26b9ef2b0a7cc0091a6bfbb5139da
|
3 |
+
size 2141136
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
streamlit
|
3 |
+
pyvis
|
4 |
+
stvis
|
src/config.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
|
4 |
+
def APP_PAGE_HEADER():
    """Set the page configuration and hide Streamlit's main menu and footer."""
    page_options = {
        "page_title": "U.S. Patent",
        "page_icon": "🔬",
        "layout": "wide",
        "initial_sidebar_state": "collapsed",
    }
    st.set_page_config(**page_options)

    # CSS injected as raw HTML, hence unsafe_allow_html below.
    hidden_chrome_css = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
    st.markdown(hidden_chrome_css, unsafe_allow_html=True)
|
src/utils.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
def search_df(df: pd.DataFrame, substring: str, case: bool = False) -> pd.DataFrame:
    """Return the rows of *df* in which at least one column contains *substring*.

    Every column is converted to ``str`` before matching, so numeric columns
    are searched by their text form.

    Args:
        df: frame to search.
        substring: literal text to look for (not a regular expression).
        case: match case-sensitively when True; case-insensitively otherwise.

    Returns:
        The matching rows of *df*, with the original index and columns.
    """
    if df.shape[1] == 0:
        # No columns means no cell can match; np.column_stack would raise on
        # the empty list.
        return df.iloc[0:0]

    # regex=False: the term comes from a free-text box, so characters such as
    # "(" or "*" must match literally rather than raise re.error.
    # The needle is passed unchanged so that case=True honours its casing.
    mask = np.column_stack(
        [
            df[col].astype(str).str.contains(substring, case=case, regex=False, na=False)
            for col in df
        ]
    )
    return df.loc[mask.any(axis=1)]
|