Mofe committed on
Commit
63994d4
·
1 Parent(s): 118228a

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +146 -0
  3. hess_papers_details.json +3 -0
  4. requirements.txt +2 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ hess_papers_details.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reference: https://huggingface.co/spaces/gwf-uwaterloo/acl-spectrum (By Ehsan Khamallo)
3
+ """
4
+
5
+ import os
6
+ import re
7
+ import pandas as pd
8
+ import plotly.express as px
9
+ import streamlit as st
10
+
11
# Use the wide page layout and name the data file read by the loader.
st.set_page_config(layout="wide")
DATA_FILE = "hess_papers_details.json"

# Inject Bootstrap, the "Permanent Marker" web font, and the `.title` CSS class
# used by the sidebar heading. The HTML is kept at column 0 so that markdown
# never interprets it as an indented code block.
st.markdown(
    """
<link href="https://cdn.jsdelivr.net/npm/bootstrap@4.6.1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha256-DF7Zhf293AJxJNTmh5zhoYYIMs2oXitRfBjY+9L//AY=" crossorigin="anonymous">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Permanent+Marker&display=swap" rel="stylesheet">
<style>
.title {
font-family: 'Permanent Marker', cursive;
font-size: 2.0rem;
}
</style>""",
    unsafe_allow_html=True,
)

# Sidebar heading, styled with the `.title` class defined above.
st.sidebar.write(
    """<center><p class="title">
HESS Papers Clustering 🌎🌿
</p></center>""",
    unsafe_allow_html=True,
)

# Sidebar description. HESS is a journal (not a conference), and the linked
# embedding model is SPECTER2; both href attributes are quoted consistently.
st.sidebar.write(
    """<p class="text-justify">
A clustered visualization of all papers submitted to the
<a href="https://www.hydrology-and-earth-system-sciences.net/">Hydrology and Earth System Sciences</a> (HESS) journal.
5318 papers are embedded using <a href="https://huggingface.co/allenai/specter2_base">SPECTER2</a> and reduced with
t-SNE. Papers span from as early as 1997 to 2023.
</p>""",
    unsafe_allow_html=True,
)
45
+
46
def to_string_authors(list_of_authors):
    """Join author names into one display string.

    * Up to two names are joined with " and ".
    * Three to five names are comma-separated, with ", and " before the last.
    * More than five names keep only the first five, followed by ", et al.".
    """
    count = len(list_of_authors)

    # Shortest case first: zero, one or two names.
    if count <= 2:
        return " and ".join(list_of_authors)

    # Three to five names: serial comma before the final author.
    if count <= 5:
        leading = list_of_authors[:-1]
        final = list_of_authors[-1]
        return ", ".join(leading) + ", and " + final

    # Long author lists are truncated after the fifth name.
    first_five = list_of_authors[:5]
    return ", ".join(first_five) + ", et al."
53
+
54
+
55
def load_df(data_file: os.PathLike):
    """Load the paper records and prepare them for plotting.

    Reads a records-oriented JSON file, copies the ``t-SNE1``/``t-SNE2``
    columns to ``x``/``y``, sorts the rows by publication date (oldest
    first) and stores ``year`` as an integer calendar year so callers can
    use it in ``range()`` and numeric comparisons.

    Args:
        data_file: Path or file-like object accepted by ``pandas.read_json``.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    df = pd.read_json(data_file, orient="records")
    df["x"] = df["t-SNE1"]
    df["y"] = df["t-SNE2"]

    if pd.api.types.is_numeric_dtype(df["year"]):
        # Plain numbers are taken to be calendar years already. Passing them
        # to pd.to_datetime would read them as offsets from the 1970 epoch.
        df["year"] = df["year"].astype(int)
        df = df.sort_values("year", ascending=True)
    else:
        # Date-like strings: sort on the full timestamp, then keep the year.
        df["year"] = pd.to_datetime(df["year"])
        df = df.sort_values("year", ascending=True)
        df["year"] = df["year"].dt.year

    return df
68
+
69
@st.cache_data
def load_dataframe():
    """Return the prepared papers DataFrame built from ``DATA_FILE``.

    Delegates to ``load_df``; the result is wrapped by ``st.cache_data``.
    """
    papers = load_df(DATA_FILE)
    return papers
72
+
73
DF = load_dataframe()
# Every point starts almost transparent; points that pass the filters below
# are raised to full opacity.
DF["opacity"] = 0.04

# Work on integer years regardless of how the loader stored the column, so
# that range() and the numeric comparisons below are well defined.
years = DF["year"].astype(int)
min_year, max_year = int(years.min()), int(years.max())

with st.sidebar:
    start_year, end_year = st.select_slider(
        "Publication year",
        options=[str(y) for y in range(min_year, max_year + 1)],
        value=(str(min_year), str(max_year)),
    )

    author_names = st.text_input("Author names (separated by comma)")

    title = st.text_input("Title")

    # TODO: add a topic filter, e.g.
    # topics = st.multiselect(
    #     "Topics",
    #     ["Topics 1: "],
    #     ["Topics 2: "],
    # )

# The slider options are strings; convert back before comparing.
start_year = int(start_year)
end_year = int(end_year)
df_mask = (years >= start_year) & (years <= end_year)

if author_names:
    # Ignore empty entries produced by stray commas, and escape the text so
    # characters such as "(" or "+" are matched literally instead of being
    # parsed as regular-expression syntax.
    authors = [a.strip() for a in author_names.split(",") if a.strip()]
    patterns = [re.compile(re.escape(a), re.IGNORECASE) for a in authors]
    # A paper matches when every requested name occurs in at least one of
    # its author strings.
    author_mask = DF["authors"].apply(
        lambda row: all(any(p.search(x) for x in row) for p in patterns)
    )
    df_mask = df_mask & author_mask

if title:
    # Case-insensitive substring match on the paper title.
    needle = title.lower()
    df_mask = df_mask & DF["title"].apply(lambda x: needle in x.lower())

DF.loc[df_mask, "opacity"] = 1.0
st.write(f"Number of points: {int(df_mask.sum())}")

fig = px.scatter(
    DF,
    x="x",
    y="y",
    opacity=DF["opacity"],
    color=DF["cluster"],
    width=1000,
    height=800,
    custom_data=("title", "authors_trimmed", "year"),
    color_continuous_scale="haline",
)

# Hover box: bold title, then authors, then year.
fig.update_traces(
    hovertemplate="<b>%{customdata[0]}</b><br>%{customdata[1]}<br>%{customdata[2]}<br><i>"
)

fig.update_layout(
    showlegend=False,
    font=dict(
        family="Times New Roman",
        size=30,
    ),
    hoverlabel=dict(
        align="left",
        font_size=14,
        font_family="Rockwell",
        namelength=-1,
    ),
)

# The t-SNE axes carry no meaningful label.
fig.update_xaxes(title="")
fig.update_yaxes(title="")

# Render inside the Streamlit page; Figure.show() targets a plotly renderer
# and does not take the figure itself as an argument.
st.plotly_chart(fig, use_container_width=True)
146
+
hess_papers_details.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c63e3d30332ede78cc6f180ef1b1470a171ec6459b905c839dab86ab718116d0
3
+ size 13282576
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pandas
2
+ plotly