Mark7549 committed on
Commit
dde99f4
1 Parent(s): 6640785

updated front end

Browse files
Files changed (3) hide show
  1. .streamlit/config.toml +2 -0
  2. app.py +266 -243
  3. images/AGALMA_logo.png +0 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [theme]
2
+ primaryColor="B8E52B"
app.py CHANGED
@@ -10,7 +10,7 @@ import json
10
  from streamlit_tags import st_tags, st_tags_sidebar
11
 
12
 
13
- st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
14
 
15
  # Cache data
16
  @st.cache_data
@@ -46,6 +46,8 @@ models_for_word_dict = load_models_for_word_dict()
46
  lemma_counts = load_lemma_count_dict()
47
 
48
 
 
 
49
  # Set styles for menu
50
  styles = {
51
  "container": {"display": "flex", "justify-content": "center"},
@@ -70,289 +72,310 @@ styles = {
70
  "color": "#000"
71
  },
72
  "nav-link-selected": {
73
- "background-color": "rgb(254, 74, 75)",
74
  "color": "white",
75
  "font-weight": "bold"
76
  },
77
  "icon": {"display": "None"}
78
  }
79
 
80
- # Horizontal menu
81
- active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary', 'About', 'FAQ'],
82
- menu_icon="cast", default_index=0, orientation="horizontal", styles=styles)
 
 
 
 
 
 
 
83
 
84
 
85
- # Adding CSS style to remove list-style-type
86
- st.markdown("""
87
- <style>
88
- /* Define a class to remove list-style-type */
89
- .no-list-style {
90
- list-style-type: none;
91
- }
92
- </style>
93
- """, unsafe_allow_html=True)
94
 
95
 
96
 
97
- # Nearest neighbours tab
98
- if active_tab == "Nearest neighbours":
99
-
100
- # All models in a list
101
- eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
102
- all_models_words = load_all_models_words()
103
-
104
- with st.container():
105
- st.markdown("## Nearest Neighbours")
106
- st.markdown('Here you can extract the nearest neighbours to a chosen lemma. Please select one or more time slices and the preferred number of nearest neighbours.')
107
- target_word = st.multiselect("Enter a word", options=all_models_words, max_selections=1)
108
- if len(target_word) > 0:
109
- target_word = target_word[0]
110
-
111
- eligible_models = models_for_word_dict[target_word]
112
 
113
- models = st.multiselect(
114
- "Select models to search for neighbours",
115
- eligible_models
116
- )
117
- n = st.slider("Number of neighbours", 1, 50, 15)
118
 
119
- nearest_neighbours_button = st.button("Find nearest neighbours")
120
-
121
- if nearest_neighbours_button:
122
- if validate_nearest_neighbours(target_word, n, models) == False:
123
- st.error('Please fill in all fields')
124
- else:
125
- # Rewrite models to list of all loaded models
126
- models = load_selected_models(models)
127
-
128
- nearest_neighbours = get_nearest_neighbours(target_word, n, models)
129
 
130
- all_dfs = []
131
-
132
- # Create dataframes
133
- for model in nearest_neighbours.keys():
134
- st.write(f"### {model}")
135
- df = pd.DataFrame(
136
- nearest_neighbours[model],
137
- columns = ['Word', 'Cosine Similarity']
138
  )
139
-
140
- # Add word occurences to dataframe
141
- df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
142
-
143
 
 
 
 
 
 
 
144
 
145
- all_dfs.append((model, df))
146
- st.table(df)
147
-
148
-
149
- # Store content in a temporary file
150
- tmp_file = store_df_in_temp_file(all_dfs)
151
-
152
- # Open the temporary file and read its content
153
- with open(tmp_file, "rb") as file:
154
- file_byte = file.read()
155
 
156
- # Create download button
157
- st.download_button(
158
- "Download results",
159
- data=file_byte,
160
- file_name = f'nearest_neighbours_{target_word}_TEST.xlsx',
161
- mime='application/octet-stream'
 
 
162
  )
 
 
 
 
 
 
 
 
163
 
164
-
165
- # Cosine similarity tab
166
- elif active_tab == "Cosine similarity":
167
- all_models_words = load_all_models_words()
168
-
169
- with st.container():
170
- eligible_models_1 = []
171
- eligible_models_2 = []
172
- st.markdown("## Cosine similarity")
173
- st.markdown('Here you can extract the cosine similarity between two lemmas. Please select a time slice for each lemma. You can also calculate the cosine similarity between two vectors of the same lemma in different time slices.')
174
- col1, col2 = st.columns(2)
175
- col3, col4 = st.columns(2)
176
- with col1:
177
- word_1 = st.multiselect("Enter a word", placeholder="πατήρ", max_selections=1, options=all_models_words)
178
- if len(word_1) > 0:
179
- word_1 = word_1[0]
180
- eligible_models_1 = models_for_word_dict[word_1]
181
 
182
- with col2:
183
- time_slice_1 = st.selectbox("Time slice word 1", options = eligible_models_1)
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  with st.container():
187
- with col3:
188
- word_2 = st.multiselect("Enter a word", placeholder="μήτηρ", max_selections=1, options=all_models_words)
189
- if len(word_2) > 0:
190
- word_2 = word_2[0]
191
- eligible_models_2 = models_for_word_dict[word_2]
 
192
 
193
- with col4:
194
- time_slice_2 = st.selectbox("Time slice word 2", eligible_models_2)
195
-
196
- # Create button for calculating cosine similarity
197
- cosine_similarity_button = st.button("Calculate cosine similarity")
198
-
199
- # If the button is clicked, execute calculation
200
- if cosine_similarity_button:
201
- cosine_simularity_score = get_cosine_similarity(word_1, time_slice_1, word_2, time_slice_2)
202
- st.write(cosine_simularity_score)
203
-
204
- # 3D graph tab
205
- elif active_tab == "3D graph":
206
- st.markdown("## 3D graph")
207
- st.markdown('Here you can generate a 3D representation of the semantic space surrounding a target lemma. Please choose the lemma and the time slice.')
208
-
209
- col1, col2 = st.columns(2)
210
-
211
- # Load compressed word list
212
- all_models_words = load_all_models_words()
213
-
214
- with st.container():
215
- eligible_models = []
216
- with col1:
217
- word = st.multiselect("Enter a word", all_models_words, max_selections=1)
218
- if len(word) > 0:
219
- word = word[0]
220
- eligible_models = models_for_word_dict[word]
221
-
222
- with col2:
223
- time_slice = st.selectbox("Time slice", eligible_models)
224
 
225
- n = st.slider("Number of words", 1, 50, 15)
226
 
227
- graph_button = st.button("Create 3D graph")
228
-
229
- if graph_button:
230
- time_slice_model = convert_time_name_to_model(time_slice)
231
- nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
232
-
233
- fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
234
-
235
- st.plotly_chart(fig)
236
-
237
 
 
 
 
 
 
 
 
 
 
 
238
 
239
-
240
-
241
- # Dictionary tab
242
- elif active_tab == "Dictionary":
243
-
244
- with st.container():
245
- st.markdown('## Dictionary')
246
- st.markdown('Search a word in the Liddell-Scott-Jones dictionary (only Greek, no whitespaces).')
247
 
248
 
249
- all_lemmas = load_all_lemmas()
250
-
251
- # query_word = st.multiselect("Search a word in the LSJ dictionary", all_lemmas, max_selections=1)
252
-
253
- query_tag = st_tags(label='',
254
- text = '',
255
- value = [],
256
- suggestions = all_lemmas,
257
- maxtags = 1,
258
- key = '1'
259
- )
260
-
261
- # If a word has been selected by user
262
- if query_tag:
263
- st.write(f"### {query_tag[0]}")
264
 
265
- # Display word information
266
- if query_tag[0] in lemma_dict:
267
- data = lemma_dict[query_tag[0]]
268
- elif query_tag[0].capitalize() in lemma_dict: # Some words are capitalized in the dictionary
269
- data = lemma_dict[query_tag[0].capitalize()]
270
- else:
271
- st.error("Word not found in dictionary")
272
 
273
- # Put text in readable format
274
- text = format_text(data)
 
 
 
 
 
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
- st.markdown(format_text(data), unsafe_allow_html = True)
278
-
 
 
279
 
280
 
281
- st.markdown("""
282
- <style>
283
- .tab {
284
- display: inline-block;
285
- margin-left: 4em;
286
- }
287
- .tr {
288
- font-weight: bold;
289
- }
290
- .list-class {
291
- list-style-type: none;
292
- margin-top: 1em;
293
- }
294
- .primary-indicator {
295
- font-weight: bold;
296
- font-size: x-large;
297
- }
298
- .secondary-indicator {
299
- font-weight: bold;
300
- font-size: large;
301
- }
302
- .tertiary-indicator {
303
- font-weight: bold;
304
- font-size: medium;
305
- }
306
- .quaternary-indicator {
307
- font-weight: bold;
308
- font-size: medium;
309
- }
310
- .primary-class {
311
- padding-left: 2em;
312
- }
313
- .secondary-class {
314
- padding-left: 4em;
315
- }
316
- .tertiary-class {
317
- padding-left: 6em;
318
- }
319
- .quaternary-class {
320
- padding-left: 8em;
321
- }
322
- </style>
323
- """, unsafe_allow_html=True)
324
-
325
-
326
- # About tab
327
- elif active_tab == "About":
328
  st.markdown("""
329
  ## About
330
  Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus nec nunc ultricies ultricies.
331
  """)
332
-
333
-
334
- elif active_tab == "FAQ":
335
- st.markdown("""
336
- ## FAQ
337
- """)
338
-
339
- with st.expander('''**Which models is this interface based on?**'''):
340
- st.write(
341
- "This interface is based on five language models. \
342
- Language models are statistical models of language, \
343
- which store statistical information about word co-occurrence during the training phase. \
344
- During training they process a corpus of texts in the target language(s). \
345
- Once trained, models can be used to extract information about the language \
346
- (in this interface, we focus on the extraction of semantic information) or to perform specific linguistic tasks. \
347
- The models on which this interface is based are Word Embedding models."
348
- )
349
-
350
- with st.expander('''**Which corpus was used to train the models?**'''):
351
- st.write(
352
- "The five models on which this interface is based were trained on five slices of the Diorisis Ancient Greek Corpus (Vatri & McGillivray 2018)."
353
- )
354
 
355
 
356
-
357
-
358
 
 
 
 
 
 
 
 
 
 
 
10
  from streamlit_tags import st_tags, st_tags_sidebar
11
 
12
 
13
+ st.set_page_config(page_title="ἄγαλμα | AGALMA", layout="centered")
14
 
15
  # Cache data
16
  @st.cache_data
 
46
  lemma_counts = load_lemma_count_dict()
47
 
48
 
49
+
50
+
51
  # Set styles for menu
52
  styles = {
53
  "container": {"display": "flex", "justify-content": "center"},
 
72
  "color": "#000"
73
  },
74
  "nav-link-selected": {
75
+ "background-color": "#B8E52B",
76
  "color": "white",
77
  "font-weight": "bold"
78
  },
79
  "icon": {"display": "None"}
80
  }
81
 
82
+ with st.sidebar:
83
+ st.image('images/AGALMA_logo.png', width=250)
84
+ st.markdown('# ἄγαλμα | AGALMA')
85
+ selected = option_menu(None, ["App", "About", "FAQ", "License"],
86
+ menu_icon="menu", default_index=0, orientation="vertical")
87
+
88
+ if selected == "App":
89
+ # Horizontal menu
90
+ active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
91
+ menu_icon="cast", default_index=0, orientation="horizontal", styles=styles)
92
 
93
 
94
+ # Adding CSS style to remove list-style-type
95
+ st.markdown("""
96
+ <style>
97
+ /* Define a class to remove list-style-type */
98
+ .no-list-style {
99
+ list-style-type: none;
100
+ }
101
+ </style>
102
+ """, unsafe_allow_html=True)
103
 
104
 
105
 
106
+ # Nearest neighbours tab
107
+ if active_tab == "Nearest neighbours":
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ # All models in a list
110
+ eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
111
+ all_models_words = load_all_models_words()
 
 
112
 
113
+ with st.container():
114
+ st.markdown("## Nearest Neighbours")
115
+ st.markdown('Here you can extract the nearest neighbours to a chosen lemma. Please select one or more time slices and the preferred number of nearest neighbours.')
116
+ target_word = st.multiselect("Enter a word", options=all_models_words, max_selections=1)
117
+ if len(target_word) > 0:
118
+ target_word = target_word[0]
119
+
120
+ eligible_models = models_for_word_dict[target_word]
 
 
121
 
122
+ models = st.multiselect(
123
+ "Select models to search for neighbours",
124
+ eligible_models
 
 
 
 
 
125
  )
126
+ n = st.slider("Number of neighbours", 1, 50, 15)
127
+
128
+ nearest_neighbours_button = st.button("Find nearest neighbours")
 
129
 
130
+ if nearest_neighbours_button:
131
+ if validate_nearest_neighbours(target_word, n, models) == False:
132
+ st.error('Please fill in all fields')
133
+ else:
134
+ # Rewrite models to list of all loaded models
135
+ models = load_selected_models(models)
136
 
137
+ nearest_neighbours = get_nearest_neighbours(target_word, n, models)
 
 
 
 
 
 
 
 
 
138
 
139
+ all_dfs = []
140
+
141
+ # Create dataframes
142
+ for model in nearest_neighbours.keys():
143
+ st.write(f"### {model}")
144
+ df = pd.DataFrame(
145
+ nearest_neighbours[model],
146
+ columns = ['Word', 'Cosine Similarity']
147
  )
148
+
149
+ # Add word occurences to dataframe
150
+ df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
151
+
152
+
153
+
154
+ all_dfs.append((model, df))
155
+ st.table(df)
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
+ # Store content in a temporary file
159
+ tmp_file = store_df_in_temp_file(all_dfs)
160
+
161
+ # Open the temporary file and read its content
162
+ with open(tmp_file, "rb") as file:
163
+ file_byte = file.read()
164
+
165
+ # Create download button
166
+ st.download_button(
167
+ "Download results",
168
+ data=file_byte,
169
+ file_name = f'nearest_neighbours_{target_word}_TEST.xlsx',
170
+ mime='application/octet-stream'
171
+ )
172
+
173
 
174
+ # Cosine similarity tab
175
+ elif active_tab == "Cosine similarity":
176
+ all_models_words = load_all_models_words()
177
+
178
+ with st.container():
179
+ eligible_models_1 = []
180
+ eligible_models_2 = []
181
+ st.markdown("## Cosine similarity")
182
+ st.markdown('Here you can extract the cosine similarity between two lemmas. Please select a time slice for each lemma. You can also calculate the cosine similarity between two vectors of the same lemma in different time slices.')
183
+ col1, col2 = st.columns(2)
184
+ col3, col4 = st.columns(2)
185
+ with col1:
186
+ word_1 = st.multiselect("Enter a word", placeholder="πατήρ", max_selections=1, options=all_models_words)
187
+ if len(word_1) > 0:
188
+ word_1 = word_1[0]
189
+ eligible_models_1 = models_for_word_dict[word_1]
190
+
191
+ with col2:
192
+ time_slice_1 = st.selectbox("Time slice word 1", options = eligible_models_1)
193
 
194
+
195
+ with st.container():
196
+ with col3:
197
+ word_2 = st.multiselect("Enter a word", placeholder="μήτηρ", max_selections=1, options=all_models_words)
198
+ if len(word_2) > 0:
199
+ word_2 = word_2[0]
200
+ eligible_models_2 = models_for_word_dict[word_2]
201
+
202
+ with col4:
203
+ time_slice_2 = st.selectbox("Time slice word 2", eligible_models_2)
204
+
205
+ # Create button for calculating cosine similarity
206
+ cosine_similarity_button = st.button("Calculate cosine similarity")
207
+
208
+ # If the button is clicked, execute calculation
209
+ if cosine_similarity_button:
210
+ cosine_simularity_score = get_cosine_similarity(word_1, time_slice_1, word_2, time_slice_2)
211
+ st.write(cosine_simularity_score)
212
+
213
+ # 3D graph tab
214
+ elif active_tab == "3D graph":
215
+ st.markdown("## 3D graph")
216
+ st.markdown('Here you can generate a 3D representation of the semantic space surrounding a target lemma. Please choose the lemma and the time slice.')
217
+
218
+ col1, col2 = st.columns(2)
219
+
220
+ # Load compressed word list
221
+ all_models_words = load_all_models_words()
222
+
223
  with st.container():
224
+ eligible_models = []
225
+ with col1:
226
+ word = st.multiselect("Enter a word", all_models_words, max_selections=1)
227
+ if len(word) > 0:
228
+ word = word[0]
229
+ eligible_models = models_for_word_dict[word]
230
 
231
+ with col2:
232
+ time_slice = st.selectbox("Time slice", eligible_models)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
+ n = st.slider("Number of words", 1, 50, 15)
235
 
236
+ graph_button = st.button("Create 3D graph")
 
 
 
 
 
 
 
 
 
237
 
238
+ if graph_button:
239
+ time_slice_model = convert_time_name_to_model(time_slice)
240
+ nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
241
+
242
+ fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
243
+
244
+ st.plotly_chart(fig)
245
+
246
+
247
+
248
 
249
+
250
+ # Dictionary tab
251
+ elif active_tab == "Dictionary":
252
+
253
+ with st.container():
254
+ st.markdown('## Dictionary')
255
+ st.markdown('Search a word in the Liddell-Scott-Jones dictionary (only Greek, no whitespaces).')
 
256
 
257
 
258
+ all_lemmas = load_all_lemmas()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
+ # query_word = st.multiselect("Search a word in the LSJ dictionary", all_lemmas, max_selections=1)
 
 
 
 
 
 
261
 
262
+ query_tag = st_tags(label='',
263
+ text = '',
264
+ value = [],
265
+ suggestions = all_lemmas,
266
+ maxtags = 1,
267
+ key = '1'
268
+ )
269
 
270
+ # If a word has been selected by user
271
+ if query_tag:
272
+ st.write(f"### {query_tag[0]}")
273
+
274
+ # Display word information
275
+ if query_tag[0] in lemma_dict:
276
+ data = lemma_dict[query_tag[0]]
277
+ elif query_tag[0].capitalize() in lemma_dict: # Some words are capitalized in the dictionary
278
+ data = lemma_dict[query_tag[0].capitalize()]
279
+ else:
280
+ st.error("Word not found in dictionary")
281
+
282
+ # Put text in readable format
283
+ text = format_text(data)
284
+
285
+
286
+ st.markdown(format_text(data), unsafe_allow_html = True)
287
+
288
+
289
+
290
+ st.markdown("""
291
+ <style>
292
+ .tab {
293
+ display: inline-block;
294
+ margin-left: 4em;
295
+ }
296
+ .tr {
297
+ font-weight: bold;
298
+ }
299
+ .list-class {
300
+ list-style-type: none;
301
+ margin-top: 1em;
302
+ }
303
+ .primary-indicator {
304
+ font-weight: bold;
305
+ font-size: x-large;
306
+ }
307
+ .secondary-indicator {
308
+ font-weight: bold;
309
+ font-size: large;
310
+ }
311
+ .tertiary-indicator {
312
+ font-weight: bold;
313
+ font-size: medium;
314
+ }
315
+ .quaternary-indicator {
316
+ font-weight: bold;
317
+ font-size: medium;
318
+ }
319
+ .primary-class {
320
+ padding-left: 2em;
321
+ }
322
+ .secondary-class {
323
+ padding-left: 4em;
324
+ }
325
+ .tertiary-class {
326
+ padding-left: 6em;
327
+ }
328
+ .quaternary-class {
329
+ padding-left: 8em;
330
+ }
331
+ </style>
332
+ """, unsafe_allow_html=True)
333
+
334
+
335
+ # About tab
336
+ elif active_tab == "About":
337
+ st.markdown("""
338
+ ## About
339
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus nec nunc ultricies ultricies.
340
+ """)
341
+
342
+
343
+ elif active_tab == "FAQ":
344
+ st.markdown("""
345
+ ## FAQ
346
+ """)
347
+
348
+ with st.expander('''**Which models is this interface based on?**'''):
349
+ st.write(
350
+ "This interface is based on five language models. \
351
+ Language models are statistical models of language, \
352
+ which store statistical information about word co-occurrence during the training phase. \
353
+ During training they process a corpus of texts in the target language(s). \
354
+ Once trained, models can be used to extract information about the language \
355
+ (in this interface, we focus on the extraction of semantic information) or to perform specific linguistic tasks. \
356
+ The models on which this interface is based are Word Embedding models."
357
+ )
358
 
359
+ with st.expander('''**Which corpus was used to train the models?**'''):
360
+ st.write(
361
+ "The five models on which this interface is based were trained on five slices of the Diorisis Ancient Greek Corpus (Vatri & McGillivray 2018)."
362
+ )
363
 
364
 
365
+ if selected == "About":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  st.markdown("""
367
  ## About
368
  Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus nec nunc ultricies ultricies.
369
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
 
 
 
372
 
373
+ streamlit_style = """
374
+ <style>
375
+ html, body {
376
+ font-family: 'Helvetica';
377
+ }
378
+ </style>
379
+ """
380
+
381
+ st.markdown(streamlit_style, unsafe_allow_html=True)
images/AGALMA_logo.png ADDED