Spaces:
Running
Running
first commit
Browse files- .idea/.gitignore +8 -0
- Home.py +43 -0
- pages/2_Most Similar Terms.py +46 -0
- pages/3_Semantic Change.py +81 -0
- pages/4_Word Similarity.py +112 -0
- requirements.txt +9 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
4 |
+
# Editor-based HTTP Client requests
|
5 |
+
/httpRequests/
|
6 |
+
# Datasource local storage ignored files
|
7 |
+
/dataSources/
|
8 |
+
/dataSources.local.xml
|
Home.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import s3fs
from gensim.models import KeyedVectors

st.title('Historical Word Embeddings')

st.write("Welcome!")
st.write("This is an interactive web app that allows users to explore how the meaning of words change over time. Use the sidebar on the left to navigate.")
st.write("Please note: The app is still under development and things might not always work properly.")
st.write("Creator: Simon Walo")
st.write("Data source: https://nlp.stanford.edu/projects/histwords/ (All English (1800s-1990s))")
st.write("Please wait while the data is loading:")

# S3 connection object; anon=False means authenticated access via the
# configured credentials. The ls() call touches the bucket up front
# (presumably to verify access — confirm whether the listing is needed).
fs = s3fs.S3FileSystem(anon=False)
fs.ls('bricktamlandstreamlitbucket')


def read_file(filename):
    """Return the raw bytes of *filename* read through the S3 filesystem."""
    with fs.open(filename) as f:
        return f.read()


@st.experimental_memo
def load_data():
    """Load one gensim KeyedVectors model per decade (1810–1990, step 30).

    Returns a dict mapping decade year -> KeyedVectors, memoized by
    Streamlit so the .kv files are only read once per process.
    """
    return {
        year: KeyedVectors.load(f'./data/vectors{year}.kv')
        for year in range(1810, 2000, 30)
    }


# Stash the models in the session so every page of the app can reuse them
# without reloading.
if 'models_all' not in st.session_state:
    st.session_state['models_all'] = load_data()

st.write("Data loaded!")
|
pages/2_Most Similar Terms.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import numpy as np
import pandas as pd

st.subheader('Most similar terms')

keyword = st.text_input("Input term", "gay", key="simkey")
keyword = keyword.lower()


def similarterms():
    """Render a table of the nearest neighbours of `keyword`, one row per decade.

    Reads the per-decade KeyedVectors models from st.session_state['models_all']
    (populated by Home.py). Decades in which the keyword is absent are reported
    and skipped.
    """
    if keyword not in st.session_state['models_all'][1810]:
        st.write('Keyword not found in data. Please check for spelling errors.')
        return

    years = []
    simterms = []

    for year, model in st.session_state['models_all'].items():
        # HistWords stores an all-zero vector for words absent in a decade
        # (the original code compared against the 1810 'biology' vector as a
        # zero-vector reference; an explicit any() check is more robust).
        if not model[keyword].any():
            st.write('Keyword not available for ', year)
        else:
            years.append(year)
            # Join the neighbour terms of this decade directly into the row
            # string. (The original flattened all decades into one list and
            # re-split it by the TOTAL number of decades, which misaligned
            # rows whenever a decade was skipped.)
            simterms.append(', '.join(term for term, _score in model.most_similar(keyword)))

    if not years:
        # Keyword unavailable in every decade; per-year messages already shown.
        return

    simtermstable = pd.DataFrame(zip(years, simterms))
    simtermstable.columns = ["year", "terms"]
    st.table(simtermstable)


similarterms()
|
pages/3_Semantic Change.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from adjustText import adjust_text

st.subheader('Semantic Change')

keyword = st.text_input("Input term", "gay", key="semkey")
keyword = keyword.lower()


def semchange(keyword):
    """Plot the trajectory of `keyword` through embedding space over time.

    The keyword's vectors from every other decade (1810, 1870, 1930, 1990),
    together with its top neighbours from those decades, are projected onto a
    2-D PCA plane fitted on the neighbours' 1990 vectors; arrows connect the
    keyword's successive positions.
    """
    if keyword not in st.session_state['models_all'][1810]:
        st.write('Keyword not found in data. Please check for spelling errors.')
        return

    # Plot only every other decade to keep the figure readable.
    plot_years = range(1810, 2000, 60)

    # Collect the top neighbours per plotted decade, plus the keyword's own
    # vector and the year for each decade in which it exists.
    sim_words = []
    keyword_rows = []
    available_years = []
    for year, model in st.session_state['models_all'].items():
        if year in plot_years:
            # HistWords stores an all-zero vector for words absent in a decade
            # (the original compared against the 1810 'biology' vector as a
            # zero-vector reference; an explicit any() check is more robust).
            if not model[keyword].any():
                st.write('Keyword not available for ', year)
            else:
                for term, _score in model.most_similar(keyword, topn=7):
                    sim_words.append(term)
                keyword_rows.append(model[keyword])
                available_years.append(year)

    sim_words = list(set(sim_words))

    if not sim_words:
        # Keyword unavailable in every plotted decade; nothing to draw
        # (the original crashed on the empty vector array here).
        return

    # Vectors of the neighbour words in the most recent embedding (1990).
    sim_vectors1990 = np.array([st.session_state['models_all'][1990][w] for w in sim_words])

    # Keyword vectors from all available decades. Built from the collected
    # rows so the embedding dimension is taken from the data rather than
    # hard-coded to 300 as before.
    keyword_vectors = np.array(keyword_rows)

    # Keyword rows go LAST so the arrow loop below can index from the end.
    allvectors = np.append(sim_vectors1990, keyword_vectors, axis=0)

    # "Train" the PCA projection on the neighbour words only, then project
    # neighbours and keyword positions into the same plane.
    pca = PCA(n_components=2)
    pca.fit(sim_vectors1990)
    two_dim = pca.transform(allvectors)

    # One label per point: neighbour words first, then "keyword<year>".
    # (The original aliased `labels = sim_words` and mutated both at once;
    # this builds the full label list explicitly.)
    labels = sim_words + [keyword + str(year) for year in available_years]

    # plot results
    fig, ax = plt.subplots()
    ax.scatter(two_dim[:, 0], two_dim[:, 1])

    texts = [ax.text(x=two_dim[i, 0], y=two_dim[i, 1], s=labels[i]) for i in range(len(labels))]
    adjust_text(texts)

    # Arrows between successive keyword positions (the last rows of two_dim),
    # pointing from each decade to the next.
    for i in range(-len(keyword_vectors), -1):
        ax.arrow(two_dim[i, 0], two_dim[i, 1],
                 two_dim[i + 1, 0] - two_dim[i, 0], two_dim[i + 1, 1] - two_dim[i, 1],
                 head_width=0.03, length_includes_head=True)

    st.pyplot(fig)
    fig.clear()
    plt.close(fig)


semchange(keyword)
|
pages/4_Word Similarity.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

st.subheader('Word Similarity')

col1, col2 = st.columns(2)

with col1:
    keyword1 = st.text_input("Input term A1", "work", key="word1")
    keyword1 = keyword1.lower()

    keyword3 = st.text_input("Input term B1", "test", key="word3")
    keyword3 = keyword3.lower()

with col2:
    keyword2 = st.text_input("Input term A2", "hard", key="word2")
    keyword2 = keyword2.lower()

    keyword4 = st.text_input("Input term B2", "hello", key="word4")
    keyword4 = keyword4.lower()


def _similarity_series(kw_a, kw_b):
    """Collect {"year", "similarity"} rows for one word pair, per decade.

    Skips decades where either word has the all-zero "absent" vector and
    reports the missing word for that year. (The original second loop checked
    keyword1/keyword2's vectors while reporting keyword3/keyword4 — a
    copy-paste bug this helper removes by handling both pairs identically.)
    """
    rows = []
    for year, model in st.session_state['models_all'].items():
        if year in range(1810, 2000, 30):
            missing = False
            # HistWords stores an all-zero vector for words absent in a decade.
            if not model[kw_a].any():
                st.write('Keyword ', kw_a, ' not available for ', year)
                missing = True
            if not model[kw_b].any():
                st.write('Keyword ', kw_b, ' not available for ', year)
                missing = True
            # Only keep the decade if BOTH words exist (the original's
            # if/if/else chain appended a point even when the first word
            # of the pair was unavailable).
            if not missing:
                rows.append({
                    "year": year,
                    "similarity": model.n_similarity([kw_a], [kw_b]),
                })
    return rows


def distchange(keyword1, keyword2):
    """Plot how the cosine similarity of pair A (keyword1/keyword2) and pair B
    (keyword3/keyword4, read from the module-level inputs) evolves over time.
    """
    # Validate all four inputs against the vocabulary before any model work.
    for kw, label in ((keyword1, 'A1'), (keyword2, 'A2'),
                      (keyword3, 'B1'), (keyword4, 'B2')):
        if kw not in st.session_state['models_all'][1810]:
            st.write('Input term ' + label + ' not found in data. Please check for spelling errors.')
            return

    d1 = _similarity_series(keyword1, keyword2)
    d2 = _similarity_series(keyword3, keyword4)

    x1 = [row["year"] for row in d1]
    y1 = [row["similarity"] for row in d1]
    x2 = [row["year"] for row in d2]
    y2 = [row["similarity"] for row in d2]

    # Cubic interpolation needs at least 4 points per series.
    if len(x1) < 4 or len(x2) < 4:
        st.write('Not enough data points. Please try other keywords.')
        return

    fun1 = interp1d(x1, y1, kind='cubic')
    fun2 = interp1d(x2, y2, kind='cubic')

    # Smooth curves from the first available decade through 1990.
    x1new = np.linspace(x1[0], 1990, 100)
    x2new = np.linspace(x2[0], 1990, 100)

    fig, ax = plt.subplots()
    ax.plot(x1new, fun1(x1new), '-', label=(keyword1, keyword2))
    ax.plot(x1, y1, 'o')
    ax.plot(x2new, fun2(x2new), '-', label=(keyword3, keyword4))
    ax.plot(x2, y2, 'o')
    ax.legend()
    ax.set_xticks(range(1810, 2000, 30))

    # show plot
    plt.xlabel("Year")
    plt.ylabel("Cosine Similarity")
    st.pyplot(fig)
    fig.clear()
    plt.close(fig)


distchange(keyword1, keyword2)
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit~=1.13.0
|
2 |
+
gensim~=4.1.2
|
3 |
+
pandas~=1.4.2
|
4 |
+
matplotlib~=3.5.1
|
5 |
+
numpy~=1.22.3
|
6 |
+
scikit-learn~=1.0.2
|
7 |
+
adjusttext~=0.7.3
|
8 |
+
s3fs~=2022.5.0
|
9 |
+
scipy~=1.8.1
|