simonwalo committed on
Commit
661241c
1 Parent(s): 030e09c

first commit

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
Home.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import s3fs
from gensim.models import KeyedVectors

st.title('Historical Word Embeddings')

st.write("Welcome!")
st.write("This is an interactive web app that allows users to explore how the meaning of words change over time. Use the sidebar on the left to navigate.")
st.write("Please note: The app is still under development and things might not always work properly.")
st.write("Creator: Simon Walo")
st.write("Data source: https://nlp.stanford.edu/projects/histwords/ (All English (1800s-1990s))")
st.write("Please wait while the data is loading:")

# Create connection object.
# `anon=False` means not anonymous, i.e. it uses access keys to pull data.
fs = s3fs.S3FileSystem(anon=False)
# NOTE(review): the listing result is discarded — presumably a connectivity /
# credentials check against the bucket; confirm and consider logging failures.
fs.ls('bricktamlandstreamlitbucket')


def read_file(filename):
    """Return the raw bytes of *filename* read via the S3 filesystem `fs`."""
    with fs.open(filename) as f:
        return f.read()


@st.experimental_memo
def load_data():
    """Load one KeyedVectors model per decade, keyed by start year.

    Years covered: 1810, 1840, ..., 1990 (step 30). Cached by Streamlit so
    the slow disk loads run only once per session.
    """
    return {
        year: KeyedVectors.load(f'./data/vectors{year}.kv')
        for year in range(1810, 2000, 30)
    }


# Load once and share via session_state; the pages/ scripts read it from there.
if 'models_all' not in st.session_state:
    st.session_state['models_all'] = load_data()

st.write("Data loaded!")
pages/2_Most Similar Terms.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import numpy as np
import pandas as pd

st.subheader('Most similar terms')

keyword = st.text_input("Input term", "gay", key="simkey")
keyword = keyword.lower()


def similarterms():
    """Render a table of the terms most similar to `keyword`, one row per decade.

    Reads the per-decade KeyedVectors models from st.session_state['models_all'].
    Writes a warning (and skips the row) for decades where the keyword is absent.
    """
    models_all = st.session_state['models_all']

    if keyword not in models_all[1810]:
        st.write('Keyword not found in data. Please check for spelling errors.')
        return

    # NOTE(review): "missing word" heuristic — a vector whose `.all()` matches
    # the reference word's is treated as a placeholder; confirm this invariant.
    placeholder = models_all[1810]['biology'].all()

    years = []
    terms_per_year = []
    for year, model in models_all.items():
        if model[keyword].all() == placeholder:
            st.write('Keyword not available for ', year)
        else:
            years.append(year)
            # Join this year's own top-10 neighbours directly.
            # (Bug fix: the old code flattened all years' terms and
            # np.array_split into len(models_all) chunks — when any year was
            # skipped the chunks no longer lined up with `years`, so rows
            # showed the wrong terms.)
            terms_per_year.append(', '.join(term for term, _ in model.most_similar(keyword)))

    simtermstable = pd.DataFrame(zip(years, terms_per_year), columns=["year", "terms"])
    st.table(simtermstable)


similarterms()
pages/3_Semantic Change.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from adjustText import adjust_text

st.subheader('Semantic Change')

keyword = st.text_input("Input term", "gay", key="semkey")
keyword = keyword.lower()


def semchange(keyword):
    """Plot a 2-D PCA map showing how *keyword* moves among its neighbours.

    Uses every other decade (1810, 1870, 1930, 1990) to keep the plot readable,
    projects everything with a PCA fitted only on the neighbour words (so the
    background stays fixed while the keyword drifts), and draws arrows between
    the keyword's consecutive positions.
    """
    models_all = st.session_state['models_all']

    if keyword not in models_all[1810]:
        st.write('Keyword not found in data. Please check for spelling errors.')
        return

    # NOTE(review): "missing word" heuristic — a vector whose `.all()` matches
    # the reference word's is treated as a placeholder; confirm this invariant.
    placeholder = models_all[1810]['biology'].all()
    plot_years = range(1810, 2000, 60)  # every other decade

    # Collect each plotted period's top-7 neighbours of the keyword.
    sim_words = []
    for year, model in models_all.items():
        if year in plot_years:
            if model[keyword].all() == placeholder:
                st.write('Keyword not available for ', year)
            else:
                for term, _vector in model.most_similar(keyword, topn=7):
                    sim_words.append(term)

    sim_words = list(set(sim_words))

    # Vectors of the neighbour words in the most recent embedding (1990).
    sim_vectors1990 = np.array([models_all[1990][w] for w in sim_words])

    # Keyword vector in every plotted period where it is available.
    keyword_vectors = np.zeros(shape=(0, 300))
    for year, model in models_all.items():
        if year in plot_years and model[keyword].all() != placeholder:
            keyword_vectors = np.append(keyword_vectors, np.array([model[keyword]]), axis=0)

    # Keyword vectors are appended AFTER the neighbour vectors, so the last
    # len(keyword_vectors) rows of the projection belong to the keyword.
    allvectors = np.append(sim_vectors1990, keyword_vectors, axis=0)

    # "Train" the PCA on the neighbour words only, then project everything.
    pca = PCA(n_components=2)
    pca.fit(sim_vectors1990)
    two_dim = pca.transform(allvectors)

    # Labels: neighbour words followed by "<keyword><year>" markers.
    # NOTE(review): `labels` aliases `sim_words`, so these appends also grow
    # sim_words — the text loop below therefore labels the keyword points too.
    # Behaviour preserved as-is; confirm the aliasing is intentional.
    labels = sim_words
    for year, model in models_all.items():
        if year in plot_years and model[keyword].all() != placeholder:
            labels.append(keyword + str(year))

    # Scatter all points and attach non-overlapping labels.
    fig, ax = plt.subplots()
    ax.scatter(two_dim[:, 0], two_dim[:, 1])
    texts = [ax.text(x=two_dim[i, 0], y=two_dim[i, 1], s=labels[i]) for i in range(len(sim_words))]
    adjust_text(texts)

    # Arrows between consecutive keyword positions (iterating the tail rows
    # backwards: -2 -> -len(keyword_vectors)).
    for i in range(-2, -(len(keyword_vectors) + 1), -1):
        ax.arrow(two_dim[i, 0], two_dim[i, 1],
                 two_dim[i + 1, 0] - two_dim[i, 0], two_dim[i + 1, 1] - two_dim[i, 1],
                 head_width=0.03, length_includes_head=True)

    st.pyplot(fig)
    fig.clear()
    plt.close(fig)


semchange(keyword)
pages/4_Word Similarity.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

st.subheader('Word Similarity')

col1, col2 = st.columns(2)

with col1:
    keyword1 = st.text_input("Input term A1", "work", key="word1")
    keyword1 = keyword1.lower()

    keyword3 = st.text_input("Input term B1", "test", key="word3")
    keyword3 = keyword3.lower()

with col2:
    keyword2 = st.text_input("Input term A2", "hard", key="word2")
    keyword2 = keyword2.lower()

    keyword4 = st.text_input("Input term B2", "hello", key="word4")
    keyword4 = keyword4.lower()


def distchange(keyword1, keyword2):
    """Plot cosine similarity over time for the pairs (A1, A2) and (B1, B2).

    keyword3/keyword4 are read from the enclosing module scope; the signature
    is kept unchanged so the existing call site keeps working.
    """
    models_all = st.session_state['models_all']

    if keyword1 not in models_all[1810]:
        st.write('Input term A1 not found in data. Please check for spelling errors.')
        return
    if keyword2 not in models_all[1810]:
        st.write('Input term A2 not found in data. Please check for spelling errors.')
        return
    if keyword3 not in models_all[1810]:
        st.write('Input term B1 not found in data. Please check for spelling errors.')
        return
    if keyword4 not in models_all[1810]:
        st.write('Input term B2 not found in data. Please check for spelling errors.')
        return

    # NOTE(review): "missing word" heuristic — a vector whose `.all()` matches
    # the reference word's is treated as a placeholder; confirm this invariant.
    placeholder = models_all[1810]['biology'].all()

    def _pair_similarities(kw_a, kw_b):
        """Per-year similarity rows for one word pair; warns about gaps.

        Bug fixes vs. the original:
        - the B-pair loop tested the A-pair vectors (keyword1/keyword2) while
          reporting keyword3/keyword4 — each pair now checks its own words;
        - the append was guarded only by the second `if` (dangling else), so a
          year missing only the first word was still appended — a row is now
          added only when BOTH words are available.
        """
        rows = []
        for year, model in models_all.items():
            if year in range(1810, 2000, 30):
                missing = False
                if model[kw_a].all() == placeholder:
                    st.write('Keyword ', kw_a, ' not available for ', year)
                    missing = True
                if model[kw_b].all() == placeholder:
                    st.write('Keyword ', kw_b, ' not available for ', year)
                    missing = True
                if not missing:
                    rows.append(
                        {
                            "year": year,
                            "similarity": model.n_similarity([kw_a], [kw_b])
                        }
                    )
        return rows

    data1 = pd.DataFrame(_pair_similarities(keyword1, keyword2))
    data2 = pd.DataFrame(_pair_similarities(keyword3, keyword4))

    # Points for the trendlines.
    x1 = data1['year'].tolist()
    x2 = data2['year'].tolist()
    y1 = data1['similarity'].tolist()
    y2 = data2['similarity'].tolist()

    # Cubic interpolation needs at least 4 points per curve.
    if len(x1) < 4 or len(x2) < 4:
        st.write('Not enough data points. Please try other keywords.')
        return

    fun1 = interp1d(x1, y1, kind='cubic')
    fun2 = interp1d(x2, y2, kind='cubic')

    x1new = np.linspace(x1[0], 1990, 100)
    x2new = np.linspace(x2[0], 1990, 100)

    fig, ax = plt.subplots()
    ax.plot(x1new, fun1(x1new), '-', label=(keyword1, keyword2))
    ax.plot(x1, y1, 'o')
    ax.plot(x2new, fun2(x2new), '-', label=(keyword3, keyword4))
    ax.plot(x2, y2, 'o')
    ax.legend()
    ax.set_xticks(range(1810, 2000, 30))

    # Show plot.
    plt.xlabel("Year")
    plt.ylabel("Cosine Similarity")
    st.pyplot(fig)
    fig.clear()
    plt.close(fig)


distchange(keyword1, keyword2)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit~=1.13.0
2
+ gensim~=4.1.2
3
+ pandas~=1.4.2
4
+ matplotlib~=3.5.1
5
+ numpy~=1.22.3
6
+ scikit-learn~=1.0.2
7
+ adjusttext~=0.7.3
8
+ s3fs~=2022.5.0
9
+ scipy~=1.8.1