---
license: mit
language:
- ru
tags:
- natural-language-processing
- dh
- word2vec
---

The model is built on Leo Tolstoy's [collected works](https://github.com/tolstoydigital/TEI) and represents his individual semantics.

## Preparation

All texts are converted from the TEI markup, split into sentences and lemmatized. Only the modern orthography is kept in the data.

```python
import html
import os
import re
import shutil

from bs4 import BeautifulSoup

!pip install razdel  # for sentence splitting

from razdel import sentenize
from tqdm import tqdm

!git clone https://github.com/tolstoydigital/TEI.git

relevant_dirs = ['diaries', 'letters', 'notes', 'works']

# the bibliography file lets us treat fiction and non-fiction separately
path = 'TEI/reference/bibllist_works.xml'
xml = open(path).read()
soup = BeautifulSoup(xml, features="xml")

# map every part of a multi-part work to the identifier of the whole work
group_texts = {}
for it in soup.find_all("item"):
    ref = it.find("ref")
    for related in it.find_all("relatedItem"):
        for ref_ana in related.find_all("ref"):
            group_texts[ref_ana.text] = ref.text

prefix_texts = 'extracted_texts'
if os.path.exists(prefix_texts):
    shutil.rmtree(prefix_texts)
os.mkdir(prefix_texts)

# extract plain text from the XML files

complex_texts = {}
for rel_dir in relevant_dirs:
    path = os.path.join('TEI/texts', rel_dir)
    for file in tqdm(sorted(os.listdir(path))):
        fiction = 0
        if not file.endswith('.xml'):
            continue
        xml = open(os.path.join(path, file)).read()
        if 'Печатные варианты' in xml:  # skip files with printed variants
            continue
        nameID = file.replace('.xml', '')
        soup = BeautifulSoup(xml, features="xml")
        if soup.find("catRef", {"ana": "#fiction"}):
            fiction = 1
        s = soup.find("body")
        paragraphs = []
        for erase in s.find_all(["orig", "comments", "sic", "note"]):
            erase.decompose()
        for p in s.find_all(["p", "l"]):
            paragraphs.append(html.unescape(p.text.replace('\n', ' ').strip()))
        if not fiction:
            # non-fiction goes into one file per genre, one sentence per line
            with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f:
                for par in paragraphs:
                    par = re.sub(r' ([.,;:!?)"»])', r'\1', par)
                    par = par.replace('\n', ' ').strip()
                    par = re.sub(r'\s+', ' ', par)
                    par = re.sub(r'\[.+?\]', '', par)
                    for sent in sentenize(par):
                        f.write(sent.text.strip() + '\n')
        else:
            # fiction is grouped into one file per work
            if nameID in group_texts:
                hyper_name = group_texts[nameID]
                if hyper_name not in complex_texts:
                    complex_texts[hyper_name] = paragraphs
                else:
                    complex_texts[hyper_name].extend(paragraphs)
            else:
                with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f:
                    f.write('\n'.join(paragraphs))
for hyper_name in complex_texts:
    with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f:
        f.write('\n'.join(complex_texts[hyper_name]))

# lemmatization and POS tagging with Mystem

from pymystem3 import Mystem

pos = ['S', 'V', 'A', 'ADV']  # keep only content words

def tagging():
    m = Mystem()
    for fl in os.listdir(prefix_texts):
        if 'mystem' in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        ana_lines = []
        for line in lines:
            line = ' '.join(line.split()[1:])  # drop the first token of the line
            # normalize accented vowels
            for src, dst in [('ò', 'о'), ('è', 'е'), ('à', 'а'), ('ѝ', 'и'), ('ỳ', 'у'), ('о̀', 'о')]:
                line = line.replace(src, dst)
            ana = []
            info = m.analyze(line)
            for token in info:
                if "analysis" in token:
                    try:
                        analysis = token["analysis"][0]
                    except IndexError:
                        continue
                    lex = analysis["lex"]
                    gr = analysis['gr']
                    const = gr.split('=')[0]
                    if ',' in const:
                        pos_tag = const.split(',')[0]
                    else:
                        pos_tag = const
                    ana.append('{}_{}'.format(lex, pos_tag))
            ln = ' '.join(ana)
            if re.search('[А-Яа-я]', ln):
                ana_lines.append(ln)
        with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw:
            fw.write('\n'.join(ana_lines))

def mk_input():
    # keep only sentences with more than one content word
    inp = []
    for fl in os.listdir(prefix_texts):
        if 'mystem' not in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        for line in lines:
            words = []
            for w in line.split():
                word = w.split('_')
                if word[1] in pos:
                    words.append(w)
            if len(words) > 1:
                inp.append(' '.join(words))

    with open('input.txt', 'w') as fw:
        fw.write('\n'.join(inp))

tagging()
mk_input()
```
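
Each line of the resulting `input.txt` is one lemmatized sentence reduced to `lemma_POS` tokens of the four parts of speech listed above. A quick way to inspect the output (the sample line in the comment is illustrative, not an actual corpus line):

```python
# print the first few lines of the training file produced by mk_input()
with open('input.txt') as f:
    for _ in range(3):
        print(f.readline().strip())

# a line looks roughly like this (illustrative example):
# жизнь_S быть_V хороший_A
```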

## Models

There are two models in the repository. Their parameters are taken from the general-language models of the RusVectōrēs site, so that the results are comparable with them.

Here is the code for building the models:
|
182 |
+
import sys
|
183 |
+
import logging
|
184 |
+
import gensim
|
185 |
+
|
186 |
+
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
|
187 |
+
|
188 |
+
pth = './input.txt'
|
189 |
+
data = gensim.models.word2vec.LineSentence(pth) # train sentence by sentence
|
190 |
+
|
191 |
+
modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1) # comparable with web_mystem_skipgram_500_2_2015.bin
|
192 |
+
|
193 |
+
modelLNT1.save('skipgram_500_2.model') # saving
|
194 |
+
|
195 |
+
modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0) # comparable with ruwikiruscorpora_upos_cbow_300_10_2021
|
196 |
+
|
197 |
+
modelLNT2.save('cbow_300_10.model')
|
198 |
+
```
|

## Usage

```python
from gensim.models import Word2Vec

# load the models
modelLNT1 = Word2Vec.load("skipgram_500_2.model")
modelLNT2 = Word2Vec.load("cbow_300_10.model")

# visualization of the most similar words

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names):  # adapted code
    """Plot in seaborn the t-SNE dimensionality reduction of the vectors of a query word,
    its most similar words, and a list of additional words.
    """
    arrays = np.empty((0, model.vector_size), dtype='f')
    word_labels = [word]
    color_list = ['red']

    # add the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)

    # get the list of most similar words
    close_words = model.wv.most_similar([word])

    # add the vector of each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # add the vector of each word from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # reduce the dimensionality to 20 dimensions with PCA
    reduc = PCA(n_components=20).fit_transform(arrays)

    # find t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)

    # set everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                  }
                     )

    # add annotations one by one with a loop
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal'
                ).set_size(15)

    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    plt.title('t-SNE visualization for {}'.format(word.title()))

tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["бог_S"])])
```

![](./god.png)
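
Beyond the visualization, the models can be queried directly. A minimal sketch of nearest-neighbour and similarity queries (query words follow the `lemma_POS` format used above; `вера_S` is only an illustrative example and must occur in the corpus for the call to work):

```python
from gensim.models import Word2Vec

model = Word2Vec.load("cbow_300_10.model")

# ten nearest neighbours of a query word in Tolstoy's semantic space
for word, score in model.wv.most_similar('бог_S', topn=10):
    print(f'{word}\t{score:.3f}')

# cosine similarity between two lemmas
print(model.wv.similarity('бог_S', 'вера_S'))
```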

## Train data

The training corpus is included in this repository as the `input.txt` file. It contains more than 7 million words.
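
A quick sanity check of the corpus size, as a sketch (counts the whitespace-separated `lemma_POS` tokens in `input.txt`):

```python
# count the lemma_POS tokens in the training corpus
with open('input.txt') as f:
    n_tokens = sum(len(line.split()) for line in f)
print(n_tokens)  # expected to be above 7,000,000
```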

## Publication

Орехов Б. В. [Индивидуальная семантика Л. Н. Толстого в свете векторных моделей](https://human.spbstu.ru/article/2023.54.09/) [Individual semantics of Leo Tolstoy in the light of vector models] // Terra Linguistica. 2023. Vol. 14. No. 4. P. 119–129. DOI: 10.18721/JHSS.14409

```bibtex
@article{орехов2023индивидуальная,
  title={Индивидуальная семантика Л. Н. Толстого в свете векторных моделей},
  author={Орехов, Б.В.},
  journal={Terra Linguistica},
  volume={14},
  number={4},
  pages={119--129},
  year={2023},
  doi={10.18721/JHSS.14409}
}
```