---
license: mit
language:
- ru
tags:
- natural-language-processing
- dh
- word2vec
---
+
11
+ The model is built on Leo Tolstoy's [collected works](https://github.com/tolstoydigital/TEI) and represents his individual semantics
12
+
13
+ ## Preparation
14
+
15
+ All texts are converted from the TEI markup, splitted into sentences and lemmatized. Only modern orthography left in the data.
16
+
17
+ ```python
18
+ import html
19
+ import os
20
+ import re
21
+ import shutil
22
+ from bs4 import BeautifulSoup
23
+
24
+ !pip install razdel # for splitting
25
+
26
+ from razdel import sentenize
27
+ from tqdm import tqdm
28
+
29
+ !git clone https://github.com/tolstoydigital/TEI.git
30
+
31
+ relevant_dirs = ['diaries', 'letters', 'notes', 'works']
32
+
33
+ path = 'TEI/reference/bibllist_works.xml' # allows to work with fiction and non fiction separately
34
+ xml = open(path).read()
35
+ soup = BeautifulSoup(xml, features="xml")
36
+
37
+ group_texts = {}
38
+ for it in soup.find_all("item"):
39
+ ref = it.find("ref")
40
+ for related in it.find_all("relatedItem"):
41
+ for ref_ana in related.find_all("ref"):
42
+ group_texts[ref_ana.text] = ref.text
43
+
44
+ prefix_texts = 'extracted_texts'
45
+ os.mkdir(prefix_texts)
46
+
47
+ if os.path.exists(prefix_texts):
48
+ shutil.rmtree(prefix_texts)
49
+ os.mkdir(prefix_texts)
50
+
51
+ # extract texts from XML
52
+
53
+ complex_texts = {}
54
+ for rel_dir in relevant_dirs:
55
+ path = os.path.join('TEI/texts', rel_dir)
56
+ for file in tqdm(sorted(os.listdir(path))):
57
+ fiction = 0
58
+ if not file.endswith('.xml'):
59
+ continue
60
+ xml = open(os.path.join(path, file)).read()
61
+ if 'Печатные варианты' in xml:
62
+ continue
63
+ nameID = file.replace('.xml', '')
64
+ soup = BeautifulSoup(xml, features="xml")
65
+ if soup.find("catRef", {"ana":"#fiction"}):
66
+ fiction = 1
67
+ s = soup.find("body")
68
+ paragraphs = []
69
+ for erase in s.find_all(["orig", "comments", "sic", "note"]):
70
+ erase.decompose()
71
+ for p in s.find_all(["p", "l"]):
72
+ paragraphs.append(html.unescape(p.text.replace('\n', ' ').strip()))
73
+ if not fiction:
74
+ with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f:
75
+ for par in paragraphs:
76
+ par = re.sub(' ([.,;:!?)"»])', '\\1', par)
77
+ par = par.replace('\n', ' ')
78
+ par = par.strip()
79
+ par = re.sub('\s+', ' ', par)
80
+ par = re.sub('\[.+?\]', '', par)
81
+ for sent in sentenize(par):
82
+ f.write(list(sent)[2].strip() + '\n')
83
+ else:
84
+ if nameID in group_texts:
85
+ hyper_name = group_texts[nameID]
86
+ if hyper_name not in complex_texts:
87
+ complex_texts[hyper_name] = paragraphs
88
+ else:
89
+ complex_texts[hyper_name].extend(paragraphs)
90
+ else:
91
+ with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f:
92
+ f.write('\n'.join(paragraphs))
93
+ for hyper_name in complex_texts:
94
+ with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f:
95
+ f.write('\n'.join(complex_texts[hyper_name]))
96
+
97
+ # tagging
98
+
99
+ from pymystem3 import Mystem
100
+
101
+ pos = ['S', 'V', 'A', 'ADV']
102
+
103
+ def tagging():
104
+ m = Mystem()
105
+ for fl in os.listdir(prefix_texts):
106
+ #print(fl)
107
+ if 'mystem' in fl:
108
+ continue
109
+ with open(os.path.join(prefix_texts, fl)) as f:
110
+ text = f.read()
111
+ lines = text.split('\n')
112
+ ana_lines = []
113
+ for line in lines:
114
+ line = ' '.join(line.split()[1:])
115
+ line = line.replace('ò', 'о')
116
+ line = line.replace('è', 'е')
117
+ line = line.replace('à', 'а')
118
+ line = line.replace('ѝ', 'и')
119
+ line = line.replace('ỳ', 'у')
120
+ line = line.replace('о̀', 'о')
121
+ #line = line.replace('Изд.̀', 'издательство')
122
+ ana = []
123
+ info = m.analyze(line)
124
+ for token in info:
125
+ if "analysis" in token:
126
+ try:
127
+ analysis = token["analysis"][0]
128
+ except:
129
+ #print(token)
130
+ continue
131
+ # if "lex" in analysis:
132
+ lex = analysis["lex"]
133
+ #if 'gr' in analysis:
134
+ gr = analysis['gr']
135
+ #print(gr)
136
+ const = gr.split('=')[0]
137
+ if ',' in const:
138
+ pos = const.split(',')[0]
139
+ else:
140
+ pos = const
141
+
142
+ ana.append('{}_{}'.format(lex, pos))
143
+ ln = ' '.join(ana)
144
+ if re.search('[А-Яа-я]', ln):
145
+ ana_lines.append(ln)
146
+ with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw:
147
+ fw.write('\n'.join(ana_lines))
148
+
149
+ def mk_input():
150
+ inp = []
151
+ for fl in os.listdir(prefix_texts):
152
+ if not 'mystem' in fl:
153
+ continue
154
+ #print(fl)
155
+ with open(os.path.join(prefix_texts, fl)) as f:
156
+ text = f.read()
157
+ lines = text.split('\n')
158
+ for line in lines:
159
+ words = []
160
+ for w in line.split():
161
+ word = w.split('_')
162
+ if word[1] in pos:
163
+ words.append(w)
164
+ if len(words) > 1:
165
+ inp.append(' '.join(words))
166
+
167
+ with open('input.txt', 'w') as fw:
168
+ fw.write('\n'.join(inp))
169
+
170
+ tagging()
171
+ mk_input()
172
+
173
+ ```
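
Each line of the resulting `input.txt` holds one sentence as `lemma_POS` tokens, restricted to the four parts of speech listed above. A minimal sketch to inspect the first few lines of the output:

```python
# peek at the first three prepared sentences
with open('input.txt') as f:
    for _ in range(3):
        print(f.readline().strip())
```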

## Models

There are two models in the repository. Their parameters are taken from the general-language models on the [RusVectōrēs](https://rusvectores.org) site, so that the results are comparable with those models.

Here is the code for building the models:

```python
import logging

import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

pth = './input.txt'
data = gensim.models.word2vec.LineSentence(pth)  # iterate over the corpus sentence by sentence

# comparable with web_mystem_skipgram_500_2_2015.bin
modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1)
modelLNT1.save('skipgram_500_2.model')

# comparable with ruwikiruscorpora_upos_cbow_300_10_2021
modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0)
modelLNT2.save('cbow_300_10.model')
```
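
A quick sanity check on the trained models (a minimal sketch; the exact vocabulary size depends on the corpus version):

```python
# report vocabulary size and vector dimensionality for both models
for name, model in [('skipgram_500_2', modelLNT1), ('cbow_300_10', modelLNT2)]:
    print(name, len(model.wv), model.wv.vector_size)
```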

## Usage

```python
from gensim.models import Word2Vec

# load the models
modelLNT1 = Word2Vec.load("skipgram_500_2.model")
modelLNT2 = Word2Vec.load("cbow_300_10.model")

# visualization of the most similar words

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names):  # adapted code
    """Plot in seaborn the results of the t-SNE dimensionality reduction
    of the vectors of a query word, its most similar words,
    and the words from list_names.
    """
    arrays = np.empty((0, model.wv.vector_size), dtype='f')
    word_labels = [word]
    color_list = ['red']

    # add the vector of the query word
    arrays = np.append(arrays, model.wv[[word]], axis=0)

    # get the list of the most similar words
    close_words = model.wv.most_similar([word])

    # add the vector of each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv[[wrd_score[0]]]
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # add the vector of each word from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv[[wrd]]
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # reduce the dimensionality to 20 dimensions with PCA
    reduc = PCA(n_components=20).fit_transform(arrays)

    # find t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)

    # set everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']})

    # add the annotations one by one
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal').set_size(15)

    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    plt.title('t-SNE visualization for {}'.format(word.title()))

# plot the nearest neighbours of 'бог_S' ("god") in blue
# and its most distant words in green
tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["бог_S"])])
```

![](./god.png)
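
The nearest neighbours can also be queried directly, without plotting. A minimal sketch (query words follow the same `lemma_POS` convention as the training data):

```python
# ten nearest neighbours of 'бог_S' ("god") with cosine similarities
for word, score in modelLNT2.wv.most_similar('бог_S', topn=10):
    print('{}\t{:.3f}'.format(word, score))
```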

## Train data

The training corpus is included in this repository as the `input.txt` file. It contains more than 7 million words.
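
A minimal sketch to verify the corpus size, assuming `input.txt` is in the working directory:

```python
# count space-separated tokens in the training corpus
with open('input.txt') as f:
    print(sum(len(line.split()) for line in f))  # expected: over 7 million
```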

## Publication

Орехов Б. В. [Индивидуальная семантика Л. Н. Толстого в свете векторных моделей](https://human.spbstu.ru/article/2023.54.09/) // Terra Linguistica. 2023. Т. 14. № 4. С. 119–129. DOI: 10.18721/JHSS.14409

```bibtex
@article{орехов2023индивидуальная,
  title={Индивидуальная семантика Л. Н. Толстого в свете векторных моделей},
  author={Орехов, Б.В.},
  journal={Terra Linguistica},
  volume={14},
  number={4},
  pages={119--129},
  year={2023},
  doi={10.18721/JHSS.14409}
}
```