---
license: mit
language:
- ru
tags:
- natural-language-processing
- dh
- word2vec
---

The model is built on Leo Tolstoy's [collected works](https://github.com/tolstoydigital/TEI) and represents his individual semantics.

## Preparation

All texts are converted from the TEI markup, split into sentences and lemmatized. Only the modern orthography is kept in the data.

```python
import html
import os
import re
import shutil

from bs4 import BeautifulSoup

!pip install razdel  # for sentence splitting

from razdel import sentenize
from tqdm import tqdm

!git clone https://github.com/tolstoydigital/TEI.git

relevant_dirs = ['diaries', 'letters', 'notes', 'works']

# the bibliography file lets us treat fiction and non-fiction separately
path = 'TEI/reference/bibllist_works.xml'
xml = open(path).read()
soup = BeautifulSoup(xml, features="xml")

# map every part of a multi-part work to the identifier of the whole work
group_texts = {}
for it in soup.find_all("item"):
    ref = it.find("ref")
    for related in it.find_all("relatedItem"):
        for ref_ana in related.find_all("ref"):
            group_texts[ref_ana.text] = ref.text

prefix_texts = 'extracted_texts'
if os.path.exists(prefix_texts):
    shutil.rmtree(prefix_texts)
os.mkdir(prefix_texts)

# extract plain text from the XML files

complex_texts = {}
for rel_dir in relevant_dirs:
    path = os.path.join('TEI/texts', rel_dir)
    for file in tqdm(sorted(os.listdir(path))):
        fiction = 0
        if not file.endswith('.xml'):
            continue
        xml = open(os.path.join(path, file)).read()
        if 'Печатные варианты' in xml:  # skip files with printed variants
            continue
        nameID = file.replace('.xml', '')
        soup = BeautifulSoup(xml, features="xml")
        if soup.find("catRef", {"ana": "#fiction"}):
            fiction = 1
        s = soup.find("body")
        paragraphs = []
        for erase in s.find_all(["orig", "comments", "sic", "note"]):
            erase.decompose()
        for p in s.find_all(["p", "l"]):
            paragraphs.append(html.unescape(p.text.replace('\n', ' ').strip()))
        if not fiction:
            # non-fiction goes into one file per genre, one sentence per line
            with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f:
                for par in paragraphs:
                    par = re.sub(r' ([.,;:!?)"»])', r'\1', par)
                    par = par.replace('\n', ' ').strip()
                    par = re.sub(r'\s+', ' ', par)
                    par = re.sub(r'\[.+?\]', '', par)
                    for sent in sentenize(par):
                        f.write(sent.text.strip() + '\n')
        else:
            # fiction is grouped into one file per work
            if nameID in group_texts:
                hyper_name = group_texts[nameID]
                if hyper_name not in complex_texts:
                    complex_texts[hyper_name] = paragraphs
                else:
                    complex_texts[hyper_name].extend(paragraphs)
            else:
                with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f:
                    f.write('\n'.join(paragraphs))
for hyper_name in complex_texts:
    with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f:
        f.write('\n'.join(complex_texts[hyper_name]))

# lemmatization and POS tagging with Mystem

from pymystem3 import Mystem

pos = ['S', 'V', 'A', 'ADV']  # keep only content words

def tagging():
    m = Mystem()
    for fl in os.listdir(prefix_texts):
        if 'mystem' in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        ana_lines = []
        for line in lines:
            line = ' '.join(line.split()[1:])  # drop the first token of the line
            # normalize accented vowels
            for src, dst in [('ò', 'о'), ('è', 'е'), ('à', 'а'), ('ѝ', 'и'), ('ỳ', 'у'), ('о̀', 'о')]:
                line = line.replace(src, dst)
            ana = []
            info = m.analyze(line)
            for token in info:
                if "analysis" in token:
                    try:
                        analysis = token["analysis"][0]
                    except IndexError:
                        continue
                    lex = analysis["lex"]
                    gr = analysis['gr']
                    const = gr.split('=')[0]
                    if ',' in const:
                        pos_tag = const.split(',')[0]
                    else:
                        pos_tag = const
                    ana.append('{}_{}'.format(lex, pos_tag))
            ln = ' '.join(ana)
            if re.search('[А-Яа-я]', ln):
                ana_lines.append(ln)
        with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw:
            fw.write('\n'.join(ana_lines))

def mk_input():
    # keep only sentences with more than one content word
    inp = []
    for fl in os.listdir(prefix_texts):
        if 'mystem' not in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        for line in lines:
            words = []
            for w in line.split():
                word = w.split('_')
                if word[1] in pos:
                    words.append(w)
            if len(words) > 1:
                inp.append(' '.join(words))

    with open('input.txt', 'w') as fw:
        fw.write('\n'.join(inp))

tagging()
mk_input()
```
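
Each line of the resulting `input.txt` is one lemmatized sentence reduced to `lemma_POS` tokens of the four parts of speech listed above. A quick way to inspect the output (the sample line in the comment is illustrative, not an actual corpus line):

```python
# print the first few lines of the training file produced by mk_input()
with open('input.txt') as f:
    for _ in range(3):
        print(f.readline().strip())

# a line looks roughly like this (illustrative example):
# жизнь_S быть_V хороший_A
```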

## Models

There are two models in the repository. Their parameters are taken from the general-language models of the RusVectōrēs site, so that the results are comparable with them.

Here is the code for building the models:
|
182 |
+
import sys
|
183 |
+
import logging
|
184 |
+
import gensim
|
185 |
+
|
186 |
+
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
|
187 |
+
|
188 |
+
pth = './input.txt'
|
189 |
+
data = gensim.models.word2vec.LineSentence(pth) # train sentence by sentence
|
190 |
+
|
191 |
+
modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1) # comparable with web_mystem_skipgram_500_2_2015.bin
|
192 |
+
|
193 |
+
modelLNT1.save('skipgram_500_2.model') # saving
|
194 |
+
|
195 |
+
modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0) # comparable with ruwikiruscorpora_upos_cbow_300_10_2021
|
196 |
+
|
197 |
+
modelLNT2.save('cbow_300_10.model')
|
198 |
+
```
|

## Usage

```python
from gensim.models import Word2Vec

# load the models
modelLNT1 = Word2Vec.load("skipgram_500_2.model")
modelLNT2 = Word2Vec.load("cbow_300_10.model")

# visualization of the most similar words

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names):  # adapted code
    """Plot in seaborn the t-SNE dimensionality reduction of the vectors of a query word,
    its most similar words, and a list of additional words.
    """
    arrays = np.empty((0, model.vector_size), dtype='f')
    word_labels = [word]
    color_list = ['red']

    # add the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)

    # get the list of most similar words
    close_words = model.wv.most_similar([word])

    # add the vector of each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # add the vector of each word from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # reduce the dimensionality to 20 dimensions with PCA
    reduc = PCA(n_components=20).fit_transform(arrays)

    # find t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)

    # set everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                  }
                     )

    # add annotations one by one with a loop
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal'
                ).set_size(15)

    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    plt.title('t-SNE visualization for {}'.format(word.title()))

tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["бог_S"])])
```

![](./god.png)
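
Beyond the visualization, the models can be queried directly. A minimal sketch of nearest-neighbour and similarity queries (query words follow the `lemma_POS` format used above; `вера_S` is only an illustrative example and must occur in the corpus for the call to work):

```python
from gensim.models import Word2Vec

model = Word2Vec.load("cbow_300_10.model")

# ten nearest neighbours of a query word in Tolstoy's semantic space
for word, score in model.wv.most_similar('бог_S', topn=10):
    print(f'{word}\t{score:.3f}')

# cosine similarity between two lemmas
print(model.wv.similarity('бог_S', 'вера_S'))
```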

## Train data

The training corpus is included in this repository as the `input.txt` file. It contains more than 7 million words.
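
A quick sanity check of the corpus size, as a sketch (counts the whitespace-separated `lemma_POS` tokens in `input.txt`):

```python
# count the lemma_POS tokens in the training corpus
with open('input.txt') as f:
    n_tokens = sum(len(line.split()) for line in f)
print(n_tokens)  # expected to be above 7,000,000
```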

## Publication

Орехов Б. В. [Индивидуальная семантика Л. Н. Толстого в свете векторных моделей](https://human.spbstu.ru/article/2023.54.09/) [Individual semantics of Leo Tolstoy in the light of vector models] // Terra Linguistica. 2023. Vol. 14. No. 4. P. 119–129. DOI: 10.18721/JHSS.14409

```bibtex
@article{орехов2023индивидуальная,
  title={Индивидуальная семантика Л. Н. Толстого в свете векторных моделей},
  author={Орехов, Б.В.},
  journal={Terra Linguistica},
  volume={14},
  number={4},
  pages={119--129},
  year={2023},
  doi={10.18721/JHSS.14409}
}
```