endre sukosd committed on
Commit
3992084
1 Parent(s): 4b647de

Semantic Search HU implementation

.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ data/processed/shortened_abstracts_hu_2021_09_01.txt filter=lfs diff=lfs merge=lfs -text
29
+ data/processed/shortened_abstracts_hu_2021_09_01_embedded.pt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,191 @@
1
+ # Custom
2
+ hf_venv/
3
+ data/
4
+ *.DS_Store
5
+
6
+ # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,jupyternotebooks,venv
7
+ # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,python,jupyternotebooks,venv
8
+
9
+ ### JupyterNotebooks ###
10
+ # gitignore template for Jupyter Notebooks
11
+ # website: http://jupyter.org/
12
+
13
+ .ipynb_checkpoints
14
+ */.ipynb_checkpoints/*
15
+
16
+ # IPython
17
+ profile_default/
18
+ ipython_config.py
19
+
20
+ # Remove previous ipynb_checkpoints
21
+ # git rm -r .ipynb_checkpoints/
22
+
23
+ ### Python ###
24
+ # Byte-compiled / optimized / DLL files
25
+ __pycache__/
26
+ *.py[cod]
27
+ *$py.class
28
+
29
+ # C extensions
30
+ *.so
31
+
32
+ # Distribution / packaging
33
+ .Python
34
+ build/
35
+ develop-eggs/
36
+ dist/
37
+ downloads/
38
+ eggs/
39
+ .eggs/
40
+ lib/
41
+ lib64/
42
+ parts/
43
+ sdist/
44
+ var/
45
+ wheels/
46
+ share/python-wheels/
47
+ *.egg-info/
48
+ .installed.cfg
49
+ *.egg
50
+ MANIFEST
51
+
52
+ # PyInstaller
53
+ # Usually these files are written by a python script from a template
54
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
55
+ *.manifest
56
+ *.spec
57
+
58
+ # Installer logs
59
+ pip-log.txt
60
+ pip-delete-this-directory.txt
61
+
62
+ # Unit test / coverage reports
63
+ htmlcov/
64
+ .tox/
65
+ .nox/
66
+ .coverage
67
+ .coverage.*
68
+ .cache
69
+ nosetests.xml
70
+ coverage.xml
71
+ *.cover
72
+ *.py,cover
73
+ .hypothesis/
74
+ .pytest_cache/
75
+ cover/
76
+
77
+ # Translations
78
+ *.mo
79
+ *.pot
80
+
81
+ # Django stuff:
82
+ *.log
83
+ local_settings.py
84
+ db.sqlite3
85
+ db.sqlite3-journal
86
+
87
+ # Flask stuff:
88
+ instance/
89
+ .webassets-cache
90
+
91
+ # Scrapy stuff:
92
+ .scrapy
93
+
94
+ # Sphinx documentation
95
+ docs/_build/
96
+
97
+ # PyBuilder
98
+ .pybuilder/
99
+ target/
100
+
101
+ # Jupyter Notebook
102
+
103
+ # IPython
104
+
105
+ # pyenv
106
+ # For a library or package, you might want to ignore these files since the code is
107
+ # intended to run in multiple environments; otherwise, check them in:
108
+ # .python-version
109
+
110
+ # pipenv
111
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
112
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
113
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
114
+ # install all needed dependencies.
115
+ #Pipfile.lock
116
+
117
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
118
+ __pypackages__/
119
+
120
+ # Celery stuff
121
+ celerybeat-schedule
122
+ celerybeat.pid
123
+
124
+ # SageMath parsed files
125
+ *.sage.py
126
+
127
+ # Environments
128
+ .env
129
+ .venv
130
+ env/
131
+ venv/
132
+ ENV/
133
+ env.bak/
134
+ venv.bak/
135
+
136
+ # Spyder project settings
137
+ .spyderproject
138
+ .spyproject
139
+
140
+ # Rope project settings
141
+ .ropeproject
142
+
143
+ # mkdocs documentation
144
+ /site
145
+
146
+ # mypy
147
+ .mypy_cache/
148
+ .dmypy.json
149
+ dmypy.json
150
+
151
+ # Pyre type checker
152
+ .pyre/
153
+
154
+ # pytype static type analyzer
155
+ .pytype/
156
+
157
+ # Cython debug symbols
158
+ cython_debug/
159
+
160
+ ### venv ###
161
+ # Virtualenv
162
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
163
+ [Bb]in
164
+ [Ii]nclude
165
+ [Ll]ib
166
+ [Ll]ib64
167
+ [Ll]ocal
168
+ [Ss]cripts
169
+ pyvenv.cfg
170
+ pip-selfcheck.json
171
+
172
+ ### VisualStudioCode ###
173
+ .vscode/*
174
+ !.vscode/settings.json
175
+ !.vscode/tasks.json
176
+ !.vscode/launch.json
177
+ !.vscode/extensions.json
178
+ *.code-workspace
179
+
180
+ # Local History for Visual Studio Code
181
+ .history/
182
+
183
+ ### VisualStudioCode Patch ###
184
+ # Ignore all local history of files
185
+ .history
186
+ .ionide
187
+
188
+ # Support for Project snippet scope
189
+ !.vscode/*.code-snippets
190
+
191
+ # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,jupyternotebooks,venv
README.md CHANGED
@@ -1,37 +1,66 @@
1
  ---
2
  title: SemanticSearch HU
 
3
  emoji: 💻
4
- colorFrom: red
5
- colorTo: indigo
 
 
 
6
  sdk: streamlit
7
- app_file: app.py
 
 
8
  pinned: false
 
9
  ---
10
 
11
- # Configuration
12
 
13
- `title`: _string_
14
- Display title for the Space
15
 
16
- `emoji`: _string_
17
- Space emoji (emoji-only character allowed)
18
 
19
- `colorFrom`: _string_
20
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
21
 
22
- `colorTo`: _string_
23
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
 
 
 
 
 
24
 
25
- `sdk`: _string_
26
- Can be either `gradio` or `streamlit`
27
 
28
- `sdk_version` : _string_
29
- Only applicable for `streamlit` SDK.
30
- See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
31
 
32
- `app_file`: _string_
33
- Path to your main application file (which contains either `gradio` or `streamlit` Python code).
34
- Path is relative to the root of the repository.
35
 
36
- `pinned`: _boolean_
37
- Whether the Space stays on top of your list.
1
  ---
2
  title: SemanticSearch HU
3
+
4
  emoji: 💻
5
+
6
+ colorFrom: green
7
+
8
+ colorTo: white
9
+
10
  sdk: streamlit
11
+
12
+ app_file: src/app.py
13
+
14
  pinned: false
15
+
16
  ---
17
 
18
+ # Hugging Face Course Project - November 2021
19
+ ## Semantic Search system in Hungarian
20
+
21
+ This repo contains my course project, created during the week of the Hugging Face Course Launch Community Event. The selected project is a dense-retrieval-based semantic search system in my own language, Hungarian. It is based on [this question-answering project idea description](https://discuss.huggingface.co/t/build-a-question-answering-system-in-your-own-language/11570/2).
22
+
23
+ ## Approach
24
+ - finding a **dataset** of question/answer pairs or descriptive paragraphs in my target language (Hungarian)
25
+ - using a **pretrained model** (Hungarian or multilingual) to generate embeddings for all answers, preferably using sentence-transformers
26
+ - **searching for top-K matches** - when a user query is entered, generate the query embedding and search through all the answer embeddings to find the top-K most likely documents
27
+
28
+ ## Dataset - raw text
29
+
30
+ Two datasets were evaluated:
31
+ 1. [not used] [MQA - multilingual Question-Answering](https://huggingface.co/datasets/clips/mqa), with a Hungarian subset
32
+
33
+ This dataset contains two types of data:
34
+ * FAQ: about 800.000 questions and answers scraped from different websites (Common Crawl). The problem with this dataset is that it only contains text from roughly 2.000 different domains (so many of the questions and answers are repetitive), and the quality of the answers varies greatly; for some domains it is not really relevant (for example, full of URL references).
35
+ * CQA: about 27.000 community question answering examples scraped from different forums. Here, for every question there are several answers, but again the quality of the answers varies greatly, with many answers not being relevant.
36
+
37
+ 2. **[used] [DBpedia - short abstracts in Hungarian](https://databus.dbpedia.org/dbpedia/text/short-abstracts)**
38
+
39
+ This data contains 450.000 shortened abstracts from Wikipedia in Hungarian. Each is the text before the table of contents of a Wikipedia article, shortened to approximately 2-3 sentences. These texts seemed like high-quality paragraphs, so I decided to use them as a bank of "answers".
40
+
41
+ The data is in the RDF Turtle format (Resource Description Framework), a rich format for relating metadata and modelling information. In our case we only need a fraction of this data: the pure text of each abstract. The raw text was extracted using the `rdflib` library, as seen in the script `src/data/dbpedia_dump_wiki_text.py`.
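A minimal sketch of that extraction (the dump file name is illustrative, and this sketch keeps every object literal rather than filtering on a specific predicate):

```python
from rdflib import Graph

# Parse the DBpedia short-abstracts Turtle dump (file name is illustrative).
g = Graph()
g.parse("short-abstracts_lang=hu.ttl", format="turtle")

# Keep only the plain text of each abstract, i.e. the object literal of each triple.
with open("data/processed/shortened_abstracts_hu_2021_09_01.txt", "w", encoding="utf-8") as out:
    for _subject, _predicate, abstract in g:
        out.write(str(abstract).replace("\n", " ") + "\n")
```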
42
+
43
+ ## Model - precalculate embeddings
44
 
45
+ To generate the embeddings for each paragraph/shortened abstract, a sentence-embedding approach was used. [SBERT.net](https://www.sbert.net/index.html) offers a framework and many pretrained models, in more than 100 languages, for creating embeddings and comparing them to find the ones with similar meaning.
 
46
 
47
+ This task is also called STS (Semantic Textual Similarity) or semantic search: it seeks to find similarity not just through lexical matches but by comparing vector representations of the content, thus improving accuracy.
 
48
 
49
+ There were various [pretrained models](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models) to choose from. For this project the **`paraphrase-multilingual-MiniLM-L12-v2`** checkpoint is used: at 418 MB it is one of the smallest multilingual models, yet it has the second-fastest encoding speed, which seems like a good compromise.
 
50
 
51
+ ```
52
+ Model facts:
53
+ - Checkpoint name: paraphrase-multilingual-MiniLM-L12-v2
54
+ - Dimensions: 384
55
+ - Suitable Score Functions: cosine-similarity
56
+ - Pooling: Mean Pooling
57
+ ```
58
 
59
+ - Embeddings were calculated based on code examples from [huggingface hub](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
60
+ - Similarity scores were calculated based on code example from [sentence-transformers site](https://www.sbert.net/examples/applications/semantic-search/README.html)
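Condensed from the precalculation notebook, the embedding step looks roughly like this:

```python
import torch
from transformers import AutoTokenizer, AutoModel

checkpoint = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, using the attention mask so that
    # padding tokens do not contribute to the sentence embedding.
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

def embed(sentences):
    encoded = tokenizer(sentences, padding=True, truncation=True,
                        max_length=128, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**encoded)
    return mean_pooling(model_output, encoded["attention_mask"])
```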
61
 
62
+ To reproduce the precalculated embeddings, use the notebook `notebooks/QA_retrieval_precalculate_embeddings.ipynb` with a GPU in Google Colab.
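The notebook embeds the raw text in batches of 1000 lines and appends each batch of embeddings to a pickle file, so an interrupted run loses at most one batch. A condensed sketch, reusing the `embed` helper above:

```python
import pickle

BATCH_SIZE = 1000  # batch size used in the notebook

def save_batch(embeddings, path):
    # Append each batch; the file is later rebuilt with repeated pickle.load calls.
    with open(path, "ab") as f:
        pickle.dump(embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

with open("shortened_abstracts_hu_2021_09_01.txt", encoding="utf-8") as f:
    batch = []
    for line in f:
        batch.append(line)
        if len(batch) == BATCH_SIZE:
            save_batch(embed(batch), "embeddings.pkl")
            batch = []
    if batch:  # final partial batch
        save_batch(embed(batch), "embeddings.pkl")
```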
 
 
63
 
64
+ ## Search for top-K matches
 
 
65
 
66
+ Finally, having all the precalculated embeddings, we can implement semantic search (dense retrieval). We encode the search query into the same vector space and retrieve the document embeddings that are closest to it (using cosine similarity). By default the top 5 most similar Wikipedia abstracts are returned, as can be seen in the main script `src/main_qa.py`.
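A minimal sketch of the search step, reusing the `embed` helper above and assuming the precalculated embeddings have been loaded (e.g. with `torch.load('shortened_abstracts_hu_2021_09_01_embedded.pt')`) into `corpus_embeddings`, with the matching abstracts in the list `all_sentences` (variable names are illustrative):

```python
import torch
from sentence_transformers import util

def find_top_k(query_embedding, corpus_embeddings, k=5):
    # Cosine similarity between the query and every precalculated abstract embedding.
    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings).squeeze(0)
    top = torch.topk(scores, k)
    return list(zip(top.indices.tolist(), top.values.tolist()))

# Example query from the notebook: "Which is the most populous city in the world?"
query_embedding = embed(["Melyik a legnépesebb város a világon?"])
for idx, score in find_top_k(query_embedding, corpus_embeddings, k=5):
    print(f"{score:.4f}\t{all_sentences[idx]}")
```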
 
approach.txt ADDED
@@ -0,0 +1,49 @@
1
+
2
+ Types of Question Answering
3
+ - extractive question answering (encoder-only models, e.g. BERT)
4
+ - posing questions about a document and identifying the answers as spans of text in the document itself
5
+ - generative question answering (encoder-decoder models, e.g. T5/BART)
6
+ - open-ended questions, which need to synthesize information
7
+ - retrieval-based/community question answering
8
+
9
+
10
+
11
+ First approach - translate dataset, fine-tune model
12
+ !Not really feasible, because it needs lots of human evaluation to correctly determine the answer start token
13
+
14
+ 1. Translate English QA dataset into Hungarian
15
+ - SQuAD - reading comprehension based on Wikipedia articles
16
+ - ~ 100.000 question/answers
17
+ 2. Fine-tune a model and evaluate on this dataset
18
+
19
+
20
+ Second approach - fine-tune multilingual model
21
+ !MQA format is different from SQuAD, so ModelForQuestionAnswering cannot be used
22
+
23
+ 1. Use a Hungarian dataset
24
+ - MQA - multilingual parsed from Common Crawl
25
+ - FAQ - 878.385 (2.415 domain)
26
+ - CQA - 27.639 (171 domain)
27
+ 2. Fine-tune and evaluate a model on this dataset
28
+
29
+
30
+ Possible steps:
31
+ - Use an existing pre-trained model (Hungarian, Romanian, or multilingual) to generate embeddings
32
+ - Select Model:
33
+ - multilingual which includes hu:
34
+ - distiluse-base-multilingual-cased-v2 (400MB)
35
+ - paraphrase-multilingual-MiniLM-L12-v2 (400MB) - fastest
36
+ - paraphrase-multilingual-mpnet-base-v2 (900MB) - best performing
37
+ - hubert
38
+ - Select a dataset
39
+ - use MQA hungarian subset
40
+ - use hungarian wikipedia pages data, split it up
41
+ - DBpedia, shortened abstracts = 500.000
42
+ - Pre-compute embeddings for all answers/paragraphs
43
+ - Compute embedding for incoming query
44
+ - Compare similarity between query embedding and precomputed embeddings
45
+ - return top-3 answers/questions
46
+
47
+ Alternative steps:
48
+ - train a sentence transformer on the Hungarian / Romanian subsets
49
+ - Use the trained sentence transformer to generate embeddings
notebooks/QA_retrieval_precalculate_embeddings.ipynb ADDED
@@ -0,0 +1 @@
 
1
+ {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"QA_retrieval_huggingface_couser_2021_Nov.ipynb","provenance":[],"collapsed_sections":[],"mount_file_id":"1e_NcpgIuSh8rfI_Xf16ltcybK8TbgJWB","authorship_tag":"ABX9TyN3TvKBRyS+wRVSLWNFgC+f"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","metadata":{"id":"GI4Sz98ItJW7"},"source":["# TPU\n","# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.10-cp37-cp37m-linux_x86_64.whl"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"97-OsdFhlD20","executionInfo":{"status":"ok","timestamp":1637680969592,"user_tz":-60,"elapsed":3348,"user":{"displayName":"Sukosd Endre","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GioD4JjUyxYNK5t_w13NsB1TIlQ1P_x6Xj99-re5w=s64","userId":"02963673169135048018"}},"outputId":"c47a98a7-f016-4a4f-827b-edc9229c5eca"},"source":["!pip install transformers sentence_transformers"],"execution_count":9,"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.12.5)\n","Requirement already satisfied: sentence_transformers in /usr/local/lib/python3.7/dist-packages (2.1.0)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (6.0)\n","Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n","Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n","Requirement already satisfied: huggingface-hub<1.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.1.2)\n","Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.46)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.4.0)\n","Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.8.2)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers) (3.10.0.2)\n","Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.6)\n","Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (3.2.5)\n","Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (1.0.1)\n","Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (1.4.1)\n","Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (0.1.96)\n","Requirement already satisfied: torchvision 
in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (0.11.1+cu111)\n","Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (1.10.0+cu111)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.6.0)\n","Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk->sentence_transformers) (1.15.0)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.10.8)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n","Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n","Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sentence_transformers) (3.0.0)\n","Requirement already satisfied: pillow!=8.3.0,>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision->sentence_transformers) (7.1.2)\n"]}]},{"cell_type":"code","metadata":{"id":"3-jkyQkdkdPQ","executionInfo":{"status":"ok","timestamp":1637680970023,"user_tz":-60,"elapsed":3,"user":{"displayName":"Sukosd Endre","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GioD4JjUyxYNK5t_w13NsB1TIlQ1P_x6Xj99-re5w=s64","userId":"02963673169135048018"}}},"source":["from transformers import AutoTokenizer, AutoModel\n","import torch\n","import pickle\n","from sentence_transformers import util\n","from datetime import datetime"],"execution_count":10,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kA2h5mH8m-n8","executionInfo":{"status":"ok","timestamp":1637654036646,"user_tz":-60,"elapsed":26589,"user":{"displayName":"Sukosd Endre","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GioD4JjUyxYNK5t_w13NsB1TIlQ1P_x6Xj99-re5w=s64","userId":"02963673169135048018"}},"outputId":"88fcd97f-276c-4f70-de60-d1c5c9810443"},"source":["from google.colab import drive\n","drive.mount('/content/drive')\n","#drive.mount('/content/drive', force_remount=True)"],"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"markdown","metadata":{"id":"b8SkQGWuB1z7"},"source":["# Load pretrained \n","\n","- multilingual sentence transformers from checkpoint\n","- tokenizer from checkpoint"]},{"cell_type":"code","metadata":{"id":"1R83LLVAk98K","executionInfo":{"status":"ok","timestamp":1637655426545,"user_tz":-60,"elapsed":6237,"user":{"displayName":"Sukosd Endre","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GioD4JjUyxYNK5t_w13NsB1TIlQ1P_x6Xj99-re5w=s64","userId":"02963673169135048018"}}},"source":["multilingual_checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'\n","tokenizer = AutoTokenizer.from_pretrained(multilingual_checkpoint)\n","model = 
AutoModel.from_pretrained(multilingual_checkpoint)"],"execution_count":3,"outputs":[]},{"cell_type":"code","metadata":{"id":"wcdik3tQpkyi"},"source":["# GPU\n","device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n","model.to(device)\n","print(device)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"-YzAkemLsrC9"},"source":["# TPU\n","# unfortunately incompatible wheel package for pytorch-xla 1.10 version\n","#import torch_xla.core.xla_model as xm\n","#device = xm.xla_device()\n","#print(device)\n","#pip list | grep torch"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"dfeEQJOglxdw","executionInfo":{"status":"ok","timestamp":1637682096594,"user_tz":-60,"elapsed":362,"user":{"displayName":"Sukosd Endre","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GioD4JjUyxYNK5t_w13NsB1TIlQ1P_x6Xj99-re5w=s64","userId":"02963673169135048018"}}},"source":["#Mean Pooling - Take attention mask into account for correct averaging\n","def mean_pooling(model_output, attention_mask):\n"," token_embeddings = model_output[0] #First element of model_output contains all token embeddings\n"," input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n"," sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)\n"," sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n"," return sum_embeddings / sum_mask\n","\n","def calculateEmbeddings(sentences,tokenizer,model,device=\"cpu\"):\n"," tokenized_sentences = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')\n"," tokenized_sentences.to(device)\n"," with torch.no_grad():\n"," model_output = model(**tokenized_sentences)\n"," sentence_embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])\n"," del tokenized_sentences\n"," torch.cuda.empty_cache()\n"," return sentence_embeddings\n","\n","def findTopKMostSimilar(query_embedding, embeddings, k):\n"," cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)\n"," cosine_scores_list = cosine_scores.squeeze().tolist()\n"," pairs = []\n"," for idx,score in enumerate(cosine_scores_list):\n"," pairs.append({'index': idx, 'score': score})\n"," pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)\n"," return pairs[0:k]\n","\n","def saveToDisc(embeddings, output_filename):\n"," with open(output_filename, \"ab\") as f:\n"," pickle.dump(embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)"],"execution_count":23,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"MddjkKfMCH81"},"source":["# Create sentence embeddings\n","\n","\n","* Load sentences from raw text file\n","* Precalculate in batches of 1000, to avoid running out of memory\n","* Save to disc/files incrementally, to be able to reuse later (in total 5 files of 100.000 embedding each)\n","\n"]},{"cell_type":"code","metadata":{"id":"yfOsCAVImIAl"},"source":["batch_size = 1000\n","\n","raw_text_file = '/content/drive/MyDrive/huggingface/shortened_abstracts_hu_2021_09_01.txt'\n","datetime_formatted = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')\n","output_embeddings_file_batched = f'/content/drive/MyDrive/huggingface/embeddings_{batch_size}_batches_at_{datetime_formatted}.pkl'\n","output_embeddings_file = f'/content/drive/MyDrive/huggingface/embeddings_at_{datetime_formatted}.pkl'\n","\n","print(datetime.now())\n","concated_sentence_embeddings = None\n","all_sentences = []\n","line = 'init'\n","total_read = 0\n","total_read_limit = 500000\n","skip_index 
= 400000\n","with open(raw_text_file) as f:\n"," while line and total_read < total_read_limit:\n"," count = 0\n"," sentence_batch = []\n"," while line and count < batch_size:\n"," line = f.readline()\n"," sentence_batch.append(line)\n"," count += 1\n"," \n"," all_sentences.extend(sentence_batch)\n"," \n"," if total_read >= skip_index:\n"," sentence_embeddings = calculateEmbeddings(sentence_batch,tokenizer,model,device)\n"," if concated_sentence_embeddings == None:\n"," concated_sentence_embeddings = sentence_embeddings\n"," else:\n"," concated_sentence_embeddings = torch.cat([concated_sentence_embeddings, sentence_embeddings], dim=0)\n"," print(concated_sentence_embeddings.size())\n"," saveToDisc(sentence_embeddings,output_embeddings_file_batched)\n"," total_read += count\n","print(datetime.now())"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1rGQc9GRCuNy"},"source":["# Test: Query embeddings"]},{"cell_type":"code","metadata":{"id":"FT7CwpM0Bwhi"},"source":["query_embedding = calculateEmbeddings(['Melyik a legnépesebb város a világon?'],tokenizer,model,device)\n","top_pairs = findTopKMostSimilar(query_embedding, concated_sentence_embeddings, 5)\n","\n","for pair in top_pairs:\n"," i = pair['index']\n"," score = pair['score']\n"," print(\"{} \\t\\t Score: {:.4f}\".format(all_sentences[skip_index+i], score))"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6Hdu_5FiDYJr"},"source":["# Test: Load pre-calculated embeddings\n","\n","* Load embedding from files and stitch them together\n","* Save into one file\n"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gkWt0Uj_Ddsp","executionInfo":{"status":"ok","timestamp":1637682006152,"user_tz":-60,"elapsed":1722,"user":{"displayName":"Sukosd Endre","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GioD4JjUyxYNK5t_w13NsB1TIlQ1P_x6Xj99-re5w=s64","userId":"02963673169135048018"}},"outputId":"1921456e-1fd6-4218-9ebb-cbe503f402b1"},"source":["def concatTensors(new_tensor, acc_tensor='None'):\n"," if acc_tensor == None:\n"," acc_tensor = new_tensor\n"," else:\n"," acc_tensor = torch.cat([acc_tensor, new_tensor], dim=0)\n"," return acc_tensor\n","\n","def loadFromDisc(batch_size, number_of_batches, filename):\n"," concated_sentence_embeddings = None\n"," count = 0\n"," batches = 0\n"," with open(filename, \"rb\") as f:\n"," loaded_embeddings = torch.empty([batch_size])\n"," while count < number_of_batches and loaded_embeddings.size()[0]==batch_size:\n"," loaded_embeddings = pickle.load(f)\n"," count += 1\n"," concated_sentence_embeddings = concatTensors(loaded_embeddings,concated_sentence_embeddings)\n"," print(f'Read file using {count} number of read+unpickle operations')\n"," print(concated_sentence_embeddings.size())\n"," return concated_sentence_embeddings\n","\n","\n","output_embeddings_file = 'data/processed/DBpedia_shortened_abstracts_hu_embeddings.pkl'\n","\n","embeddings_files = [\n"," '/content/drive/MyDrive/huggingface/embeddings_1000_batches_at_2021-11-23_08:17:17.pkl',\n"," '/content/drive/MyDrive/huggingface/embeddings_1000_batches_at_2021-11-23_08:28:46.pkl',\n"," '/content/drive/MyDrive/huggingface/embeddings_1000_batches_at_2021-11-23_08:40:54.pkl',\n"," '/content/drive/MyDrive/huggingface/embeddings_1000_batches_at_2021-11-23_08:56:26.pkl',\n"," '/content/drive/MyDrive/huggingface/embeddings_1000_batches_at_2021-11-23_09:31:47.pkl'\n","]\n","\n","all_embeddings = None\n","for idx,emb_file in enumerate(embeddings_files):\n"," 
print(f'Processing file {idx}')\n"," file_embeddings = loadFromDisc(1000, 100, emb_file)\n"," all_embeddings = concatTensors(file_embeddings,all_embeddings)\n","\n","print(all_embeddings.size())"],"execution_count":20,"outputs":[{"output_type":"stream","name":"stdout","text":["Processing file 0\n","Read file using 100 number of read+unpickle operations\n","torch.Size([100000, 384])\n","Processing file 1\n","Read file using 100 number of read+unpickle operations\n","torch.Size([100000, 384])\n","Processing file 2\n","Read file using 100 number of read+unpickle operations\n","torch.Size([100000, 384])\n","Processing file 3\n","Read file using 100 number of read+unpickle operations\n","torch.Size([100000, 384])\n","Processing file 4\n","Read file using 67 number of read+unpickle operations\n","torch.Size([66529, 384])\n","torch.Size([466529, 384])\n"]}]},{"cell_type":"code","metadata":{"id":"M_8RHpNnIU7o","executionInfo":{"status":"ok","timestamp":1637683739951,"user_tz":-60,"elapsed":384,"user":{"displayName":"Sukosd Endre","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GioD4JjUyxYNK5t_w13NsB1TIlQ1P_x6Xj99-re5w=s64","userId":"02963673169135048018"}}},"source":["all_embeddings_output_file = '/content/drive/MyDrive/huggingface/shortened_abstracts_hu_2021_09_01_embedded.pt'\n","#saveToDisc(all_embeddings, all_embeddings_output_file)\n","torch.save(all_embeddings,all_embeddings_output_file)"],"execution_count":28,"outputs":[]},{"cell_type":"code","metadata":{"id":"LYCwyDpMjsXg"},"source":[""],"execution_count":null,"outputs":[]}]}
notebooks/dbpedia_qa_test.ipynb ADDED
@@ -0,0 +1,288 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 18,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Using custom data configuration hu-faq-question-language=hu,scope=faq\n",
13
+ "Reusing dataset mqa (/Users/eend/.cache/huggingface/datasets/clips___mqa/hu-faq-question-language=hu,scope=faq/0.0.0/7eda4cdcbd6f009259fc516f204d776915a5f54ea2ad414c3dcddfaacd4dfe0b)\n",
14
+ "100%|██████████| 1/1 [00:00<00:00, 70.47it/s]\n",
15
+ "Using custom data configuration hu-cqa-question-language=hu,scope=cqa\n",
16
+ "Reusing dataset mqa (/Users/eend/.cache/huggingface/datasets/clips___mqa/hu-cqa-question-language=hu,scope=cqa/0.0.0/7eda4cdcbd6f009259fc516f204d776915a5f54ea2ad414c3dcddfaacd4dfe0b)\n",
17
+ "100%|██████████| 1/1 [00:00<00:00, 389.26it/s]\n",
18
+ "Downloading: 5.27kB [00:00, 2.07MB/s] \n",
19
+ "Downloading: 2.36kB [00:00, 1.39MB/s] \n"
20
+ ]
21
+ },
22
+ {
23
+ "name": "stdout",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /Users/eend/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...\n"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "Downloading: 30.3MB [00:00, 78.5MB/s]\n",
34
+ "Downloading: 4.85MB [00:00, 63.4MB/s] \n",
35
+ "100%|██████████| 2/2 [00:01<00:00, 1.16it/s]\n",
36
+ "100%|██████████| 2/2 [00:00<00:00, 709.70it/s]\n"
37
+ ]
38
+ },
39
+ {
40
+ "name": "stdout",
41
+ "output_type": "stream",
42
+ "text": [
43
+ "Dataset squad downloaded and prepared to /Users/eend/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.\n"
44
+ ]
45
+ },
46
+ {
47
+ "name": "stderr",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "100%|██████████| 2/2 [00:00<00:00, 259.74it/s]\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "from datasets import load_dataset\n",
56
+ "\n",
57
+ "faq_hu = load_dataset(\"clips/mqa\", scope=\"faq\", language=\"hu\")\n",
58
+ "cqa_hu = load_dataset(\"clips/mqa\", scope=\"cqa\", language=\"hu\")\n",
59
+ "squad = load_dataset(\"squad\")"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 19,
65
+ "metadata": {},
66
+ "outputs": [
67
+ {
68
+ "data": {
69
+ "text/plain": [
70
+ "{'id': ['5733be284776f41900661182', '5733be284776f4190066117f'],\n",
71
+ " 'title': ['University_of_Notre_Dame', 'University_of_Notre_Dame'],\n",
72
+ " 'context': ['Architecturally, the school has a Catholic character. Atop the Main Building\\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',\n",
73
+ " 'Architecturally, the school has a Catholic character. Atop the Main Building\\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'],\n",
74
+ " 'question': ['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',\n",
75
+ " 'What is in front of the Notre Dame Main Building?'],\n",
76
+ " 'answers': [{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]},\n",
77
+ " {'text': ['a copper statue of Christ'], 'answer_start': [188]}]}"
78
+ ]
79
+ },
80
+ "execution_count": 19,
81
+ "metadata": {},
82
+ "output_type": "execute_result"
83
+ }
84
+ ],
85
+ "source": [
86
+ "squad['train'][:2]"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 21,
92
+ "metadata": {},
93
+ "outputs": [
94
+ {
95
+ "data": {
96
+ "text/plain": [
97
+ "{'id': ['4ddf184a540032092a43461d4904ffc0',\n",
98
+ " '2d3fd2e40d3369e9e03acb43f8290d23'],\n",
99
+ " 'text': ['\\n**[JavaFX 1.0](http://www.javafx.com/)** adták csütörtök december 4. \\n\\n\\nMint a fejlesztő, mit gondol a JavaFX? A munkahelyen, van olyan tervei, hogy lépjenek előre JavaFX-alapú alkalmazások vagy weboldalak? Van rövid távú tervek tanulni JavaFX?\\n\\n ',\n",
100
+ " '\\nÉn portolása egy játék, amelyet eredetileg írt a Win32 API, Linux (jó, portolása az OS X port a Win32 port Linux).\\n\\n\\nAzt már végre `QueryPerformanceCounter`azzal, hogy a uSeconds mivel az eljárás elindításához: \\n\\n\\n\\n```\\nBOOL QueryPerformanceCounter(LARGE_INTEGER* performanceCount)\\n{\\n gettimeofday(&currentTimeVal, NULL);\\n performanceCount->QuadPart = (currentTimeVal.tv_sec - startTimeVal.tv_sec);\\n performanceCount->QuadPart *= (1000 * 1000);\\n performanceCount->QuadPart += (currentTimeVal.tv_usec - startTimeVal.tv_usec);\\n\\n return true;\\n}\\n\\n```\\n\\nEz, párosulva `QueryPerformanceFrequency()`így állandó 1000000 a frekvencia, jól működik **a gépemen** , hogy nekem egy 64 bites változót, amely `uSeconds`, mivel a program induló.\\n\\n\\nÍgy *van ez a hordozható?* Nem akarom, hogy felfedezzék azt másként működik, ha a kernel-ben összeállított egy bizonyos módon, vagy ilyesmi. Jól vagyok vele, hogy nem hordozható, hogy valami más, mint a Linux, de.\\n\\n '],\n",
101
+ " 'name': ['JavaFX 1.0 megjelent. Ön mit gondol?\\n====================================\\n\\n',\n",
102
+ " 'Van gettimeofday () garantáltan a us felbontás?\\n===============================================\\n\\n'],\n",
103
+ " 'domain': ['coredump.biz', 'coredump.biz'],\n",
104
+ " 'bucket': ['2020.05', '2020.10'],\n",
105
+ " 'answers': [[{'text': '\\nAmennyire én tudom JavaFX esik lapos rajta arcát. Átnéztem a demókat és példa forráskódot és nem vagyok lenyűgözve. JavaFX egy fárasztó csata, hogy a versenyt az Adobe és a Silverlight amelyeket már a vad egy ideig. Figyeljük meg, hogy én vagyok sokáig Java fejlesztő.\\n\\n ',\n",
106
+ " 'name': '',\n",
107
+ " 'is_accepted': False},\n",
108
+ " {'text': '\\nÉn biztosan gondolom, hogy érdemes egy pillantást, mint amilyennek látszik, mint a RIA itt maradni, és a több platformon / döntéseket, annál jobb. Sun biztos módja mögött, bár tekintve Micrsoft késett a játék Siliverlight és még előttünk álló út V Ha mást nem is, azt szeretném látni, hogy mit tett a Sun másként azok végrehajtását az Adobe és a Microsoft.\\n\\n ',\n",
109
+ " 'name': '',\n",
110
+ " 'is_accepted': False},\n",
111
+ " {'text': '\\nJavaFX az egyetlen nyílt RIA platform, így azt hiszem, hogy felzárkózzon a verseny előbb vagy utóbb.\\n\\n\\nÚgy néz ki, nagyon jó a 1.0 verzió. Demos jól dokumentáltak, és kínál mindent, amit kell.\\n\\n\\nVannak problémák természetesen. Java applet tűnik javult egy kicsit, de ez még mindig messze elmarad. Berakás hosszú ideig magas CPU terhelés. Ez nem mutat előrelépést, mint a szokásos flash alkalmazás tenni, így a felhasználó nem lehet biztos abban, hogy az ő internet lassú, applet vagy nagy java lassú. Azt is el kellett fogadnia bizonyítvány, még több, mint egy néhány demót.\\n\\n ',\n",
112
+ " 'name': '',\n",
113
+ " 'is_accepted': False},\n",
114
+ " {'text': '\\nSzeretem a koncepció JavaFX, de nem volt az esélye, hogy bármit vele. Én nem rendesen Internet alkalmazások, így azt kell menni az utamból, hogy próbálja ki a legújabb platformokon.\\n\\n ',\n",
115
+ " 'name': '',\n",
116
+ " 'is_accepted': False},\n",
117
+ " {'text': '\\nMég egy dolog, hogy megtanulják, hogy szeretnék tanulni, hogy nem volt ideje megtanulni.\\n\\n\\nÍgéretesnek tűnik, de egyetértek másokkal. Ez egy fárasztó csata, és ott van a kétség, hogy ez lesz jellemző a hosszú távon. Egy pozitív Java FX, hogy ez lesz meghosszabbítja a karrierem-beruházások a Java nyelvet.\\n\\n\\nAzt is minél több RIA platformok kialakulni a JVM - így míg a Java csökkenhetnek a JVM továbbra is.\\n\\n ',\n",
118
+ " 'name': '',\n",
119
+ " 'is_accepted': False}],\n",
120
+ " [{'text': '\\nTalán. De van nagyobb problémákat. `gettimeofday()`eredményezhet helytelen időzítés, ha vannak olyan folyamatok a rendszer, hogy a változás az időzítő (azaz ntpd). Egy „normális” linux, bár úgy vélem, a felbontás `gettimeofday()`is 10us. Meg lehet ugrani előre és hátra, és időt, következésképpen alapuló folyamatok fut a rendszer. Ez hatékonyan teszi a választ a kérdésre nincs.\\n\\n\\nMeg kell nézni `clock_gettime(CLOCK_MONOTONIC)`az időzítés időközönként. Ez azonban számos kisebb problémák miatt a dolgok, mint a többmagos rendszerek és külső órajel beállításokat.\\n\\n\\nIs, nézd át a `clock_getres()`funkciót.\\n\\n ',\n",
121
+ " 'name': '',\n",
122
+ " 'is_accepted': True},\n",
123
+ " {'text': '\\nAz én tapasztalataim és amit olvastam az interneten keresztül, a válasz „Nem”, akkor nem garantált. Attól függ, hogy a processzor sebességét, az operációs rendszer, ízét Linux, stb\\n\\n ',\n",
124
+ " 'name': '',\n",
125
+ " 'is_accepted': False},\n",
126
+ " {'text': '\\n\\n> \\n> A tényleges felbontása gettimeofday () függ a hardver architektúra. Intel processzorok, valamint a SPARC gépeket kínálnak nagyfelbontású időzítő, amelyek mérik ezredmásodperc. Egyéb hardverarchitektúrák esik vissza a rendszer időzítő, amely tipikusan beállítása 100 Hz. Ezekben az esetekben az idő felbontás kevésbé lesznek pontosak.\\n> \\n> \\n> \\n\\n\\nKaptam ezt a választ [High Resolution Időmérés és időzítők, I. rész](http://web.archive.org/web/20160711223333/http://www.informit.com/guides/content.aspx?g=cplusplus&seqNum=272)\\n\\n ',\n",
127
+ " 'name': '',\n",
128
+ " 'is_accepted': False},\n",
129
+ " {'text': '\\n**Nagy felbontású, alacsony rezsi időzítése Intel processzorok**\\n\\n\\nHa az Intel hardver, itt van, hogy olvassa el a CPU valós idejű használati számláló. Azt fogja mondani, a CPU-ciklusok számát óta végrehajtott processzort elindult. Ez talán a legfinomabb szemcséjű számláló kaphat a teljesítmény méréséhez.\\n\\n\\nMegjegyzendő, hogy ez a szám a CPU ciklusokat. A linux kaphat a processzor sebességét a / proc / cpuinfo és osztódnak, hogy a másodpercek száma. Alakítja át ezt a kettős elég praktikus.\\n\\n\\nAmikor futtatom ezt én doboz, kapok\\n\\n\\n\\n```\\n11867927879484732\\n11867927879692217\\nit took this long to call printf: 207485\\n\\n```\\n\\nItt a [Intel fejlesztői útmutatót](http://cs.smu.ca/~jamuir/rdtscpm1.pdf) ad tonna részletességgel.\\n\\n\\n\\n```\\n#include <stdio.h>\\n#include <stdint.h>\\n\\ninline uint64_t rdtsc() {\\n uint32_t lo, hi;\\n __asm__ __volatile__ (\\n \"xorl %%eax, %%eax\\\\n\"\\n \"cpuid\\\\n\"\\n \"rdtsc\\\\n\"\\n : \"=a\" (lo), \"=d\" (hi)\\n :\\n : \"%ebx\", \"%ecx\");\\n return (uint64_t)hi << 32 | lo;\\n}\\n\\nmain()\\n{\\n unsigned long long x;\\n unsigned long long y;\\n x = rdtsc();\\n printf(\"%lld\\\\n\",x);\\n y = rdtsc();\\n printf(\"%lld\\\\n\",y);\\n printf(\"it took this long to call printf: %lld\\\\n\",y-x);\\n}\\n\\n```\\n ',\n",
130
+ " 'name': '',\n",
131
+ " 'is_accepted': False},\n",
132
+ " {'text': '\\n\\n> \\n> Tehát azt mondja ezredmásodperc kifejezetten, de azt mondja, a felbontás a rendszer órája nincs megadva. Gondolom felbontás ebben az összefüggésben azt jelenti, hogy az a legkisebb összeg, hogy valaha is növekedhet?\\n> \\n> \\n> \\n\\n\\nAz adatstruktúra úgy definiáljuk, mint amelynek mikroszekundum, mint egy mértékegység, de ez nem jelenti azt, hogy az óra vagy az operációs rendszer valójában képes mérni, hogy finoman.\\n\\n\\nMint a többi ember azt, `gettimeofday()`rossz, mert az idő beállításával okozhat órajelelcsúszás és dobja ki a számításból. `clock_gettime(CLOCK_MONOTONIC)`az, amit akarsz, és `clock_getres()`megmondja, hogy a pontosság az óra.\\n\\n ',\n",
133
+ " 'name': '',\n",
134
+ " 'is_accepted': False},\n",
135
+ " {'text': '\\n@Bernard:\\n\\n\\n\\n> \\n> Be kell vallanom, a legtöbb példa egyenesen a fejem fölött. Ez nem fordul le, és úgy tűnik, működik, mégis. Biztonságos ez az SMP rendszerek vagy SpeedStep?\\n> \\n> \\n> \\n\\n\\nEz egy jó kérdés ... Azt hiszem, hogy a kód rendben van. Gyakorlati szempontból, tudjuk használni a cégem minden nap, és mi fut elég széles skáláját dobozok, minden 2-8 magot. Természetesen YMMV stb, de úgy tűnik, hogy egy megbízható és alacsony rezsi (mert nem teszi vál- tani rendszer-space) módszer az időzítés.\\n\\n\\nÁltalában hogyan működik:\\n\\n\\n* állapítsa meg a blokk kódot kell szerelő (és illékony, ezért az optimalizáló hagyják egyedül).\\n* végrehajtja a CPUID utasítást. Amellett, hogy egyre néhány CPU információk (amelyek nem teszünk semmit) szinkronizálja a CPU végrehajtási puffert úgy hogy az időzítést nem befolyásolja out-of-order végrehajtás.\\n* végrehajtja a rdtsc (értsd timestamp) végrehajtását. Ez letölti száma gépi ciklus óta végrehajtott processzor alaphelyzetbe állt. Ez egy 64 bites érték, így a jelenlegi CPU sebességet akkor körülveszi minden 194 év múlva. Érdekes, hogy az eredeti Pentium referencia megjegyzik, hogy körbe minden 5800 évben.\\n* Az elmúlt pár sor tárolja az értékeket a regiszterek a változók hi és lo, és tegye, hogy a 64 bites visszatérési értéke.\\n\\n\\nKülönös megjegyzések:\\n\\n\\n* out-of-order végrehajtás okozhat hibás eredményeket, így végre a „CPUID” utasítás, amely azon túlmenően, hogy egy kis információt a processzor is szinkronizálja bármely out-of-order utasítás végrehajtását.\\n* A legtöbb operációs rendszer szinkronizálja a számlálók a CPU mikor indul el, így a válasz jó, hogy egy pár nano-másodperc.\\n* A téli álmot alvó megjegyzés valószínűleg igaz, de a gyakorlatban valószínűleg nem törődnek időzítések között hibernáció határokat.\\n* kapcsolatos SpeedStep: Újabb Intel CPU kompenzálja a sebesség változik, és visszatér egy beállított száma. Tettem egy gyors át néhány doboz a hálózatunkon, és már csak egy doboz, amely nem volt meg: a Pentium 3 fut néhány régi adatbázis szerver. (Ezek linux dobozokat, így egyeztettem: grep constant\\\\_tsc / proc / cpuinfo)\\n* Nem vagyok biztos abban, hogy a AMD CPU vagyunk elsősorban Intel bolt, bár tudom, hogy néhány alacsony szintű rendszerek guruk volt egy AMD értékelést.\\n\\n\\nRemélem ez kielégíti a kíváncsiságát, ez egy érdekes és (IMHO) keretében tanulmányozott programozás területén. Tudod, amikor Jeff és Joel volt szó, hogy egy programozó kell tudni C? Azt kiabált nekik, hogy „hé elfelejteni, hogy a magas szintű C dolgok ... szerelő, amit meg kell tanulni, ha azt szeretné tudni, hogy mi a számítógép csinál!”\\n\\n ',\n",
136
+ " 'name': '',\n",
137
+ " 'is_accepted': False},\n",
138
+ " {'text': '\\nA bor valóban használja gettimeofday (), hogy végre QueryPerformanceCounter (), és köztudott, hogy sok Windows játékok dolgozni Linux és Mac.\\n\\n\\nElindítja <http://source.winehq.org/source/dlls/kernel32/cpu.c#L312>\\n\\n\\nvezet <http://source.winehq.org/source/dlls/ntdll/time.c#L448>\\n\\n ',\n",
139
+ " 'name': '',\n",
140
+ " 'is_accepted': False},\n",
141
+ " {'text': '\\nReading a RDTSC nem megbízható az SMP rendszerek, mivel minden egyes CPU fenntartja saját számlálót és minden ellen nem garantált, hogy a szinkronizált a másikhoz képest CPU.\\n\\n\\nLehet, hogy azt sugallják, próbál **`clock_gettime(CLOCK_REALTIME)`**. A POSIX utasítás azt jelzi, hogy ez végre kell hajtani minden kompatibilis rendszereket. Ez olyan ns száma, de valószínűleg ellenőrizni fogja majd **`clock_getres(CLOCK_REALTIME)`**a rendszer, hogy mi a tényleges felbontás.\\n\\n ',\n",
142
+ " 'name': '',\n",
143
+ " 'is_accepted': False},\n",
144
+ " {'text': '\\nLehet, hogy érdekli a [Linux GYIK-`clock_gettime(CLOCK_REALTIME)`](http://juliusdavies.ca/posix_clocks/clock_realtime_linux_faq.html)\\n\\n ',\n",
145
+ " 'name': '',\n",
146
+ " 'is_accepted': False},\n",
147
+ " {'text': '\\n[Ez a válasz](https://stackoverflow.com/a/98/) említi problémák az óra beállítása közben. Mindkét problémákra garantálja kullancs egységek és a problémák az idő beállítása is megoldódnak C ++ 11 a `<chrono>`könyvtárban.\\n\\n\\nAz óra `std::chrono::steady_clock`garantáltan nem kell korrigálni, továbbá előre lép állandó sebességgel képest valós időben, így technológiák, mint a SpeedStep nem befolyásolja azt.\\n\\n\\nTudod kap typesafe egységek átalakításával az egyik `std::chrono::duration`szakterületek, például `std::chrono::microseconds`. Az ilyen típusú, nincs kétség, az egységek által használt kullancs értéket. Ugyanakkor szem előtt tartani, hogy az óra nem feltétlenül ezt az állásfoglalást. Ön tudja alakítani egy időtartamot attoseconds nélkül, hogy ténylegesen egy órát, hogy pontos.\\n\\n ',\n",
148
+ " 'name': '',\n",
149
+ " 'is_accepted': False}]]}"
150
+ ]
151
+ },
152
+ "execution_count": 21,
153
+ "metadata": {},
154
+ "output_type": "execute_result"
155
+ }
156
+ ],
157
+ "source": [
158
+ "cqa_hu['train'][100:102]"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 22,
164
+ "metadata": {},
165
+ "outputs": [
166
+ {
167
+ "data": {
168
+ "text/plain": [
169
+ "{'id': ['bbfce58894a1bb9140659cfbfe334fb6',\n",
170
+ " '3f98c644c947963c7990c047661fcfc5'],\n",
171
+ " 'text': ['', ''],\n",
172
+ " 'name': ['a rendszerpartíciót újratelepítés nélkül nagyobbá tehetem windows és programok?',\n",
173
+ " 'van-e ingyenes eszköz a c meghajtó nagyobbá tételéhez?'],\n",
174
+ " 'domain': ['hdd-tool.com', 'hdd-tool.com'],\n",
175
+ " 'bucket': ['2020.40', '2020.40'],\n",
176
+ " 'answers': [[{'text': 'igen, ez a cikk háromféle módszert mutat be 3féle eszköz segítségével e feladat elvégzéséhez.',\n",
177
+ " 'name': '',\n",
178
+ " 'is_accepted': True}],\n",
179
+ " [{'text': 'igen, niubi partition editor ingyenes kiadása van a windows 10/8/7/vista/xp otthoni felhasználók.',\n",
180
+ " 'name': '',\n",
181
+ " 'is_accepted': True}]]}"
182
+ ]
183
+ },
184
+ "execution_count": 22,
185
+ "metadata": {},
186
+ "output_type": "execute_result"
187
+ }
188
+ ],
189
+ "source": [
190
+ "faq_hu['train'][100:102]"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 5,
196
+ "metadata": {},
197
+ "outputs": [
198
+ {
199
+ "data": {
200
+ "text/plain": [
201
+ "171"
202
+ ]
203
+ },
204
+ "execution_count": 5,
205
+ "metadata": {},
206
+ "output_type": "execute_result"
207
+ }
208
+ ],
209
+ "source": [
210
+ "a = set(cqa_hu['train']['domain'])\n",
211
+ "len(a)"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 16,
217
+ "metadata": {},
218
+ "outputs": [
219
+ {
220
+ "data": {
221
+ "text/plain": [
222
+ "[('hotels.com', 321784),\n",
223
+ " ('travelminit.hu', 105216),\n",
224
+ " ('tripadvisor.co.hu', 84606),\n",
225
+ " ('travelminit.ro', 50327),\n",
226
+ " ('booking.com', 20315),\n",
227
+ " ('aszinonimaszotar.hu', 18896),\n",
228
+ " ('skyscanner.hu', 16717),\n",
229
+ " ('szallasvadasz.hu', 13759),\n",
230
+ " ('esky.hu', 12513),\n",
231
+ " ('travelminit.com', 12455),\n",
232
+ " ('pitchup.com', 9906),\n",
233
+ " ('kiwi.com', 9452),\n",
234
+ " ('languagecourse.net', 8284),\n",
235
+ " ('ekuponok.com', 7385),\n",
236
+ " ('rentalcargroup.com', 6980),\n",
237
+ " ('solvusoft.com', 6807),\n",
238
+ " ('flatio.hu', 5650),\n",
239
+ " ('haziallat.hu', 4255),\n",
240
+ " ('miapanasz.hu', 3814),\n",
241
+ " ('liveagent.hu', 3632)]"
242
+ ]
243
+ },
244
+ "execution_count": 16,
245
+ "metadata": {},
246
+ "output_type": "execute_result"
247
+ }
248
+ ],
249
+ "source": [
250
+ "from collections import Counter\n",
251
+ "\n",
252
+ "faq_domains = Counter(faq_hu['train']['domain'])\n",
253
+ "faq_domains.most_common(20)"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": null,
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": []
262
+ }
263
+ ],
264
+ "metadata": {
265
+ "interpreter": {
266
+ "hash": "02e357c7440d8ed11be29edfeecade50b9c6cce68ea0a63234d5a765afff05f4"
267
+ },
268
+ "kernelspec": {
269
+ "display_name": "Python 3.9.6 64-bit ('hf_venv': venv)",
270
+ "name": "python3"
271
+ },
272
+ "language_info": {
273
+ "codemirror_mode": {
274
+ "name": "ipython",
275
+ "version": 3
276
+ },
277
+ "file_extension": ".py",
278
+ "mimetype": "text/x-python",
279
+ "name": "python",
280
+ "nbconvert_exporter": "python",
281
+ "pygments_lexer": "ipython3",
282
+ "version": "3.9.6"
283
+ },
284
+ "orig_nbformat": 4
285
+ },
286
+ "nbformat": 4,
287
+ "nbformat_minor": 2
288
+ }
notebooks/mqa_test.ipynb ADDED
@@ -0,0 +1,409 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Using custom data configuration hu-faq-question-language=hu,scope=faq\n",
13
+ "Reusing dataset mqa (/Users/eend/.cache/huggingface/datasets/clips___mqa/hu-faq-question-language=hu,scope=faq/0.0.0/7eda4cdcbd6f009259fc516f204d776915a5f54ea2ad414c3dcddfaacd4dfe0b)\n",
14
+ "100%|██████████| 1/1 [00:00<00:00, 19.53it/s]\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "from datasets import load_dataset\n",
20
+ "\n",
21
+ "faq_hu = load_dataset(\"clips/mqa\", scope=\"faq\", language=\"hu\")"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 8,
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "text/plain": [
32
+ "{'id': 'a44ad85683f3d8afd1ffa42ce55fefcd',\n",
33
+ " 'text': '',\n",
34
+ " 'name': 'szingapúr területén mely kisállatbarát hotelek ideálisak a családok számára?',\n",
35
+ " 'domain': 'tripadvisor.co.hu',\n",
36
+ " 'bucket': '2020.29',\n",
37
+ " 'answers': [{'text': 'a(z) szingapúr területén nyaraló családok tapasztalatai szerint ezek igazán jó kisállatbarát hotelek: \\n**[intercontinental singapore](https://www.tripadvisor.co.hu/hotel_review-g294265-d299199-reviews-intercontinental_singapore-singapore.html?faqtqr=5&faqts=hotels&faqtt=214&faqtup=geo%3a294265%3bzfa%3a9&m=63287)** utazói osztályozás: 4.5/5 \\n**[fraser suites singapore](https://www.tripadvisor.co.hu/hotel_review-g294265-d306172-reviews-fraser_suites_singapore-singapore.html?faqtqr=5&faqts=hotels&faqtt=214&faqtup=geo%3a294265%3bzfa%3a9&m=63287)** utazói osztályozás: 4.5/5 \\n**[holiday inn express singapore katong](https://www.tripadvisor.co.hu/hotel_review-g294265-d8777586-reviews-holiday_inn_express_singapore_katong-singapore.html?faqtqr=5&faqts=hotels&faqtt=214&faqtup=geo%3a294265%3bzfa%3a9&m=63287)** utazói osztályozás: 4.0/5',\n",
38
+ " 'name': '',\n",
39
+ " 'is_accepted': True}]}"
40
+ ]
41
+ },
42
+ "execution_count": 8,
43
+ "metadata": {},
44
+ "output_type": "execute_result"
45
+ }
46
+ ],
47
+ "source": [
48
+ "faq_hu['train'][810000]"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 16,
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "data": {
58
+ "text/plain": [
59
+ "tensor([[ 1, 2, 2, 3, 4],\n",
60
+ " [ 2, 3, 4, 5, 7],\n",
61
+ " [ 2, 4, 4, 6, 8],\n",
62
+ " [ 4, 6, 8, 10, 14]])"
63
+ ]
64
+ },
65
+ "execution_count": 16,
66
+ "metadata": {},
67
+ "output_type": "execute_result"
68
+ }
69
+ ],
70
+ "source": [
71
+ "import torch\n",
72
+ "\n",
73
+ "a = torch.tensor([[1,2,2,3,4],[2,3,4,5,7]])\n",
74
+ "b = a * 2\n",
75
+ "\n",
76
+ "tensor_list = []\n",
77
+ "tensor_list.append(a)\n",
78
+ "tensor_list.append(b)\n",
79
+ "torch.cat((a,b),dim=0)"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 17,
85
+ "metadata": {},
86
+ "outputs": [
87
+ {
88
+ "data": {
89
+ "text/plain": [
90
+ "5"
91
+ ]
92
+ },
93
+ "execution_count": 17,
94
+ "metadata": {},
95
+ "output_type": "execute_result"
96
+ }
97
+ ],
98
+ "source": [
99
+ "a.size()[1]"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 18,
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "data": {
109
+ "text/plain": [
110
+ "tensor([[1, 2, 2, 3, 4],\n",
111
+ " [2, 3, 4, 5, 7]])"
112
+ ]
113
+ },
114
+ "execution_count": 18,
115
+ "metadata": {},
116
+ "output_type": "execute_result"
117
+ }
118
+ ],
119
+ "source": [
120
+ "a[:2]"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 24,
126
+ "metadata": {},
127
+ "outputs": [
128
+ {
129
+ "data": {
130
+ "text/plain": [
131
+ "[[1, 2, 2, 3, 4], [2, 3, 4, 5, 7]]"
132
+ ]
133
+ },
134
+ "execution_count": 24,
135
+ "metadata": {},
136
+ "output_type": "execute_result"
137
+ }
138
+ ],
139
+ "source": [
140
+ "a.tolist()"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 25,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "c = torch.empty([1,5])"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 26,
155
+ "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "data": {
159
+ "text/plain": [
160
+ "tensor([[1.4569e-19, 1.0658e-32, 1.1258e+24, 1.5789e-19, 1.1819e+22]])"
161
+ ]
162
+ },
163
+ "execution_count": 26,
164
+ "metadata": {},
165
+ "output_type": "execute_result"
166
+ }
167
+ ],
168
+ "source": [
169
+ "c"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 28,
175
+ "metadata": {},
176
+ "outputs": [
177
+ {
178
+ "data": {
179
+ "text/plain": [
180
+ "[1.4568973155122501e-19,\n",
181
+ " 1.0658291767562146e-32,\n",
182
+ " 1.1257918204515671e+24,\n",
183
+ " 1.5789373458898217e-19,\n",
184
+ " 1.1818655764620037e+22]"
185
+ ]
186
+ },
187
+ "execution_count": 28,
188
+ "metadata": {},
189
+ "output_type": "execute_result"
190
+ }
191
+ ],
192
+ "source": [
193
+ "c.squeeze().tolist()"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 32,
199
+ "metadata": {},
200
+ "outputs": [
201
+ {
202
+ "name": "stdout",
203
+ "output_type": "stream",
204
+ "text": [
205
+ "None\n"
206
+ ]
207
+ }
208
+ ],
209
+ "source": [
210
+ "a = [1,2,3]\n",
211
+ "b= [2,4,5]\n",
212
+ "print(a.extend(b))"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 33,
218
+ "metadata": {},
219
+ "outputs": [
220
+ {
221
+ "data": {
222
+ "text/plain": [
223
+ "[1, 2, 3, 2, 4, 5]"
224
+ ]
225
+ },
226
+ "execution_count": 33,
227
+ "metadata": {},
228
+ "output_type": "execute_result"
229
+ }
230
+ ],
231
+ "source": [
232
+ "\n"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 41,
238
+ "metadata": {},
239
+ "outputs": [
240
+ {
241
+ "name": "stdout",
242
+ "output_type": "stream",
243
+ "text": [
244
+ "\n",
245
+ "\n",
246
+ "Types of Question Answering\n",
247
+ "\n",
248
+ " - extractive question answering (encoder only models BERT)\n",
249
+ "\n",
250
+ " - posing questions about a document and identifying the answers as spans of text in the document itself\n",
251
+ "\n",
252
+ " - generative question answering (encoder-decoder T5/BART)\n",
253
+ "\n",
254
+ " - open ended questions, which need to synthesize information\n",
255
+ "\n",
256
+ " - retrieval based/community question answering \n",
257
+ "\n",
258
+ "\n",
259
+ "\n",
260
+ "\n",
261
+ "\n",
262
+ "\n",
263
+ "\n",
264
+ "First approach - translate dataset, fine-tune model\n",
265
+ "\n",
266
+ "!Not really feasible, because it needs lots of human evaluation for correctly determine answer start token\n",
267
+ "\n",
268
+ "\n",
269
+ "\n",
270
+ " 1. Translate English QA dataset into Hungarian\n",
271
+ "\n",
272
+ " - SQuAD - reading comprehension based on Wikipedia articles\n",
273
+ "\n",
274
+ " - ~ 100.000 question/answers\n",
275
+ "\n",
276
+ " 2. Fine-tune a model and evaluate on this dataset\n",
277
+ "\n",
278
+ "\n",
279
+ "\n",
280
+ "\n",
281
+ "\n",
282
+ "Second approach - fine-tune multilingual model\n",
283
+ "\n",
284
+ "!MQA format different than SQuAD, cannot use ModelForQuestionAnswering\n",
285
+ "\n",
286
+ "\n",
287
+ "\n",
288
+ " 1. Use a Hungarian dataset\n",
289
+ "\n",
290
+ " - MQA - multilingual parsed from Common Crawl\n",
291
+ "\n",
292
+ " - FAQ - 878.385 (2.415 domain)\n",
293
+ "\n",
294
+ " - CQA - 27.639 (171 domain)\n",
295
+ "\n",
296
+ " 2. Fine-tune and evaluate a model on this dataset\n",
297
+ "\n",
298
+ " \n",
299
+ "\n",
300
+ " \n",
301
+ "\n",
302
+ " Possible steps:\n",
303
+ "\n",
304
+ " - Use an existing pre-trained model in Hungarian/Romanian/or multilingual to generate embeddings\n",
305
+ "\n",
306
+ " - Select Model:\n",
307
+ "\n",
308
+ " - multilingual which includes hu:\n",
309
+ "\n",
310
+ " - distiluse-base-multilingual-cased-v2 (400MB)\n",
311
+ "\n",
312
+ " - paraphrase-multilingual-MiniLM-L12-v2 (400MB) - fastest\n",
313
+ "\n",
314
+ " - paraphrase-multilingual-mpnet-base-v2 (900MB) - best performing\n",
315
+ "\n",
316
+ " - hubert\n",
317
+ "\n",
318
+ " - Select a dataset\n",
319
+ "\n",
320
+ " - use MQA hungarian subset\n",
321
+ "\n",
322
+ " - use hungarian wikipedia pages data, split it up\n",
323
+ "\n",
324
+ " - DBpedia, shortened abstracts = 500.000\n",
325
+ "\n",
326
+ " - Pre-compute embeddings for all answers/paragraphs\n",
327
+ "\n",
328
+ " - Compute embedding for incoming query\n",
329
+ "\n",
330
+ " - Compare similarity between query embedding and precomputed \n",
331
+ "\n",
332
+ " - return top-3 answers/questions\n",
333
+ "\n",
334
+ " \n",
335
+ "\n",
336
+ " Alternative steps:\n",
337
+ "\n",
338
+ " - train a sentence transformer on the Hungarian / Romanian subsets\n",
339
+ "\n",
340
+ " - Use the trained sentence transformer to generate embeddings\n",
341
+ "\n"
342
+ ]
343
+ }
344
+ ],
345
+ "source": [
346
+ "with open('../approach.txt','r') as f:\n",
347
+ " line = 'init'\n",
348
+ " while line != '':\n",
349
+ " line=f.readline();\n",
350
+ " print(line)"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 42,
356
+ "metadata": {},
357
+ "outputs": [
358
+ {
359
+ "data": {
360
+ "text/plain": [
361
+ "tensor([1.4013e-45, 0.0000e+00, 2.8026e-45, 0.0000e+00, 2.8026e-45, 0.0000e+00,\n",
362
+ " 4.2039e-45, 0.0000e+00, 5.6052e-45, 0.0000e+00, 2.8026e-45, 0.0000e+00,\n",
363
+ " 4.2039e-45, 0.0000e+00, 5.6052e-45, 0.0000e+00, 7.0065e-45, 0.0000e+00,\n",
364
+ " 9.8091e-45, 0.0000e+00])"
365
+ ]
366
+ },
367
+ "execution_count": 42,
368
+ "metadata": {},
369
+ "output_type": "execute_result"
370
+ }
371
+ ],
372
+ "source": [
373
+ "d = torch.empty([20])\n",
374
+ "d"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "code",
379
+ "execution_count": null,
380
+ "metadata": {},
381
+ "outputs": [],
382
+ "source": []
383
+ }
384
+ ],
385
+ "metadata": {
386
+ "interpreter": {
387
+ "hash": "02e357c7440d8ed11be29edfeecade50b9c6cce68ea0a63234d5a765afff05f4"
388
+ },
389
+ "kernelspec": {
390
+ "display_name": "Python 3.9.6 64-bit ('hf_venv': venv)",
391
+ "name": "python3"
392
+ },
393
+ "language_info": {
394
+ "codemirror_mode": {
395
+ "name": "ipython",
396
+ "version": 3
397
+ },
398
+ "file_extension": ".py",
399
+ "mimetype": "text/x-python",
400
+ "name": "python",
401
+ "nbconvert_exporter": "python",
402
+ "pygments_lexer": "ipython3",
403
+ "version": "3.9.6"
404
+ },
405
+ "orig_nbformat": 4
406
+ },
407
+ "nbformat": 4,
408
+ "nbformat_minor": 2
409
+ }
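
The tensor scratch-work in the cells above rehearses the batching pattern used later in the embedding pipeline: embeddings are computed batch by batch and stitched together with torch.cat along dim=0. A minimal self-contained sketch of that accumulation pattern (the shapes are illustrative only; 384 is assumed here as the MiniLM embedding width):

import torch

def accumulate(batches):
    # Concatenate a stream of (batch_size, dim) tensors into one (N, dim) tensor.
    total = None
    for batch in batches:
        total = batch if total is None else torch.cat((total, batch), dim=0)
    return total

fake_batches = [torch.randn(5, 384) for _ in range(3)]  # illustrative data, not the real embeddings
print(accumulate(fake_batches).size())  # torch.Size([15, 384])

Collecting the batches in a list and calling torch.cat once at the end avoids the repeated reallocation this incremental version pays for.
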
requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ transformers
2
+ torch
3
+ sentence_transformers
requirements_full.txt ADDED
@@ -0,0 +1,133 @@
1
+ aiohttp==3.8.1
2
+ aiosignal==1.2.0
3
+ altair==4.1.0
4
+ appnope==0.1.2
5
+ argon2-cffi==21.1.0
6
+ arrow==1.2.1
7
+ astor==0.8.1
8
+ async-timeout==4.0.1
9
+ attrs==21.2.0
10
+ backcall==0.2.0
11
+ base58==2.1.1
12
+ binaryornot==0.4.4
13
+ bleach==4.1.0
14
+ blinker==1.4
15
+ cachetools==4.2.4
16
+ certifi==2021.10.8
17
+ cffi==1.15.0
18
+ chardet==4.0.0
19
+ charset-normalizer==2.0.7
20
+ click==7.1.2
21
+ cycler==0.11.0
22
+ datasets==1.15.1
23
+ debugpy==1.5.1
24
+ decorator==5.1.0
25
+ defusedxml==0.7.1
26
+ dill==0.3.4
27
+ entrypoints==0.3
28
+ filelock==3.3.2
29
+ fonttools==4.28.1
30
+ frozenlist==1.2.0
31
+ fsspec==2021.11.0
32
+ gitdb==4.0.9
33
+ GitPython==3.1.24
34
+ huggingface-hub==0.1.2
35
+ idna==3.3
36
+ ipykernel==6.5.0
37
+ ipython==7.29.0
38
+ ipython-genutils==0.2.0
39
+ ipywidgets==7.6.5
40
+ isodate==0.6.0
41
+ jedi==0.18.1
42
+ Jinja2==3.0.3
43
+ jinja2-time==0.2.0
44
+ joblib==1.1.0
45
+ jsonschema==4.2.1
46
+ jupyter==1.0.0
47
+ jupyter-client==7.0.6
48
+ jupyter-console==6.4.0
49
+ jupyter-core==4.9.1
50
+ jupyterlab-pygments==0.1.2
51
+ jupyterlab-widgets==1.0.2
52
+ kiwisolver==1.3.2
53
+ MarkupSafe==2.0.1
54
+ matplotlib==3.5.0
55
+ matplotlib-inline==0.1.3
56
+ mistune==0.8.4
57
+ multidict==5.2.0
58
+ multiprocess==0.70.12.2
59
+ nbclient==0.5.8
60
+ nbconvert==6.3.0
61
+ nbformat==5.1.3
62
+ nest-asyncio==1.5.1
63
+ nltk==3.6.5
64
+ notebook==6.4.6
65
+ numpy==1.21.4
66
+ packaging==21.2
67
+ pandas==1.3.4
68
+ pandocfilters==1.5.0
69
+ parso==0.8.2
70
+ pexpect==4.8.0
71
+ pickleshare==0.7.5
72
+ Pillow==8.4.0
73
+ plotly==5.4.0
74
+ poyo==0.5.0
75
+ prometheus-client==0.12.0
76
+ prompt-toolkit==3.0.22
77
+ protobuf==3.19.1
78
+ ptyprocess==0.7.0
79
+ pyarrow==6.0.0
80
+ pycparser==2.21
81
+ pydeck==0.7.1
82
+ Pygments==2.10.0
83
+ Pympler==0.9
84
+ pyparsing==2.4.7
85
+ pyrsistent==0.18.0
86
+ python-dateutil==2.8.2
87
+ python-slugify==5.0.2
88
+ pytz==2021.3
89
+ pytz-deprecation-shim==0.1.0.post0
90
+ PyYAML==6.0
91
+ pyzmq==22.3.0
92
+ qtconsole==5.2.0
93
+ QtPy==1.11.2
94
+ rdflib==6.0.2
95
+ regex==2021.11.10
96
+ requests==2.26.0
97
+ sacremoses==0.0.46
98
+ scikit-learn==1.0.1
99
+ scipy==1.7.2
100
+ seaborn==0.11.2
101
+ Send2Trash==1.8.0
102
+ sentence-transformers==2.1.0
103
+ sentencepiece==0.1.96
104
+ setuptools-scm==6.3.2
105
+ six==1.16.0
106
+ smmap==5.0.0
107
+ streamlit==1.2.0
108
+ tenacity==8.0.1
109
+ terminado==0.12.1
110
+ testpath==0.5.0
111
+ text-unidecode==1.3
112
+ threadpoolctl==3.0.0
113
+ tokenizers==0.10.3
114
+ toml==0.10.2
115
+ tomli==1.2.2
116
+ toolz==0.11.2
117
+ torch==1.10.0
118
+ torchaudio==0.10.0
119
+ torchvision==0.11.1
120
+ tornado==6.1
121
+ tqdm==4.62.3
122
+ traitlets==5.1.1
123
+ transformers==4.12.3
124
+ typing-extensions==3.10.0.2
125
+ tzdata==2021.5
126
+ tzlocal==4.1
127
+ urllib3==1.26.7
128
+ validators==0.18.2
129
+ wcwidth==0.2.5
130
+ webencodings==0.5.1
131
+ widgetsnbextension==3.5.2
132
+ xxhash==2.0.2
133
+ yarl==1.7.2
src/app.py ADDED
@@ -0,0 +1,60 @@
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import torch
4
+ from sentence_transformers import util
5
+
6
+ @st.cache
7
+ def load_raw_sentences(filename):
8
+ with open(filename) as f:
9
+ return f.readlines()
10
+
11
+ @st.cache
12
+ def load_embeddings(filename):
13
+     # torch.load takes the path directly; the extra open() was unnecessary
14
+     return torch.load(filename, map_location=torch.device('cpu'))
15
+
16
+ #Mean Pooling - Take attention mask into account for correct averaging
17
+ def mean_pooling(model_output, attention_mask):
18
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
19
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
20
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
21
+ sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
22
+ return sum_embeddings / sum_mask
23
+
24
+ def findTopKMostSimilar(query_embedding, embeddings, all_sentences, k):
25
+ cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)
26
+ cosine_scores_list = cosine_scores.squeeze().tolist()
27
+ pairs = []
28
+ for idx,score in enumerate(cosine_scores_list):
29
+ pairs.append({'index': idx, 'score': score, 'text': all_sentences[idx]})
30
+ pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
31
+ return pairs[0:k]
32
+
33
+ def calculateEmbeddings(sentences,tokenizer,model):
34
+ tokenized_sentences = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
35
+ with torch.no_grad():
36
+ model_output = model(**tokenized_sentences)
37
+ sentence_embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])
38
+ return sentence_embeddings
39
+
40
+
41
+ multilingual_checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
42
+ tokenizer = AutoTokenizer.from_pretrained(multilingual_checkpoint)
43
+ model = AutoModel.from_pretrained(multilingual_checkpoint)
44
+
45
+ raw_text_file = 'data/processed/shortened_abstracts_hu_2021_09_01.txt'
46
+ all_sentences = load_raw_sentences(raw_text_file)
47
+
48
+ embeddings_file = 'data/processed/shortened_abstracts_hu_2021_09_01_embedded.pt'
49
+ all_embeddings = load_embeddings(embeddings_file)
50
+
51
+
52
+ st.text('Search Wikipedia abstracts in Hungarian - Input some search term and see the top-5 most similar wikipedia abstracts')
53
+ st.text('Wikipedia absztrakt kereső - adjon meg egy tetszőleges kifejezést és a rendszer visszaadja az 5 hozzá legjobban hasonlító Wikipedia absztraktot')
54
+
55
+ input_query = st.text_area("Hol élnek a bengali tigrisek?")  # "Where do Bengal tigers live?"
56
+
57
+ if input_query:
58
+ query_embedding = calculateEmbeddings([input_query],tokenizer,model)
59
+ top_pairs = findTopKMostSimilar(query_embedding, all_embeddings, all_sentences, 5)
60
+ st.json(top_pairs)
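
The app is launched with `streamlit run src/app.py` from the repository root, so the relative data paths resolve. One caveat, offered as a suggestion rather than something shown in this commit: st.cache re-hashes cached return values on every rerun, which can be slow for a large embedding tensor; allow_output_mutation=True skips that check. A sketch of the cached loader with the flag:

import streamlit as st
import torch

@st.cache(allow_output_mutation=True)  # skip re-hashing the large tensor on each rerun
def load_embeddings(filename):
    return torch.load(filename, map_location=torch.device('cpu'))
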
src/data/dbpedia_dump_embeddings.py ADDED
@@ -0,0 +1,56 @@
1
+ from transformers import AutoTokenizer, AutoModel
2
+ from datetime import datetime
3
+ import torch
4
+ import pickle
5
+
6
+ #Mean Pooling - Take attention mask into account for correct averaging
7
+ def mean_pooling(model_output, attention_mask):
8
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
9
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
10
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
11
+ sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
12
+ return sum_embeddings / sum_mask
13
+
14
+ def calculateEmbeddings(sentences,tokenizer,model):
15
+ tokenized_sentences = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
16
+ with torch.no_grad():
17
+ model_output = model(**tokenized_sentences)
18
+ sentence_embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])
19
+ return sentence_embeddings
20
+
21
+
22
+ def saveEmbeddingsToDisc(embeddings, filename):  # renamed: was shadowed by saveToDisc(sentences, ...) below
23
+ with open(filename, "ab") as f:
24
+ pickle.dump(embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)
25
+
26
+ def saveToDisc(sentences, embeddings, filename):
27
+ with open(filename, "ab") as f:
28
+ pickle.dump({'sentences': sentences, 'embeddings': embeddings}, f, protocol=pickle.HIGHEST_PROTOCOL)
29
+
30
+ dt = datetime.now()
31
+ datetime_formatted = dt.strftime('%Y-%m-%d_%H:%M:%S')
32
+ batch_size = 1000
33
+
34
+ input_text_file = 'data/processed/shortened_abstracts_hu_2021_09_01.txt'
35
+ output_embeddings_file = f'data/processed/embeddings_{batch_size}_batches_at_{datetime_formatted}.pkl'
36
+
37
+ multilingual_checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
38
+ tokenizer = AutoTokenizer.from_pretrained(multilingual_checkpoint)
39
+ model = AutoModel.from_pretrained(multilingual_checkpoint)
40
+
41
+
42
+ total_read = 0
43
+ total_read_limit = 3 * batch_size
44
+ with open(input_text_file) as f:
45
+ while total_read < total_read_limit:
46
+ count = 0
47
+ sentences = []
48
+ line = 'init'
49
+ while line and count < batch_size:
50
+             line = f.readline()
51
+             if line:  # readline() returns '' at EOF; don't embed empty strings
+                 sentences.append(line)
+                 count += 1
52
53
+
+         if not sentences:  # EOF reached before the read limit
+             break
54
+ sentence_embeddings = calculateEmbeddings(sentences,tokenizer,model)
55
+         saveToDisc(sentences, sentence_embeddings, output_embeddings_file)
56
+ total_read += count
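
Because saveToDisc appends one pickle record per batch, reading the dump back means calling pickle.load repeatedly until EOFError. A sketch of that read-back, together with one plausible way (an assumption, not shown in this commit) the single shortened_abstracts_hu_2021_09_01_embedded.pt tensor could be assembled from the batch records:

import pickle
import torch

def load_all_batches(filename):
    # Read every {'sentences': ..., 'embeddings': ...} record appended by saveToDisc.
    records = []
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))
            except EOFError:
                break
    return records

batch_file = 'data/processed/embeddings_1000_batches_at_<timestamp>.pkl'  # hypothetical path; the real name carries the run timestamp
records = load_all_batches(batch_file)
all_embeddings = torch.cat([r['embeddings'] for r in records], dim=0)
torch.save(all_embeddings, 'data/processed/shortened_abstracts_hu_2021_09_01_embedded.pt')
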
src/data/dbpedia_dump_wiki_text.py ADDED
@@ -0,0 +1,18 @@
1
+ from rdflib import Graph
2
+
3
+ # Downloaded from https://databus.dbpedia.org/dbpedia/text/short-abstracts
4
+ raw_data_path = 'data/raw/short-abstracts_lang=hu.ttl'
5
+ processed_data_path = 'data/processed/shortened_abstracts_hu_2021_09_01.txt'
6
+
7
+ g = Graph()
8
+ g.parse(raw_data_path, format='turtle')
9
+
10
+ # 'objects' collects one cleaned abstract plus a newline marker per triple
11
+ objects = []
12
+ with open(processed_data_path, 'w') as f:
13
+ print(len(g))
14
+     for subject, predicate, obj in g:  # 'obj' avoids shadowing the builtin 'object'
15
+         objects.append(obj.replace(' +/-','').replace('\n',' '))
16
+         objects.append('\n')
17
18
+ f.writelines(objects)
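
Since the dump writes one cleaned abstract plus one newline per triple, a quick sanity check (a sketch, assuming the file fits in memory) is to compare the output's line count against the len(g) printed while dumping:

with open('data/processed/shortened_abstracts_hu_2021_09_01.txt') as f:
    lines = f.readlines()
print(len(lines))     # should match the triple count printed by the dump script
print(lines[0][:80])  # peek at the first abstract
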
src/exploration/automodel_test.py ADDED
@@ -0,0 +1,23 @@
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ import torch
3
+
4
+ checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
5
+
6
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
7
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
8
+
9
+ raw_inputs = ["This is the second best day of my life.", "Are you freaking kidding me right now?"]
10
+
11
+ tokens = tokenizer(raw_inputs, padding=True, return_tensors="pt")
12
+ print(tokens)
13
+
14
+ raw_outputs = model(**tokens)
15
+ print(raw_outputs.logits)
16
+
17
+ predictions = torch.nn.functional.softmax(raw_outputs.logits, dim=-1)
18
+ print(predictions)
19
+
20
+ # max value, index of max value, and corresponding label
21
+ labels = model.config.id2label
22
+ max_value_index = [(torch.max(p), torch.argmax(p)) for p in predictions]
23
+ [print("{:.5f}".format(e[0].item()),labels[e[1].item()]) for e in max_value_index]
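
The three manual steps above (tokenize, take the logits, then softmax + argmax + id2label) are what the high-level pipeline API wraps. A sketch of the equivalent call:

from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
print(classifier(["This is the second best day of my life.",
                  "Are you freaking kidding me right now?"]))
# each result is a dict of the form {'label': ..., 'score': ...}
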
src/exploration/datetime_test.py ADDED
@@ -0,0 +1,10 @@
1
+ from datetime import datetime
2
+
3
+ dt = datetime.now()
4
+ print(dt)
5
+ print(dt.strftime('%a %d-%m-%Y'))
6
+ print(dt.strftime('%a %d/%m/%Y'))
7
+ print(dt.strftime('%a %d/%m/%y'))
8
+ print(dt.strftime('%A %d-%m-%Y, %H:%M:%S'))
9
+ print(dt.strftime('%X %x'))
10
+ print(dt.strftime('%Y-%m-%d_%H:%M:%S'))
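
One practical note: the last format is the one used for the embedding dump filenames, and it contains ':' characters, which are fine on Linux but invalid in Windows filenames. A filename-safe variant, as a suggestion rather than what the scripts currently use:

from datetime import datetime

dt = datetime.now()
print(dt.strftime('%Y-%m-%d_%H-%M-%S'))  # e.g. 2021-11-20_14-30-05; safe on all platforms
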
src/exploration/mqa_test.py ADDED
@@ -0,0 +1,9 @@
1
+ from datasets import load_dataset
2
+
3
+ faq_hu = load_dataset("clips/mqa", scope="faq", language="hu")
4
+ cqa_hu = load_dataset("clips/mqa", scope="cqa", language="hu")
5
+
6
+ print(faq_hu)
7
+ print(cqa_hu)
8
+ print(faq_hu['train'][:5])
9
+ print(cqa_hu['train'][:5])
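
Judging from the record printed in the notebook above, each FAQ example carries an 'answers' list whose items include 'text' and 'is_accepted' fields. A sketch that keeps only the accepted answers (field names taken from that printed record and assumed stable across the split):

from datasets import load_dataset

faq_hu = load_dataset("clips/mqa", scope="faq", language="hu")

def accepted_answers(example):
    # 'answers', 'text' and 'is_accepted' as seen in the printed record
    return [a['text'] for a in example['answers'] if a['is_accepted']]

print(accepted_answers(faq_hu['train'][810000])[:1])
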
src/exploration/pipeline_test.py ADDED
@@ -0,0 +1,16 @@
1
+ from transformers import pipeline
2
+
3
+ generator = pipeline("text-generation", model="distilgpt2")
4
+ res = lambda: generator("My girlfriend told me that I have a huge", max_length=40)  # zero-arg lambda so the call is easy to re-run
5
+ print(res())
6
+
7
+ top_k = 10
8
+ maskfiller = pipeline("fill-mask", model="distilbert-base-uncased")
9
+ hu_res = lambda: maskfiller("Hungarians are a very [MASK] nation.", top_k=top_k)
10
+ ju_res = lambda: maskfiller("Jews are a very [MASK] nation.", top_k=top_k)
11
+ it_res = lambda: maskfiller("Italians are a very [MASK] nation.", top_k=top_k)
12
+
13
+ token_str = lambda x: [e["token_str"] for e in x]
14
+ print(token_str(hu_res()))
15
+ print(token_str(ju_res()))
16
+ print(token_str(it_res()))
src/exploration/serialize_test.py ADDED
@@ -0,0 +1,30 @@
1
+ import torch
2
+ import pickle
3
+ '''
4
+ a = [1,2,3]
5
+ b = [4,5,6]
6
+ at = torch.tensor([a,a])
7
+ bt = torch.tensor([b,b])
8
+
9
+ with open('serialize_test.pkl', "ab") as f:
10
+ pickle.dump(at,f)
11
+ pickle.dump(bt,f)
12
+
13
+ with open('serialize_test.pkl', "rb") as f:
14
+ print(pickle.load(f))
15
+ print(pickle.load(f))
16
+ '''
17
+
18
+ def loadFromDiskRaw(batch_number, filename='embeddings.pkl'):  # prints the first batch_number pickled records and returns the last one
19
+ count = 0
20
+ with open(filename, "rb") as f:
21
+ while count < batch_number:
22
+ stored_data = pickle.load(f)
23
+ print(stored_data.size())
24
+ print(stored_data[0][:15])
25
+ count += 1
26
+ return stored_data
27
+
28
+ output_embeddings_file = 'data/processed/DBpedia_shortened_abstracts_hu_embeddings.pkl'
29
+ loadFromDiskRaw(3, output_embeddings_file)
30
+
src/features/semantic_retreiver.py ADDED
@@ -0,0 +1,130 @@
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+ import pickle
4
+ from sentence_transformers import util
5
+ from datetime import datetime
6
+
7
+ #Mean Pooling - Take attention mask into account for correct averaging
8
+ def mean_pooling(model_output, attention_mask):
9
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
10
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
11
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
12
+ sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
13
+ return sum_embeddings / sum_mask
14
+
15
+
16
+ dt = datetime.now()
17
+ datetime_formatted = dt.strftime('%Y-%m-%d_%H:%M:%S')
18
+ batch_size = 1000
19
+ output_embeddings_file = f'data/processed/embeddings_{batch_size}_batches_at_{datetime_formatted}.pkl'
20
+ def saveToDiscDefault(embeddings):  # renamed: was shadowed by saveToDisc(sentences, ...) below
21
+ with open(output_embeddings_file, "ab") as f:
22
+ pickle.dump(embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)
23
+
24
+
25
+ def saveToDisc(sentences, embeddings, filename='embeddings.pkl'):
26
+ with open(filename, "ab") as f:
27
+ pickle.dump({'sentences': sentences, 'embeddings': embeddings}, f, protocol=pickle.HIGHEST_PROTOCOL)
28
+
29
+ def saveToDiscRaw(embeddings, filename='embeddings.pkl'):
30
+ with open(filename, "ab") as f:
31
+ pickle.dump(embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)
32
+ #for emb in embeddings:
33
+ # torch.save(emb,f)
34
+
35
+ def loadFromDiskRaw(filename='embeddings.pkl'):
36
+ with open(filename, "rb") as f:
37
+ stored_data = pickle.load(f)
38
+ return stored_data
39
+
40
+ def loadFromDisk(filename='embeddings.pkl'):
41
+ with open(filename, "rb") as f:
42
+ stored_data = pickle.load(f)
43
+ stored_sentences = stored_data['sentences']
44
+ stored_embeddings = stored_data['embeddings']
45
+ return stored_sentences, stored_embeddings
46
+
47
+ def findTopKMostSimilarPairs(embeddings, k):
48
+ cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
49
+ pairs = []
50
+ for i in range(len(cosine_scores)-1):
51
+ for j in range(i+1, len(cosine_scores)):
52
+ pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})
53
+
54
+ pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
55
+ return pairs[0:k]
56
+
57
+ def findTopKMostSimilar(query_embedding, embeddings, k):
58
+ cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)
59
+ cosine_scores_list = cosine_scores.squeeze().tolist()
60
+ pairs = []
61
+ for idx,score in enumerate(cosine_scores_list):
62
+ pairs.append({'index': idx, 'score': score})
63
+ pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
64
+ return pairs[0:k]
65
+
66
+
67
+ def calculateEmbeddings(sentences,tokenizer,model):
68
+ tokenized_sentences = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
69
+ with torch.no_grad():
70
+ model_output = model(**tokenized_sentences)
71
+ sentence_embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])
72
+ return sentence_embeddings
73
+
74
+ multilingual_checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
75
+ tokenizer = AutoTokenizer.from_pretrained(multilingual_checkpoint)
76
+ model = AutoModel.from_pretrained(multilingual_checkpoint)
77
+
78
+ raw_text_file = 'data/processed/shortened_abstracts_hu_2021_09_01.txt'
79
+
80
+
81
+ concated_sentence_embeddings = None
82
+ all_sentences = []
83
+
84
+ print(datetime.now())
85
+ batch_size = 5
86
+ line = 'init'
87
+ total_read = 0
88
+ total_read_limit = 120
89
+ skip_index = 100
90
+ with open(raw_text_file) as f:
91
+ while line and total_read < total_read_limit:
92
+ count = 0
93
+ sentence_batch = []
94
+ while line and count < batch_size:
95
+             line = f.readline()
96
+             if line:  # readline() returns '' at EOF
+                 sentence_batch.append(line)
+                 count += 1
97
98
+
99
+ all_sentences.extend(sentence_batch)
100
+
101
+         if total_read >= skip_index and sentence_batch:  # guard against an empty final batch
102
+ sentence_embeddings = calculateEmbeddings(sentence_batch,tokenizer,model)
103
+             if concated_sentence_embeddings is None:
104
+ concated_sentence_embeddings = sentence_embeddings
105
+ else:
106
+ concated_sentence_embeddings = torch.cat([concated_sentence_embeddings, sentence_embeddings], dim=0)
107
+ print(concated_sentence_embeddings.size())
108
+ #saveToDiscRaw(sentence_embeddings)
109
+
110
+ total_read += count
111
+         if total_read % 5 == 0:
112
+ print(f'total_read:{total_read}')
113
+ print(datetime.now())
114
+
115
+
116
+ query_embedding = calculateEmbeddings(['Melyik a legnépesebb város a világon?'], tokenizer, model)  # "Which is the most populous city in the world?"
117
+ top_pairs = findTopKMostSimilar(query_embedding, concated_sentence_embeddings, 5)
118
+
119
+ for pair in top_pairs:
120
+ i = pair['index']
121
+ score = pair['score']
122
+ print("{} \t\t Score: {:.4f}".format(all_sentences[skip_index+i], score))
123
+ '''
124
+ query = ''
125
+ while query != 'exit':
126
+ query = input("Enter your query: ")
127
+ query_embedding = calculateEmbeddings([query],tokenizer,model)
128
+
129
+
130
+ '''
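
findTopKMostSimilar builds and sorts a Python list over every score; torch.topk performs the same top-k selection directly on the score tensor. An alternative sketch, not the method the file uses:

import torch
from sentence_transformers import util

def find_top_k(query_embedding, embeddings, k):
    # Cosine scores of the single query against every stored embedding.
    scores = util.pytorch_cos_sim(query_embedding, embeddings).squeeze()
    values, indices = torch.topk(scores, k)
    return [{'index': i.item(), 'score': v.item()} for v, i in zip(values, indices)]
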
src/main_qa.py ADDED
@@ -0,0 +1,51 @@
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+ from sentence_transformers import util
4
+
5
+ def load_raw_sentences(filename):
6
+ with open(filename) as f:
7
+ return f.readlines()
8
+
9
+ #Mean Pooling - Take attention mask into account for correct averaging
10
+ def mean_pooling(model_output, attention_mask):
11
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
12
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
13
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
14
+ sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
15
+ return sum_embeddings / sum_mask
16
+
17
+ def findTopKMostSimilar(query_embedding, embeddings, k):
18
+ cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)
19
+ cosine_scores_list = cosine_scores.squeeze().tolist()
20
+ pairs = []
21
+ for idx,score in enumerate(cosine_scores_list):
22
+ pairs.append({'index': idx, 'score': score})
23
+ pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
24
+ return pairs[0:k]
25
+
26
+ def calculateEmbeddings(sentences,tokenizer,model):
27
+ tokenized_sentences = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
28
+ with torch.no_grad():
29
+ model_output = model(**tokenized_sentences)
30
+ sentence_embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])
31
+ return sentence_embeddings
32
+
33
+ multilingual_checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
34
+ tokenizer = AutoTokenizer.from_pretrained(multilingual_checkpoint)
35
+ model = AutoModel.from_pretrained(multilingual_checkpoint)
36
+
37
+ raw_text_file = 'data/processed/shortened_abstracts_hu_2021_09_01.txt'
38
+ embeddings_file = 'data/processed/shortened_abstracts_hu_2021_09_01_embedded.pt'
39
+
40
+ all_sentences = load_raw_sentences(raw_text_file)
41
+ all_embeddings = torch.load(embeddings_file,map_location=torch.device('cpu') )
42
+
43
+ query = ''
44
+ while query != 'exit':
45
+ query = input("Enter your query: ")
46
+ query_embedding = calculateEmbeddings([query],tokenizer,model)
47
+ top_pairs = findTopKMostSimilar(query_embedding, all_embeddings, 5)
48
+ for pair in top_pairs:
49
+ i = pair['index']
50
+ score = pair['score']
51
+ print("{} \t\t Score: {:.4f}".format(all_sentences[i], score))