Spaces:

Endre
/

SemanticSearch-HU

Runtime error

File size: 10,426 Bytes
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration hu-faq-question-language=hu,scope=faq\n",
      "Reusing dataset mqa (/Users/eend/.cache/huggingface/datasets/clips___mqa/hu-faq-question-language=hu,scope=faq/0.0.0/7eda4cdcbd6f009259fc516f204d776915a5f54ea2ad414c3dcddfaacd4dfe0b)\n",
      "100%|██████████| 1/1 [00:00<00:00, 19.53it/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "faq_hu = load_dataset(\"clips/mqa\", scope=\"faq\", language=\"hu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 'a44ad85683f3d8afd1ffa42ce55fefcd',\n",
       " 'text': '',\n",
       " 'name': 'szingapúr területén mely kisállatbarát hotelek ideálisak a családok számára?',\n",
       " 'domain': 'tripadvisor.co.hu',\n",
       " 'bucket': '2020.29',\n",
       " 'answers': [{'text': 'a(z) szingapúr területén nyaraló családok tapasztalatai szerint ezek igazán jó kisállatbarát hotelek:  \\n**[intercontinental singapore](https://www.tripadvisor.co.hu/hotel_review-g294265-d299199-reviews-intercontinental_singapore-singapore.html?faqtqr=5&faqts=hotels&faqtt=214&faqtup=geo%3a294265%3bzfa%3a9&m=63287)** utazói osztályozás: 4.5/5  \\n**[fraser suites singapore](https://www.tripadvisor.co.hu/hotel_review-g294265-d306172-reviews-fraser_suites_singapore-singapore.html?faqtqr=5&faqts=hotels&faqtt=214&faqtup=geo%3a294265%3bzfa%3a9&m=63287)** utazói osztályozás: 4.5/5  \\n**[holiday inn express singapore katong](https://www.tripadvisor.co.hu/hotel_review-g294265-d8777586-reviews-holiday_inn_express_singapore_katong-singapore.html?faqtqr=5&faqts=hotels&faqtt=214&faqtup=geo%3a294265%3bzfa%3a9&m=63287)** utazói osztályozás: 4.0/5',\n",
       "   'name': '',\n",
       "   'is_accepted': True}]}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "faq_hu['train'][810000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[ 1,  2,  2,  3,  4],\n",
       "        [ 2,  3,  4,  5,  7],\n",
       "        [ 2,  4,  4,  6,  8],\n",
       "        [ 4,  6,  8, 10, 14]])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "a = torch.tensor([[1,2,2,3,4],[2,3,4,5,7]])\n",
    "b = a * 2\n",
    "\n",
    "tensor_list = []\n",
    "tensor_list.append(a)\n",
    "tensor_list.append(b)\n",
    "torch.cat((a,b),dim=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.size()[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[1, 2, 2, 3, 4],\n",
       "        [2, 3, 4, 5, 7]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[1, 2, 2, 3, 4], [2, 3, 4, 5, 7]]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = torch.empty([1,5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[1.4569e-19, 1.0658e-32, 1.1258e+24, 1.5789e-19, 1.1819e+22]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1.4568973155122501e-19,\n",
       " 1.0658291767562146e-32,\n",
       " 1.1257918204515671e+24,\n",
       " 1.5789373458898217e-19,\n",
       " 1.1818655764620037e+22]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.squeeze().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n"
     ]
    }
   ],
   "source": [
    "a = [1,2,3]\n",
    "b= [2,4,5]\n",
    "print(a.extend(b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 2, 3, 2, 4, 5]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Types of Question Answering\n",
      "\n",
      "    - extractive question answering (encoder only models BERT)\n",
      "\n",
      "        - posing questions about a document and identifying the answers as spans of text in the document itself\n",
      "\n",
      "    - generative question answering (encoder-decoder T5/BART)\n",
      "\n",
      "        - open ended questions, which need to synthesize information\n",
      "\n",
      "    - retrieval based/community question answering \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "First approach - translate dataset, fine-tune model\n",
      "\n",
      "!Not really feasible, because it needs lots of human evaluation for correctly determine answer start token\n",
      "\n",
      "\n",
      "\n",
      "    1. Translate English QA dataset into Hungarian\n",
      "\n",
      "        - SQuAD - reading comprehension based on Wikipedia articles\n",
      "\n",
      "        - ~ 100.000 question/answers\n",
      "\n",
      "    2. Fine-tune a model and evaluate on this dataset\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "Second approach - fine-tune multilingual model\n",
      "\n",
      "!MQA format different than SQuAD, cannot use ModelForQuestionAnswering\n",
      "\n",
      "\n",
      "\n",
      "    1. Use a Hungarian dataset\n",
      "\n",
      "        - MQA - multilingual parsed from Common Crawl\n",
      "\n",
      "            - FAQ - 878.385 (2.415 domain)\n",
      "\n",
      "            - CQA - 27.639 (171 domain)\n",
      "\n",
      "    2. Fine-tune and evaluate a model on this dataset\n",
      "\n",
      "        \n",
      "\n",
      "        \n",
      "\n",
      "    Possible steps:\n",
      "\n",
      "        - Use an existing pre-trained model in Hungarian/Romanian/or multilingual to generate embeddings\n",
      "\n",
      "            - Select Model:\n",
      "\n",
      "                - multilingual which includes hu:\n",
      "\n",
      "                    - distiluse-base-multilingual-cased-v2 (400MB)\n",
      "\n",
      "                    - paraphrase-multilingual-MiniLM-L12-v2 (400MB) - fastest\n",
      "\n",
      "                    - paraphrase-multilingual-mpnet-base-v2 (900MB) - best performing\n",
      "\n",
      "                - hubert\n",
      "\n",
      "        - Select a dataset\n",
      "\n",
      "            - use MQA hungarian subset\n",
      "\n",
      "            - use hungarian wikipedia pages data, split it up\n",
      "\n",
      "                - DBpedia, shortened abstracts = 500.000\n",
      "\n",
      "        - Pre-compute embeddings for all answers/paragraphs\n",
      "\n",
      "        - Compute embedding for incoming query\n",
      "\n",
      "            - Compare similarity between query embedding and precomputed \n",
      "\n",
      "            - return top-3 answers/questions\n",
      "\n",
      "    \n",
      "\n",
      "    Alternative steps:\n",
      "\n",
      "        - train a sentence transformer on the Hungarian / Romanian subsets\n",
      "\n",
      "        - Use the trained sentence transformer to generate embeddings\n",
      "\n"
     ]
    }
   ],
   "source": [
    "with open('../approach.txt','r') as f:\n",
    "    line = 'init'\n",
    "    while line != '':\n",
    "        line=f.readline();\n",
    "        print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([1.4013e-45, 0.0000e+00, 2.8026e-45, 0.0000e+00, 2.8026e-45, 0.0000e+00,\n",
       "        4.2039e-45, 0.0000e+00, 5.6052e-45, 0.0000e+00, 2.8026e-45, 0.0000e+00,\n",
       "        4.2039e-45, 0.0000e+00, 5.6052e-45, 0.0000e+00, 7.0065e-45, 0.0000e+00,\n",
       "        9.8091e-45, 0.0000e+00])"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = torch.empty([20])\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "02e357c7440d8ed11be29edfeecade50b9c6cce68ea0a63234d5a765afff05f4"
  },
  "kernelspec": {
   "display_name": "Python 3.9.6 64-bit ('hf_venv': venv)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}