{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 82841986 is_char and is_digit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 82075350 regrex non-ascii and none-digit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 86460763 left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import re\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_length = 25\n",
    "min_length = 1\n",
    "root = '../data'\n",
    "charset = 'abcdefghijklmnopqrstuvwxyz'\n",
    "digits = '0123456789'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_char(text, ratio=0.5):\n",
    "    text = text.lower()\n",
    "    length = max(len(text), 1)\n",
    "    char_num = sum([t in charset for t in text])\n",
    "    if char_num < min_length: return False\n",
    "    if char_num / length < ratio: return False\n",
    "    return True\n",
    "\n",
    "def is_digit(text, ratio=0.5):\n",
    "    length = max(len(text), 1)\n",
    "    digit_num = sum([t in digits for t in text])\n",
    "    if digit_num / length < ratio: return False\n",
    "    return True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# generate training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/tmp/wikitext-103/wiki.train.tokens', 'r') as file:\n",
    "    lines = file.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "inp, gt = [], []\n",
    "for line in lines:\n",
    "    token = line.lower().split()\n",
    "    for text in token:\n",
    "        text = re.sub('[^0-9a-zA-Z]+', '', text)\n",
    "        if len(text) < min_length:\n",
    "            # print('short-text', text)\n",
    "            continue\n",
    "        if len(text) > max_length:\n",
    "            # print('long-text', text)\n",
    "            continue\n",
    "        inp.append(text)\n",
    "        gt.append(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_voc = os.path.join(root, 'WikiText-103.csv')\n",
    "pd.DataFrame({'inp':inp, 'gt':gt}).to_csv(train_voc, index=None, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "86460763"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(inp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['valkyria',\n",
       " 'chronicles',\n",
       " 'iii',\n",
       " 'senj',\n",
       " 'no',\n",
       " 'valkyria',\n",
       " '3',\n",
       " 'unk',\n",
       " 'chronicles',\n",
       " 'japanese',\n",
       " '3',\n",
       " 'lit',\n",
       " 'valkyria',\n",
       " 'of',\n",
       " 'the',\n",
       " 'battlefield',\n",
       " '3',\n",
       " 'commonly',\n",
       " 'referred',\n",
       " 'to',\n",
       " 'as',\n",
       " 'valkyria',\n",
       " 'chronicles',\n",
       " 'iii',\n",
       " 'outside',\n",
       " 'japan',\n",
       " 'is',\n",
       " 'a',\n",
       " 'tactical',\n",
       " 'role',\n",
       " 'playing',\n",
       " 'video',\n",
       " 'game',\n",
       " 'developed',\n",
       " 'by',\n",
       " 'sega',\n",
       " 'and',\n",
       " 'mediavision',\n",
       " 'for',\n",
       " 'the',\n",
       " 'playstation',\n",
       " 'portable',\n",
       " 'released',\n",
       " 'in',\n",
       " 'january',\n",
       " '2011',\n",
       " 'in',\n",
       " 'japan',\n",
       " 'it',\n",
       " 'is',\n",
       " 'the',\n",
       " 'third',\n",
       " 'game',\n",
       " 'in',\n",
       " 'the',\n",
       " 'valkyria',\n",
       " 'series',\n",
       " 'employing',\n",
       " 'the',\n",
       " 'same',\n",
       " 'fusion',\n",
       " 'of',\n",
       " 'tactical',\n",
       " 'and',\n",
       " 'real',\n",
       " 'time',\n",
       " 'gameplay',\n",
       " 'as',\n",
       " 'its',\n",
       " 'predecessors',\n",
       " 'the',\n",
       " 'story',\n",
       " 'runs',\n",
       " 'parallel',\n",
       " 'to',\n",
       " 'the',\n",
       " 'first',\n",
       " 'game',\n",
       " 'and',\n",
       " 'follows',\n",
       " 'the',\n",
       " 'nameless',\n",
       " 'a',\n",
       " 'penal',\n",
       " 'military',\n",
       " 'unit',\n",
       " 'serving',\n",
       " 'the',\n",
       " 'nation',\n",
       " 'of',\n",
       " 'gallia',\n",
       " 'during',\n",
       " 'the',\n",
       " 'second',\n",
       " 'europan',\n",
       " 'war',\n",
       " 'who',\n",
       " 'perform',\n",
       " 'secret',\n",
       " 'black']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inp[:100]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# generate evaluation dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def disturb(word, degree, p=0.3):\n",
    "    if len(word) // 2 < degree: return word\n",
    "    if is_digit(word): return word\n",
    "    if random.random() < p: return word\n",
    "    else:\n",
    "        index = list(range(len(word)))\n",
    "        random.shuffle(index)\n",
    "        index = index[:degree]\n",
    "        new_word = []\n",
    "        for i in range(len(word)):\n",
    "            if i not in index: \n",
    "                new_word.append(word[i])\n",
    "                continue\n",
    "            if (word[i] not in charset) and (word[i] not in digits):\n",
    "                # special token\n",
    "                new_word.append(word[i])\n",
    "                continue\n",
    "            op = random.random()\n",
    "            if op < 0.1: # add\n",
    "                new_word.append(random.choice(charset))\n",
    "                new_word.append(word[i])\n",
    "            elif op < 0.2: continue  # remove\n",
    "            else: new_word.append(random.choice(charset))  # replace\n",
    "        return ''.join(new_word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "lines = inp\n",
    "degree = 1\n",
    "keep_num = 50000\n",
    "\n",
    "random.shuffle(lines)\n",
    "part_lines = lines[:keep_num]\n",
    "inp, gt = [], []\n",
    "\n",
    "for w in part_lines:\n",
    "    w = w.strip().lower()\n",
    "    new_w = disturb(w, degree)\n",
    "    inp.append(new_w)\n",
    "    gt.append(w)\n",
    "    \n",
    "eval_voc = os.path.join(root, f'WikiText-103_eval_d{degree}.csv')\n",
    "pd.DataFrame({'inp':inp, 'gt':gt}).to_csv(eval_voc, index=None, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('high', 'high'),\n",
       " ('vctoria', 'victoria'),\n",
       " ('mains', 'mains'),\n",
       " ('bi', 'by'),\n",
       " ('13', '13'),\n",
       " ('ticnet', 'ticket'),\n",
       " ('basil', 'basic'),\n",
       " ('cut', 'cut'),\n",
       " ('aqarky', 'anarky'),\n",
       " ('the', 'the'),\n",
       " ('tqe', 'the'),\n",
       " ('oc', 'of'),\n",
       " ('diwpersal', 'dispersal'),\n",
       " ('traffic', 'traffic'),\n",
       " ('in', 'in'),\n",
       " ('the', 'the'),\n",
       " ('ti', 'to'),\n",
       " ('professionalms', 'professionals'),\n",
       " ('747', '747'),\n",
       " ('in', 'in'),\n",
       " ('and', 'and'),\n",
       " ('exezutive', 'executive'),\n",
       " ('n400', 'n400'),\n",
       " ('yusic', 'music'),\n",
       " ('s', 's'),\n",
       " ('henri', 'henry'),\n",
       " ('heard', 'heard'),\n",
       " ('thousand', 'thousand'),\n",
       " ('to', 'to'),\n",
       " ('arhy', 'army'),\n",
       " ('td', 'to'),\n",
       " ('a', 'a'),\n",
       " ('oall', 'hall'),\n",
       " ('qind', 'kind'),\n",
       " ('od', 'on'),\n",
       " ('samfria', 'samaria'),\n",
       " ('driveway', 'driveway'),\n",
       " ('which', 'which'),\n",
       " ('wotk', 'work'),\n",
       " ('ak', 'as'),\n",
       " ('persona', 'persona'),\n",
       " ('s', 's'),\n",
       " ('melbourne', 'melbourne'),\n",
       " ('apong', 'along'),\n",
       " ('fas', 'was'),\n",
       " ('thea', 'then'),\n",
       " ('permcy', 'percy'),\n",
       " ('nnd', 'and'),\n",
       " ('alan', 'alan'),\n",
       " ('13', '13'),\n",
       " ('matteos', 'matters'),\n",
       " ('against', 'against'),\n",
       " ('nefion', 'nexion'),\n",
       " ('held', 'held'),\n",
       " ('negative', 'negative'),\n",
       " ('gogd', 'good'),\n",
       " ('the', 'the'),\n",
       " ('thd', 'the'),\n",
       " ('groening', 'groening'),\n",
       " ('tqe', 'the'),\n",
       " ('cwould', 'would'),\n",
       " ('fb', 'ft'),\n",
       " ('uniten', 'united'),\n",
       " ('kone', 'one'),\n",
       " ('thiy', 'this'),\n",
       " ('lanren', 'lauren'),\n",
       " ('s', 's'),\n",
       " ('thhe', 'the'),\n",
       " ('is', 'is'),\n",
       " ('modep', 'model'),\n",
       " ('weird', 'weird'),\n",
       " ('angwer', 'answer'),\n",
       " ('imprisxnment', 'imprisonment'),\n",
       " ('marpery', 'margery'),\n",
       " ('eventuanly', 'eventually'),\n",
       " ('in', 'in'),\n",
       " ('donnoa', 'donna'),\n",
       " ('ik', 'it'),\n",
       " ('reached', 'reached'),\n",
       " ('at', 'at'),\n",
       " ('excxted', 'excited'),\n",
       " ('ws', 'was'),\n",
       " ('raes', 'rates'),\n",
       " ('the', 'the'),\n",
       " ('firsq', 'first'),\n",
       " ('concluyed', 'concluded'),\n",
       " ('recdorded', 'recorded'),\n",
       " ('fhe', 'the'),\n",
       " ('uegiment', 'regiment'),\n",
       " ('a', 'a'),\n",
       " ('glanes', 'planes'),\n",
       " ('conyrol', 'control'),\n",
       " ('thr', 'the'),\n",
       " ('arrext', 'arrest'),\n",
       " ('bth', 'both'),\n",
       " ('forward', 'forward'),\n",
       " ('allowdd', 'allowed'),\n",
       " ('revealed', 'revealed'),\n",
       " ('mayagement', 'management'),\n",
       " ('normal', 'normal')]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(zip(inp, gt))[:100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}