{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 82841986 is_char and is_digit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 82075350 regex non-ascii and non-digit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 86460763 left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import re\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_length = 25\n",
    "min_length = 1\n",
    "root = '../data'\n",
    "charset = 'abcdefghijklmnopqrstuvwxyz'\n",
    "digits = '0123456789'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_char(text, ratio=0.5):\n",
    "    \"\"\"Return True when `text` consists mostly of letters from `charset`.\n",
    "\n",
    "    The text is lower-cased before counting. The result is False when fewer\n",
    "    than `min_length` characters are letters, or when letters make up less\n",
    "    than `ratio` of all characters.\n",
    "    \"\"\"\n",
    "    text = text.lower()\n",
    "    length = max(len(text), 1)  # keeps the division safe for empty text\n",
    "    char_num = sum(t in charset for t in text)\n",
    "    if char_num < min_length:\n",
    "        return False\n",
    "    return char_num / length >= ratio\n",
    "\n",
    "\n",
    "def is_digit(text, ratio=0.5):\n",
    "    \"\"\"Return True when at least `ratio` of the characters in `text` are digits.\n",
    "\n",
    "    A text without a single character from `digits` is never accepted, so an\n",
    "    empty string is rejected even for `ratio <= 0`; `is_char` applies a\n",
    "    comparable floor through `min_length`.\n",
    "    \"\"\"\n",
    "    length = max(len(text), 1)  # keeps the division safe for empty text\n",
    "    digit_num = sum(t in digits for t in text)\n",
    "    if digit_num == 0:\n",
    "        return False\n",
    "    return digit_num / length >= ratio"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# generate training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pin the encoding so decoding does not depend on the platform default\n",
    "# (assumes the corpus file is UTF-8 - TODO confirm).\n",
    "with open('/tmp/wikitext-103/wiki.train.tokens', 'r', encoding='utf-8') as file:\n",
    "    lines = file.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything except ASCII letters and digits is removed from a token.\n",
    "# Compiled once because the loop below runs for every token of the corpus.\n",
    "non_alnum = re.compile('[^0-9a-zA-Z]+')\n",
    "\n",
    "# Both lists receive the same cleaned word.\n",
    "inp, gt = [], []\n",
    "for line in lines:\n",
    "    for text in line.lower().split():\n",
    "        text = non_alnum.sub('', text)\n",
    "        # Drop words shorter than `min_length` or longer than `max_length`.\n",
    "        if not min_length <= len(text) <= max_length:\n",
    "            continue\n",
    "        inp.append(text)\n",
    "        gt.append(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output folder first so the export does not fail when it is missing.\n",
    "os.makedirs(root, exist_ok=True)\n",
    "train_voc = os.path.join(root, 'WikiText-103.csv')\n",
    "pd.DataFrame({'inp': inp, 'gt': gt}).to_csv(train_voc, index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "86460763"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(inp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['valkyria',\n", " 'chronicles',\n", " 'iii',\n", " 'senj',\n", " 'no',\n",
       " 'valkyria',\n", " '3',\n", " 'unk',\n", " 'chronicles',\n", " 'japanese',\n",
       " '3',\n", " 'lit',\n", " 'valkyria',\n", " 'of',\n", " 'the',\n",
       " 'battlefield',\n", " '3',\n", " 'commonly',\n", " 'referred',\n", " 'to',\n",
       " 'as',\n", " 'valkyria',\n", " 'chronicles',\n", " 'iii',\n", " 'outside',\n",
       " 'japan',\n", " 'is',\n", " 'a',\n", " 'tactical',\n", " 'role',\n",
       " 'playing',\n", " 'video',\n", " 'game',\n", " 'developed',\n", " 'by',\n",
       " 'sega',\n", " 'and',\n", " 'mediavision',\n", " 'for',\n", " 'the',\n",
       " 'playstation',\n", " 'portable',\n", " 'released',\n", " 'in',\n", " 'january',\n",
       " '2011',\n", " 'in',\n", " 'japan',\n", " 'it',\n", " 'is',\n",
       " 'the',\n", " 'third',\n", " 'game',\n", " 'in',\n", " 'the',\n",
       " 'valkyria',\n", " 'series',\n", " 'employing',\n", " 'the',\n", " 'same',\n",
       " 'fusion',\n", " 'of',\n", " 'tactical',\n", " 'and',\n", " 'real',\n",
       " 'time',\n", " 'gameplay',\n", " 'as',\n", " 'its',\n", " 'predecessors',\n",
       " 'the',\n", " 'story',\n", " 'runs',\n", " 'parallel',\n", " 'to',\n",
       " 'the',\n", " 'first',\n", " 'game',\n", " 'and',\n", " 'follows',\n",
       " 'the',\n", " 'nameless',\n", " 'a',\n", " 'penal',\n", " 'military',\n",
       " 'unit',\n", " 'serving',\n", " 'the',\n", " 'nation',\n", " 'of',\n",
       " 'gallia',\n", " 'during',\n", " 'the',\n", " 'second',\n", " 'europan',\n",
       " 'war',\n", " 'who',\n", " 'perform',\n", " 'secret',\n", " 'black']"
]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inp[:100]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# generate evaluation dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def disturb(word, degree, p=0.3):\n",
    "    \"\"\"Return `word` with `degree` randomly chosen positions perturbed.\n",
    "\n",
    "    The word comes back unchanged when `degree` is not positive, when the\n",
    "    word is shorter than `2 * degree`, when `is_digit(word)` holds, or -\n",
    "    with probability `p` - by chance. Otherwise every selected letter or\n",
    "    digit is preceded by an inserted random letter (10%), deleted (10%) or\n",
    "    replaced by a different random letter (80%). Selected characters that\n",
    "    are in neither `charset` nor `digits` are kept as they are.\n",
    "    \"\"\"\n",
    "    # A non-positive degree has nothing to perturb; a negative one would\n",
    "    # otherwise select almost every position.\n",
    "    if degree <= 0 or len(word) // 2 < degree:\n",
    "        return word\n",
    "    if is_digit(word):\n",
    "        return word\n",
    "    if random.random() < p:\n",
    "        return word\n",
    "\n",
    "    # Positions to perturb; a set keeps the membership test below cheap.\n",
    "    index = set(random.sample(range(len(word)), degree))\n",
    "    new_word = []\n",
    "    for i, ch in enumerate(word):\n",
    "        if i not in index:\n",
    "            new_word.append(ch)\n",
    "            continue\n",
    "        if (ch not in charset) and (ch not in digits):\n",
    "            # special token\n",
    "            new_word.append(ch)\n",
    "            continue\n",
    "        op = random.random()\n",
    "        if op < 0.1:  # add\n",
    "            new_word.append(random.choice(charset))\n",
    "            new_word.append(ch)\n",
    "        elif op < 0.2:  # remove\n",
    "            continue\n",
    "        else:  # replace\n",
    "            # Leave out the current character so a replacement always\n",
    "            # changes the word; fall back to the full set if nothing is left.\n",
    "            candidates = [c for c in charset if c != ch] or list(charset)\n",
    "            new_word.append(random.choice(candidates))\n",
    "    return ''.join(new_word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "degree = 1\n",
    "keep_num = 50000\n",
    "seed = 0\n",
    "\n",
    "# Seed once so the sampled words and their perturbations are reproducible.\n",
    "random.seed(seed)\n",
    "\n",
    "# Draw the evaluation words without shuffling the full word list in place;\n",
    "# `min` still takes everything when fewer than `keep_num` words exist.\n",
    "part_lines = random.sample(inp, min(keep_num, len(inp)))\n",
    "\n",
    "# From here on `inp`/`gt` hold the evaluation pairs, so the tokenisation\n",
    "# cell has to be run again before this cell is re-executed.\n",
    "inp, gt = [], []\n",
    "\n",
    "for w in part_lines:\n",
    "    w = w.strip().lower()\n",
    "    new_w = disturb(w, degree)\n",
    "    inp.append(new_w)\n",
    "    gt.append(w)\n",
    "\n",
    "# Create the output folder first so the export does not fail when it is missing.\n",
    "os.makedirs(root, exist_ok=True)\n",
    "eval_voc = os.path.join(root, f'WikiText-103_eval_d{degree}.csv')\n",
    "pd.DataFrame({'inp': inp, 'gt': gt}).to_csv(eval_voc, index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('high', 'high'),\n",
       " ('vctoria', 'victoria'),\n",
       " ('mains', 'mains'),\n",
       " ('bi', 'by'),\n",
       " ('13', '13'),\n",
       " ('ticnet', 'ticket'),\n",
       " ('basil', 'basic'),\n",
       " ('cut', 'cut'),\n",
       " ('aqarky', 'anarky'),\n",
       " ('the', 'the'),\n",
       " ('tqe', 'the'),\n",
       " ('oc', 'of'),\n",
" ('diwpersal', 'dispersal'),\n", " ('traffic', 'traffic'),\n", " ('in', 'in'),\n", " ('the', 'the'),\n", " ('ti', 'to'),\n", " ('professionalms', 'professionals'),\n", " ('747', '747'),\n", " ('in', 'in'),\n", " ('and', 'and'),\n", " ('exezutive', 'executive'),\n", " ('n400', 'n400'),\n", " ('yusic', 'music'),\n", " ('s', 's'),\n", " ('henri', 'henry'),\n", " ('heard', 'heard'),\n", " ('thousand', 'thousand'),\n", " ('to', 'to'),\n", " ('arhy', 'army'),\n", " ('td', 'to'),\n", " ('a', 'a'),\n", " ('oall', 'hall'),\n", " ('qind', 'kind'),\n", " ('od', 'on'),\n", " ('samfria', 'samaria'),\n", " ('driveway', 'driveway'),\n", " ('which', 'which'),\n", " ('wotk', 'work'),\n", " ('ak', 'as'),\n", " ('persona', 'persona'),\n", " ('s', 's'),\n", " ('melbourne', 'melbourne'),\n", " ('apong', 'along'),\n", " ('fas', 'was'),\n", " ('thea', 'then'),\n", " ('permcy', 'percy'),\n", " ('nnd', 'and'),\n", " ('alan', 'alan'),\n", " ('13', '13'),\n", " ('matteos', 'matters'),\n", " ('against', 'against'),\n", " ('nefion', 'nexion'),\n", " ('held', 'held'),\n", " ('negative', 'negative'),\n", " ('gogd', 'good'),\n", " ('the', 'the'),\n", " ('thd', 'the'),\n", " ('groening', 'groening'),\n", " ('tqe', 'the'),\n", " ('cwould', 'would'),\n", " ('fb', 'ft'),\n", " ('uniten', 'united'),\n", " ('kone', 'one'),\n", " ('thiy', 'this'),\n", " ('lanren', 'lauren'),\n", " ('s', 's'),\n", " ('thhe', 'the'),\n", " ('is', 'is'),\n", " ('modep', 'model'),\n", " ('weird', 'weird'),\n", " ('angwer', 'answer'),\n", " ('imprisxnment', 'imprisonment'),\n", " ('marpery', 'margery'),\n", " ('eventuanly', 'eventually'),\n", " ('in', 'in'),\n", " ('donnoa', 'donna'),\n", " ('ik', 'it'),\n", " ('reached', 'reached'),\n", " ('at', 'at'),\n", " ('excxted', 'excited'),\n", " ('ws', 'was'),\n", " ('raes', 'rates'),\n", " ('the', 'the'),\n", " ('firsq', 'first'),\n", " ('concluyed', 'concluded'),\n", " ('recdorded', 'recorded'),\n", " ('fhe', 'the'),\n", " ('uegiment', 'regiment'),\n", " ('a', 'a'),\n", " 
('glanes', 'planes'),\n", " ('conyrol', 'control'),\n", " ('thr', 'the'),\n", " ('arrext', 'arrest'),\n", " ('bth', 'both'),\n", " ('forward', 'forward'),\n", " ('allowdd', 'allowed'),\n", " ('revealed', 'revealed'),\n", " ('mayagement', 'management'),\n", " ('normal', 'normal')]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(zip(inp, gt))[:100]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }