{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Words</th>\n",
       "      <th>Frequency</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>有</td>\n",
       "      <td>51227728</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>我</td>\n",
       "      <td>43798085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>一</td>\n",
       "      <td>43159170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>的</td>\n",
       "      <td>40916482</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>你</td>\n",
       "      <td>30897176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133207</th>\n",
       "      <td>黎明網</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133208</th>\n",
       "      <td>黎錦華</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133209</th>\n",
       "      <td>墨包</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133210</th>\n",
       "      <td>點晒穴</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133211</th>\n",
       "      <td>齋頂</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>133212 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Words  Frequency\n",
       "0          有   51227728\n",
       "1          我   43798085\n",
       "2          一   43159170\n",
       "3          的   40916482\n",
       "4          你   30897176\n",
       "...      ...        ...\n",
       "133207   黎明網         12\n",
       "133208   黎錦華         12\n",
       "133209    墨包         12\n",
       "133210   點晒穴         12\n",
       "133211    齋頂         12\n",
       "\n",
       "[133212 rows x 2 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the CyberCan word list (columns: Words, Frequency) from the Excel workbook\n",
    "df = pd.read_excel('CyberCan.xlsx')\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the vocabulary as a jieba dictionary file: one \"<word> <frequency>\" entry per line.\n",
    "# jieba expects dictionary files to be UTF-8, so the encoding is set explicitly instead of\n",
    "# relying on the platform default.\n",
    "with open(\"CyberCan.dict\", \"w\", encoding=\"utf-8\") as output_file:\n",
    "    for raw_word, frequency in zip(df['Words'], df['Frequency']):\n",
    "        word = str(raw_word).strip()\n",
    "        # Fields are space-separated, so an empty word or a word containing a space\n",
    "        # would produce a malformed entry; skip those rows.\n",
    "        if not word or \" \" in word:\n",
    "            continue\n",
    "        output_file.write(\"{} {}\\n\".format(word, frequency))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words: 132895\n"
     ]
    }
   ],
   "source": [
    "# Punctuation marks that are accepted in addition to the CyberCan vocabulary\n",
    "puncts = [\"，\", \"。\", \"！\", \"？\", \"「\", \"」\", \"：\"]\n",
    "\n",
    "# Normalise each word the same way the CyberCan.dict writer does (str + strip), so the\n",
    "# tokens jieba produces from that dictionary can actually be found in this set.\n",
    "cybercan_words = {str(word).strip() for word in df['Words']}\n",
    "cybercan_words.update(puncts)\n",
    "\n",
    "print(\"Total words: {}\".format(len(cybercan_words)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "# Use the CyberCan.dict file as jieba's main dictionary for the segmentation below\n",
    "jieba.set_dictionary(\"CyberCan.dict\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total filtered lines: 140590\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "MIN_LINE_LENGTH = 10  # lines shorter than this (after removing spaces) are dropped\n",
    "MAX_LINE_LENGTH = 64  # lines of this length or longer are dropped\n",
    "\n",
    "alnum = re.compile(\"[a-zA-Z0-9]\")\n",
    "filtered_lines = []\n",
    "\n",
    "# Encoding is given explicitly so the result does not depend on the platform default.\n",
    "# NOTE(review): assumes train/lihkg.can is UTF-8 encoded - confirm.\n",
    "with open(\"train/lihkg.can\", \"r\", encoding=\"utf-8\") as input_file:\n",
    "    for line in input_file.read().splitlines():\n",
    "        line = line.replace(\" \", \"\")\n",
    "        # Keep only lines whose length is within [MIN_LINE_LENGTH, MAX_LINE_LENGTH).\n",
    "        if not MIN_LINE_LENGTH <= len(line) < MAX_LINE_LENGTH:\n",
    "            continue\n",
    "        # Skip lines containing ASCII letters or digits.\n",
    "        if alnum.search(line):\n",
    "            continue\n",
    "        # Keep the line only if every jieba token is in the CyberCan vocabulary;\n",
    "        # all() stops at the first unknown token, like the explicit break it replaces.\n",
    "        if all(token in cybercan_words for token in jieba.cut(line)):\n",
    "            filtered_lines.append(line)\n",
    "\n",
    "print(\"Total filtered lines: {}\".format(len(filtered_lines)))\n",
    "\n",
    "# The with-block flushes and closes the file on exit, so no explicit flush is needed.\n",
    "with open(\"train/lihkg.filtered.can\", \"w\", encoding=\"utf-8\") as output_file:\n",
    "    output_file.writelines(line + \"\\n\" for line in filtered_lines)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}