{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filter `train/lihkg.can` with the CyberCan word list\n",
    "\n",
    "This notebook builds a jieba dictionary from the CyberCan word-frequency spreadsheet and then keeps only the corpus lines that\n",
    "\n",
    "- are at least 10 and fewer than 64 characters long once spaces are removed,\n",
    "- contain no ASCII letter or digit, and\n",
    "- are segmented by jieba entirely into words from the list (or accepted punctuation).\n",
    "\n",
    "**Inputs:** `CyberCan.xlsx` (columns `Words` and `Frequency`), `train/lihkg.can`  \n",
    "**Outputs:** `CyberCan.dict`, `train/lihkg.filtered.can`\n",
    "\n",
    "Run with *Restart Kernel → Run All*. A run of the previous revision of this notebook loaded 133212 spreadsheet rows and printed `Total words: 132895` and `Total filtered lines: 140590`; those totals may shift slightly now that the vocabulary is normalised the same way as the dictionary file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import jieba\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File locations, relative to the notebook's working directory.\n",
    "WORDLIST_XLSX = Path(\"CyberCan.xlsx\")\n",
    "JIEBA_DICT = Path(\"CyberCan.dict\")\n",
    "CORPUS_IN = Path(\"train/lihkg.can\")\n",
    "CORPUS_OUT = Path(\"train/lihkg.filtered.can\")\n",
    "\n",
    "# Always read and write text as UTF-8 instead of the platform default;\n",
    "# jieba decodes its dictionary file as UTF-8.\n",
    "# NOTE(review): assumes train/lihkg.can is UTF-8 encoded - confirm.\n",
    "ENCODING = \"utf-8\"\n",
    "\n",
    "# A line is kept only when MIN_LINE_LENGTH <= len(line) < MAX_LINE_LENGTH.\n",
    "MIN_LINE_LENGTH = 10\n",
    "MAX_LINE_LENGTH = 64\n",
    "\n",
    "# Punctuation tokens accepted in addition to the word list.\n",
    "puncts = [\",\", \"。\", \"!\", \"?\", \"「\", \"」\", \":\"]\n",
    "\n",
    "# Matches any ASCII letter or digit; lines containing one are discarded.\n",
    "alnum = re.compile(\"[a-zA-Z0-9]\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the word list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       " Words Frequency\n",
       "0 有 51227728\n",
       "1 我 43798085\n",
       "2 一 43159170\n",
       "3 的 40916482\n",
       "4 你 30897176\n",
       "... ... ...\n",
       "133207 黎明網 12\n",
       "133208 黎錦華 12\n",
       "133209 墨包 12\n",
       "133210 點晒穴 12\n",
       "133211 齋頂 12\n",
       "\n",
       "[133212 rows x 2 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the Excel word list; the cells below turn it into a jieba dictionary.\n",
    "df = pd.read_excel(WORDLIST_XLSX)\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the jieba dictionary\n",
    "\n",
    "Each dictionary line is `word frequency`, separated by a single space, so a word that is empty or itself contains a space cannot be represented and is skipped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize_word(value):\n",
    "    \"\"\"Return the dictionary form of a raw `Words` cell, or None when it is unusable.\n",
    "\n",
    "    The value is converted to `str` and stripped of surrounding whitespace.\n",
    "    Missing cells, empty strings and words containing a space are rejected\n",
    "    because they cannot be written as a `word frequency` dictionary line.\n",
    "    \"\"\"\n",
    "    if pd.isna(value):\n",
    "        # str(NaN) would otherwise be written as the literal word \"nan\".\n",
    "        return None\n",
    "    word = str(value).strip()\n",
    "    if not word or \" \" in word:\n",
    "        return None\n",
    "    return word\n",
    "\n",
    "\n",
    "# (word, frequency) pairs exactly as they are written to the dictionary file.\n",
    "# The vocabulary set below is derived from this list so that both always agree.\n",
    "dict_entries = [\n",
    "    (word, frequency)\n",
    "    for word, frequency in zip(map(normalize_word, df[\"Words\"]), df[\"Frequency\"])\n",
    "    if word is not None\n",
    "]\n",
    "\n",
    "with open(JIEBA_DICT, \"w\", encoding=ENCODING) as output_file:\n",
    "    for word, frequency in dict_entries:\n",
    "        output_file.write(\"{} {}\\n\".format(word, frequency))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the vocabulary and point jieba at the dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokens allowed in a retained line: every dictionary word plus the punctuation list.\n",
    "cybercan_words = {word for word, _ in dict_entries} | set(puncts)\n",
    "\n",
    "print(\"Total words: {}\".format(len(cybercan_words)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jieba.set_dictionary(str(JIEBA_DICT))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filter the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def keep_line(line, vocabulary):\n",
    "    \"\"\"Return True when `line` passes every corpus filter.\n",
    "\n",
    "    A line is kept when its length lies in [MIN_LINE_LENGTH, MAX_LINE_LENGTH),\n",
    "    it contains no ASCII letter or digit, and every token produced by\n",
    "    `jieba.cut` is a member of `vocabulary`.\n",
    "    \"\"\"\n",
    "    if not MIN_LINE_LENGTH <= len(line) < MAX_LINE_LENGTH:\n",
    "        return False\n",
    "    if alnum.search(line):\n",
    "        return False\n",
    "    # all() stops at the first out-of-vocabulary token, like the original early break.\n",
    "    return all(token in vocabulary for token in jieba.cut(line))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus_lines = CORPUS_IN.read_text(encoding=ENCODING).splitlines()\n",
    "\n",
    "# Spaces are removed before any check, so the length limits apply to the compacted text.\n",
    "filtered_lines = [\n",
    "    line\n",
    "    for line in (raw_line.replace(\" \", \"\") for raw_line in corpus_lines)\n",
    "    if keep_line(line, cybercan_words)\n",
    "]\n",
    "\n",
    "print(\"Total filtered lines: {}\".format(len(filtered_lines)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(CORPUS_OUT, \"w\", encoding=ENCODING) as output_file:\n",
    "    output_file.writelines(line + \"\\n\" for line in filtered_lines)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}