File size: 3,764 Bytes
973eb8f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# https://cloud.tencent.com/developer/article/2197062\n",
"import re\n",
"def cut_sent(input):\n",
" lines = []\n",
" i = 0\n",
" line = \"\"\n",
" while i < len(input):\n",
" if input[i] == \"「\":\n",
" if len(line) > 0:\n",
" lines.append(line)\n",
" line = \"\"\n",
" line += input[i]\n",
" i += 1\n",
" while i < len(input) and input[i] != \"」\":\n",
" line += input[i]\n",
" i += 1\n",
" if i < len(input):\n",
" line += input[i]\n",
" lines.append(line)\n",
" line = \"\"\n",
" else:\n",
" line += input[i]\n",
" i += 1\n",
" if len(line) > 0:\n",
" lines.append(line)\n",
" sents = []\n",
" for line in lines:\n",
" if line.startswith(\"「\"):\n",
" if len(sents) > 0 and not re.match(\"[。!?\\?]\", sents[-1][-1]):\n",
" sents[-1] += line\n",
" else:\n",
" sents.append(line)\n",
" else:\n",
" line = re.sub('([。!?\\?])([^”’」])', r\"\\1\\n\\2\", line) # 单字符断句符\n",
" line = re.sub('(\\.{6})([^”’」])', r\"\\1\\n\\2\", line) # 英文省略号\n",
" line = re.sub('(\\…{2})([^”’」])', r\"\\1\\n\\2\", line) # 中文省略号\n",
" line = re.sub('([。!?\\?][”’」])([^,。!?\\?])', r'\\1\\n\\2', line)\n",
" # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\\n放到双引号后,注意前面的几句都小心保留了双引号\n",
" line = line.rstrip() # 段尾如果有多余的\\n就去掉它\n",
" # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。\n",
" lines = line.split(\"\\n\")\n",
" if len(sents) > 0 and re.search(\"[^。!?\\?][”’」]$\", sents[-1]):\n",
" sents[-1] += lines[0]\n",
" sents.extend(lines[1:])\n",
" else:\n",
" sents.extend(lines)\n",
" return sents"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# Input texts, processed in this order.\n",
"NOVEL_PATHS = [\"train/little_prince.txt\", \"train/animal_farm.txt\"]\n",
"\n",
"novel_lines = []\n",
"for novel_path in NOVEL_PATHS:\n",
"    # Decode explicitly instead of relying on the platform default encoding,\n",
"    # which is not UTF-8 everywhere. Assumes the texts are stored as UTF-8.\n",
"    with open(novel_path, \"r\", encoding=\"utf-8\") as input_file:\n",
"        for line in input_file.read().splitlines():\n",
"            if len(line) > 0:\n",
"                novel_lines.extend(cut_sent(line))\n",
"\n",
"# Write one sentence per line. Plain write mode is enough because the file is\n",
"# never read back through this handle.\n",
"with open(\"train/novels.can\", \"w\", encoding=\"utf-8\") as output_file:\n",
"    for line in novel_lines:\n",
"        output_file.write(line + \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
|