# Sentence-splitting rules adapted from:
# https://cloud.tencent.com/developer/article/2197062
import re

# One segment of a paragraph: either a corner-bracket quotation (from an
# opening bracket up to and including the next closing bracket, or to the end
# of the input when it is never closed), or a run of text outside quotations.
_SEGMENT_RE = re.compile(r"「[^」]*」?|[^「]+")

# Sentence terminators. The full-width exclamation mark (U+FF01), question
# mark (U+FF1F) and comma (U+FF0C) are written as escapes so they cannot be
# silently folded into their ASCII look-alikes by width normalisation.
_TERMINATORS = ("。", "!", "?", "\uff01", "\uff1f")

# Rules that insert a line break, applied in this order. Each one keeps a
# closing quote attached to the sentence it ends.
_BREAK_RULES = (
    # Single-character terminator followed by anything but a closing quote.
    re.compile(r"([。!?\uff01\uff1f])([^”’」])"),
    # Six ASCII dots used as an ellipsis.
    re.compile(r"(\.{6})([^”’」])"),
    # Two ellipsis characters (the usual CJK ellipsis).
    re.compile(r"(…{2})([^”’」])"),
    # Terminator + closing quote: the quote is the real end of the sentence,
    # so the break goes after the quote unless more punctuation follows.
    re.compile(r"([。!?\uff01\uff1f][”’」])([^,\uff0c。!?\uff01\uff1f])"),
)

# A sentence ending in a closing quote that is NOT preceded by a terminator:
# the sentence is still open and the following text belongs to it.
_OPEN_ENDED_QUOTE_RE = re.compile(r"[^。!?\uff01\uff1f][”’」]$")


def cut_sent(input):
    """Split one paragraph of text into a list of sentences.

    The paragraph is first cut into corner-bracket quotations and the text
    between them. A quotation is never split internally; it is glued onto the
    previous sentence unless that sentence already ends with a terminator.
    Text outside quotations is split after terminators and ellipses, and its
    first piece is glued onto the previous sentence when that sentence ends
    with a closing quote that has no terminator before it.

    Semicolons, dashes and ASCII double quotes are deliberately not treated
    as sentence boundaries.

    Args:
        input: The paragraph to split (a ``str``). The name shadows the
            built-in but is kept for backward compatibility with callers.

    Returns:
        A list of sentence strings in their original order. An empty input
        yields an empty list; a whitespace-only input yields ``[""]``.
    """
    sents = []
    for segment in _SEGMENT_RE.findall(input):
        if segment.startswith("「"):
            # endswith() instead of indexing [-1]: the previous entry can be
            # an empty string (e.g. a paragraph indent that rstrip() reduced
            # to ""), and indexing it raised IndexError.
            if sents and not sents[-1].endswith(_TERMINATORS):
                sents[-1] += segment
            else:
                sents.append(segment)
            continue

        for rule in _BREAK_RULES:
            segment = rule.sub(r"\1\n\2", segment)
        # Drop trailing whitespace, including a break left at the very end.
        pieces = segment.rstrip().split("\n")

        if sents and _OPEN_ENDED_QUOTE_RE.search(sents[-1]):
            sents[-1] += pieces[0]
            sents.extend(pieces[1:])
        else:
            sents.extend(pieces)
    return sents
# Build the sentence-per-line corpus: read each novel, split every non-empty
# line into sentences with cut_sent(), and write one sentence per output line.
novel_lines = []

for novel_path in ("train/little_prince.txt", "train/animal_farm.txt"):
    # Explicit UTF-8: the platform default encoding is not reliable for CJK
    # text (for example on Windows).
    with open(novel_path, "r", encoding="utf-8") as input_file:
        for line in input_file.read().splitlines():
            if line:
                novel_lines.extend(cut_sent(line))

# Plain "w" is enough: the file is only written, never read back here.
with open("train/novels.can", "w", encoding="utf-8") as output_file:
    output_file.writelines(sentence + "\n" for sentence in novel_lines)