File size: 3,764 Bytes
a54c5b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://cloud.tencent.com/developer/article/2197062\n",
    "import re\n",
    "def cut_sent(input):\n",
    "    lines = []\n",
    "    i = 0\n",
    "    line = \"\"\n",
    "    while i < len(input):\n",
    "        if input[i] == \"\":\n",
    "            if len(line) > 0:\n",
    "                lines.append(line)\n",
    "                line = \"\"\n",
    "            line += input[i]\n",
    "            i += 1\n",
    "            while i < len(input) and input[i] != \"\":\n",
    "                line += input[i]\n",
    "                i += 1\n",
    "            if i < len(input):\n",
    "                line += input[i]\n",
    "            lines.append(line)\n",
    "            line = \"\"\n",
    "        else:\n",
    "            line += input[i]\n",
    "        i += 1\n",
    "    if len(line) > 0:\n",
    "        lines.append(line)\n",
    "    sents = []\n",
    "    for line in lines:\n",
    "        if line.startswith(\"\"):\n",
    "            if len(sents) > 0 and not re.match(\"[。!?\\?]\", sents[-1][-1]):\n",
    "                sents[-1] += line\n",
    "            else:\n",
    "                sents.append(line)\n",
    "        else:\n",
    "            line = re.sub('([。!?\\?])([^”’」])', r\"\\1\\n\\2\", line)  # single-character sentence terminators\n",
    "            line = re.sub('(\\.{6})([^”’」])', r\"\\1\\n\\2\", line)  # English ellipsis (six dots)\n",
    "            line = re.sub('(\\…{2})([^”’」])', r\"\\1\\n\\2\", line)  # Chinese ellipsis (two ellipsis characters)\n",
    "            line = re.sub('([。!?\\?][”’」])([^,。!?\\?])', r'\\1\\n\\2', line)\n",
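    "            # If a terminator precedes a closing double quote, the double quote is the real end of the sentence, so the split marker \\n is placed after the quote; note that the preceding rules deliberately keep the quote in place\n",
    "            line = line.rstrip()  # remove any extra \\n left at the end of the paragraph\n",
    "            # Many rule sets also treat the semicolon (;) as a break, but it is ignored here; dashes, English double quotes, etc. are ignored as well. Make small adjustments if you need them.\n",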
    "            lines = line.split(\"\\n\")\n",
    "            if len(sents) > 0 and re.search(\"[^。!?\\?][”’」]$\", sents[-1]):\n",
    "                sents[-1] += lines[0]\n",
    "                sents.extend(lines[1:])\n",
    "            else:\n",
    "                sents.extend(lines)\n",
    "    return sents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "novel_lines = []\n",
    "\n",
    "with open(\"train/little_prince.txt\", \"r\") as input_file:\n",
    "    for line in input_file.read().splitlines():\n",
    "        if len(line) > 0:\n",
    "            novel_lines.extend(cut_sent(line))\n",
    "\n",
    "with open(\"train/animal_farm.txt\", \"r\") as input_file:\n",
    "    for line in input_file.read().splitlines():\n",
    "        if len(line) > 0:\n",
    "            novel_lines.extend(cut_sent(line))\n",
    "\n",
    "with open(\"train/novels.can\", \"w+\") as output_file:\n",
    "    for line in novel_lines:\n",
    "        output_file.write(line + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}