freemt committed on
Commit
57b1c4f
1 Parent(s): 78ab3ee

Before branch dev

Browse files
gradiobee/seg_text.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Split text to sentences.
2
+
3
+ Use sentence_splitter if supported,
4
+ else use polyglot.text.Text
5
+
6
+ from hlm_texts
7
+
8
+ !apt install libicu-dev
9
+ !pip install pyicu pycld2 Morfessor
10
+ !pip install polyglot sentence_splitter
11
+ """
12
+ from typing import List, Optional
13
+
14
+ from tqdm.auto import tqdm
15
+ from polyglot.detect.base import logger as polyglot_logger
16
+ from polyglot.text import Detector, Text
17
+ from sentence_splitter import split_text_into_sentences
18
+
19
+ from logzero import logger
20
+
21
+ # turn off polyglot.text.Detector warning
22
+ polyglot_logger.setLevel("ERROR")
23
+
24
+
25
+ # fmt: off
26
+ # use sentence_splitter if supported
27
+ LANG_S = ["ca", "cs", "da", "nl", "en", "fi", "fr", "de",
28
+ "el", "hu", "is", "it", "lv", "lt", "no", "pl",
29
+ "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr"]
30
+
31
+
32
def seg_text(
    text: str,
    lang: Optional[str] = None,
    qmode: bool = False,
    maxlines: int = 1000
) -> List[str]:
    # fmt: on
    """Segment text into a list of sentences.

    The splitter is chosen by language: sentence_splitter handles the
    languages listed in LANG_S, polyglot.text.Text.sentences handles
    everything else (and everything when qmode is on).

    Args:
        text: the text to segment.
        lang: language code; when None it is detected with
            polyglot.text.Detector. If detection raises, a warning is
            logged and "en" is used.
        qmode: when True, skip split_text_into_sentences and always use
            polyglot. Default False. Vectors for all books are based on
            qmode=False; qmode=True is for quick test purposes only.
        maxlines: threshold (default 1000) for turning on the tqdm
            progress bar. The bar is shown only when the number of lines
            exceeds maxlines and maxlines > 1, so a value of 1 or less,
            or a very large number, turns it off.

    Returns:
        The sentences, in order of appearance.
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            # best effort: any detection failure falls back to English
            logger.warning("polyglot.text.Detector exc: %s, setting to 'en'", exc)
            lang = "en"

    # polyglot path: quick mode, or a language sentence_splitter lacks
    if qmode or lang not in LANG_S:
        return [sent.string for sent in Text(text, lang).sentences]

    # sentence_splitter path: split line by line, skipping blank lines
    paragraphs = text.splitlines()
    show_progress = len(paragraphs) > maxlines > 1
    iterable = tqdm(paragraphs) if show_progress else paragraphs

    sentences: List[str] = []
    for para in iterable:
        if para.strip():
            sentences.extend(split_text_into_sentences(para, lang))
    return sentences
requirements-compiled.txt ADDED
File without changes
requirements-save.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ chardet
2
+ certifi
3
+ charset-normalizer
4
+ idna
5
+ typing-extensions
6
+ sklearn
7
+ textacy
8
+ logzero
9
+ more_itertools
10
+ psutil
11
+ seaborn
requirements.in ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
1
+ sklearn
2
+ textacy
3
+ logzero
4
+ more_itertools
5
+ psutil
6
+ seaborn
7
+ Morfessor
8
+ # pyicu
9
+ pycld2
requirements.txt CHANGED
@@ -1,11 +1,182 @@
1
- chardet
2
- certifi
3
- charset-normalizer
4
- idna
5
- typing-extensions
6
- sklearn
7
- textacy
8
- logzero
9
- more_itertools
10
- psutil
11
- seaborn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This file is autogenerated by pip-compile with python 3.7
3
+ # To update, run:
4
+ #
5
+ # pip-compile requirements.in
6
+ #
7
+ blis==0.7.5
8
+ # via
9
+ # spacy
10
+ # thinc
11
+ cachetools==5.0.0
12
+ # via textacy
13
+ catalogue==2.0.6
14
+ # via
15
+ # spacy
16
+ # srsly
17
+ # thinc
18
+ certifi==2021.10.8
19
+ # via requests
20
+ charset-normalizer==2.0.9
21
+ # via requests
22
+ click==8.0.3
23
+ # via typer
24
+ colorama==0.4.4
25
+ # via
26
+ # click
27
+ # logzero
28
+ # tqdm
29
+ cycler==0.11.0
30
+ # via matplotlib
31
+ cymem==2.0.6
32
+ # via
33
+ # preshed
34
+ # spacy
35
+ # thinc
36
+ cytoolz==0.11.2
37
+ # via textacy
38
+ fonttools==4.28.5
39
+ # via matplotlib
40
+ idna==3.3
41
+ # via requests
42
+ importlib-metadata==4.10.0
43
+ # via click
44
+ jellyfish==0.8.9
45
+ # via textacy
46
+ jinja2==3.0.3
47
+ # via spacy
48
+ joblib==1.1.0
49
+ # via
50
+ # scikit-learn
51
+ # textacy
52
+ kiwisolver==1.3.2
53
+ # via matplotlib
54
+ langcodes==3.3.0
55
+ # via spacy
56
+ logzero==1.7.0
57
+ # via -r requirements.in
58
+ markupsafe==2.0.1
59
+ # via jinja2
60
+ matplotlib==3.5.1
61
+ # via seaborn
62
+ more-itertools==8.12.0
63
+ # via -r requirements.in
64
+ morfessor==2.0.6
65
+ # via -r requirements.in
66
+ murmurhash==1.0.6
67
+ # via
68
+ # preshed
69
+ # spacy
70
+ # thinc
71
+ networkx==2.6.3
72
+ # via textacy
73
+ numpy==1.21.5
74
+ # via
75
+ # blis
76
+ # matplotlib
77
+ # pandas
78
+ # scikit-learn
79
+ # scipy
80
+ # seaborn
81
+ # spacy
82
+ # textacy
83
+ # thinc
84
+ packaging==21.3
85
+ # via
86
+ # matplotlib
87
+ # spacy
88
+ pandas==1.3.5
89
+ # via seaborn
90
+ pathy==0.6.1
91
+ # via spacy
92
+ pillow==8.4.0
93
+ # via matplotlib
94
+ preshed==3.0.6
95
+ # via
96
+ # spacy
97
+ # thinc
98
+ psutil==5.8.0
99
+ # via -r requirements.in
100
+ pycld2==0.41
101
+ # via -r requirements.in
102
+ pydantic==1.8.2
103
+ # via
104
+ # spacy
105
+ # thinc
106
+ pyparsing==3.0.6
107
+ # via
108
+ # matplotlib
109
+ # packaging
110
+ pyphen==0.12.0
111
+ # via textacy
112
+ python-dateutil==2.8.2
113
+ # via
114
+ # matplotlib
115
+ # pandas
116
+ pytz==2021.3
117
+ # via pandas
118
+ requests==2.26.0
119
+ # via
120
+ # spacy
121
+ # textacy
122
+ scikit-learn==1.0.2
123
+ # via
124
+ # sklearn
125
+ # textacy
126
+ scipy==1.7.3
127
+ # via
128
+ # scikit-learn
129
+ # seaborn
130
+ # textacy
131
+ seaborn==0.11.2
132
+ # via -r requirements.in
133
+ six==1.16.0
134
+ # via python-dateutil
135
+ sklearn==0.0
136
+ # via -r requirements.in
137
+ smart-open==5.2.1
138
+ # via pathy
139
+ spacy==3.2.1
140
+ # via textacy
141
+ spacy-legacy==3.0.8
142
+ # via spacy
143
+ spacy-loggers==1.0.1
144
+ # via spacy
145
+ srsly==2.4.2
146
+ # via
147
+ # spacy
148
+ # thinc
149
+ textacy==0.11.0
150
+ # via -r requirements.in
151
+ thinc==8.0.13
152
+ # via spacy
153
+ threadpoolctl==3.0.0
154
+ # via scikit-learn
155
+ toolz==0.11.2
156
+ # via cytoolz
157
+ tqdm==4.62.3
158
+ # via
159
+ # spacy
160
+ # textacy
161
+ typer==0.4.0
162
+ # via
163
+ # pathy
164
+ # spacy
165
+ typing-extensions==3.10.0.2
166
+ # via
167
+ # catalogue
168
+ # pydantic
169
+ # spacy
170
+ # thinc
171
+ urllib3==1.26.7
172
+ # via requests
173
+ wasabi==0.9.0
174
+ # via
175
+ # spacy
176
+ # spacy-loggers
177
+ # thinc
178
+ zipp==3.6.0
179
+ # via catalogue
180
+
181
+ # The following packages are considered to be unsafe in a requirements file:
182
+ # setuptools