Pietro Lesci committed
Commit b3ecaa7
Parent: b482a79

Add support for Chinese

.streamlit/config.toml CHANGED
@@ -1,7 +1,7 @@
  [server]
  # Max size, in megabytes, for files uploaded with the file_uploader.
  # Default: 200
- maxUploadSize = 20
+ maxUploadSize = 10

  [browser]
  gatherUsageStats = false
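
Note: this is the Streamlit server setting that caps `st.file_uploader` payloads, here lowered from 20 MB to 10 MB. A quick way to confirm the value the app actually picked up (a sketch; `st.get_option` reads the merged Streamlit config, including `.streamlit/config.toml`):

    import streamlit as st

    # reads the effective config; expected to print 10 after this commit
    print(st.get_option("server.maxUploadSize"))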
app.py CHANGED
@@ -1,6 +1,7 @@
  import streamlit as st

- from src.components import faq, footer, form, presentation, analysis, docs
+ from src.components import analysis, docs, faq, footer, form, presentation
+ from src.configs import SupportedFiles
  from src.utils import convert_df, get_logo, read_file

  # app configs
@@ -25,7 +26,7 @@ st.title("Wordify")
  # file uploader
  uploaded_fl = st.sidebar.file_uploader(
      label="Choose a file",
-     type=["csv", "parquet", "tsv", "xlsx"],
+     type=[i.name for i in SupportedFiles],
      accept_multiple_files=False,
      help="""
      Supported formats:
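
With this change the accepted extensions are derived from the `SupportedFiles` enum instead of a hard-coded list, so the uploader and the readers can no longer drift apart. A minimal standalone sketch of what the comprehension evaluates to, with the enum copied from the new `src/configs.py` below:

    from enum import Enum

    import pandas as pd

    class SupportedFiles(Enum):
        xlsx = (lambda x: pd.read_excel(x, dtype=str),)
        tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
        csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)
        parquet = (lambda x: pd.read_parquet(x),)

    # the file_uploader receives the member names as its allowed extensions
    print([i.name for i in SupportedFiles])  # ['xlsx', 'tsv', 'csv', 'parquet']

Adding a new format now only requires a new enum member; the uploader picks it up automatically.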
data/test_chinese.xlsx ADDED
Binary file (580 kB).
requirements.txt CHANGED
@@ -37,3 +37,6 @@ https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.2.
  https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz#egg=ru_core_news_sm
  # multi-language
  https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.2.0/xx_ent_wiki_sm-3.2.0.tar.gz#egg=xx_ent_wiki_sm
+ # chinese
+ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.2.0/zh_core_web_sm-3.2.0.tar.gz#egg=zh_core_web_sm
+
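
A quick smoke test of the newly pinned model after installing the requirements (a sketch; the pretrained Chinese pipeline bundles a statistical word segmenter, which is what makes tokenization work on text without whitespace):

    import spacy

    # the app disables parser and ner; do the same here
    nlp = spacy.load("zh_core_web_sm", disable=["parser", "ner"])
    doc = nlp("全金属机身，指纹识别很快")
    print([t.text for t in doc])  # segmented tokens, no whitespace needed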
src/components.py CHANGED
@@ -1,11 +1,12 @@
- import streamlit as st
  import time
+
  import pandas as pd
+ import streamlit as st

- from src.configs import Languages, PreprocessingConfigs, SupportedFiles, ColumnNames
+ from src.configs import ColumnNames, Languages, PreprocessingConfigs, SupportedFiles
  from src.preprocessing import PreprocessingPipeline
- from src.wordifier import input_transform, output_transform, wordifier
  from src.utils import get_col_indices
+ from src.wordifier import input_transform, output_transform, wordifier


  def docs():
@@ -78,7 +79,7 @@ def form(df):
          "Select lemmatization",
          options=lammatization_options,
          index=PreprocessingConfigs.DEFAULT_LEMMA.value,
-         help="Select lemmatization procedure",
+         help="Select the lemmatization procedure. It is automatically disabled when the selected language is Chinese or MultiLanguage.",
      )

      post_steps = st.multiselect(
@@ -98,6 +99,11 @@ def form(df):

          start_time = time.time()

+         # warnings about inputs
+         language_specific_warnings(
+             pre_steps, post_steps, lemmatization_step, language
+         )
+
          # preprocess
          if not disable_preprocessing:
              with st.spinner("Step 1/4: Preprocessing text"):
@@ -109,7 +115,10 @@ def form(df):
              with st.spinner(
                  "Step 1/4: Preprocessing has been disabled - doing nothing"
              ):
-                 time.sleep(1.5)
+                 df = df.rename(
+                     columns={text_column: ColumnNames.PROCESSED_TEXT.value}
+                 )
+                 time.sleep(1.2)

          # prepare input
          with st.spinner("Step 2/4: Preparing inputs"):
@@ -260,6 +269,15 @@ def presentation():
          you provide a file following this naming convention, Wordify will automatically select the
          correct columns. However, if you wish to use a different nomenclature, you will be asked to
          provide the column names in the interactive UI.
+
+         - Maintain a stable connection to the Wordify page until you have downloaded your data. If you refresh the page,
+         a new Wordify session is created and your progress is lost.
+
+         - Wordify's runtime depends on the length of the individual texts in your file: the longer the texts, the more
+         n-grams Wordify has to consider, and more n-grams mean more data to analyse in each run. We tuned Wordify for
+         files of approximately 5,000 lines or 50k n-grams, for which we expect a runtime between 90 seconds and 10 minutes.
+         If your file is bigger, try applying stricter preprocessing in the `Advanced Options` section. If that is not
+         enough, feel free to reach out to us directly so we can help.
          """
      )

@@ -377,3 +395,32 @@ def analysis(outputs):
      st.write(meta_data["labels"])

      return subset_df
+
+
+ # warnings for Chinese and MultiLanguage
+ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, language):
+
+     if language in ("MultiLanguage", "Chinese") and (
+         "remove_non_words" in pre_steps or "remove_non_words" in post_steps
+     ):
+         msg = """
+         NOTE: for Chinese and MultiLanguage we automatically substitute `remove_non_words` with
+         `remove_numbers` and `remove_punctuation` to avoid incorrect results.
+         """
+         st.info(msg)
+
+     msg = "NOTE: for Chinese and MultiLanguage we automatically turn off lemmatization."
+     if lemmatization_step == "Spacy lemmatizer (keep stopwords)" and language in (
+         "MultiLanguage",
+         "Chinese",
+     ):
+         st.info(msg)
+
+     elif lemmatization_step == "Spacy lemmatizer (remove stopwords)" and language in (
+         "MultiLanguage",
+         "Chinese",
+     ):
+         st.info(
+             msg
+             + " However, we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
+         )
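
One subtlety in the "preprocessing disabled" branch above: downstream steps expect a `processed_text` column, so the raw text column is renamed in place before moving on. A minimal sketch of that fallback, assuming `ColumnNames.PROCESSED_TEXT.value` is the string "processed_text":

    import pandas as pd

    PROCESSED_TEXT = "processed_text"  # assumed value of ColumnNames.PROCESSED_TEXT
    text_column = "text"

    df = pd.DataFrame({"text": ["某段中文文本"], "label": ["1"]})
    # with preprocessing disabled, rename so later steps find the expected column
    df = df.rename(columns={text_column: PROCESSED_TEXT})
    print(list(df.columns))  # ['processed_text', 'label']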
src/configs.py CHANGED
@@ -25,7 +25,7 @@ class InputTransformConfigs(Enum):


  class PreprocessingConfigs(Enum):
-     DEFAULT_PRE = [1, 14, 2, 3, 4, 21, 23, 22, 5, 24]
+     DEFAULT_PRE = [1, 14, 2, 3, 4, 5, 23, 22, 21, 24]
      DEFAULT_LEMMA = 1
      DEFAULT_POST = [0, 17, 15, 19, 23, 22, 21, 24]

@@ -39,7 +39,6 @@ class Languages(Enum):
      Dutch = "nl_core_news_sm"
      Portuguese = "pt_core_news_sm"
      French = "fr_core_news_sm"
-     # Chinese = "zh_core_news_sm"
      Danish = "da_core_news_sm"
      # Japanese = "ja_core_news_sm"
      Lithuanian = "lt_core_news_sm"
@@ -48,9 +47,11 @@ class Languages(Enum):
      Romanian = "ro_core_news_sm"
      Russian = "ru_core_news_sm"
      MultiLanguage = "xx_ent_wiki_sm"
+     Chinese = "zh_core_web_sm"


  class SupportedFiles(Enum):
      xlsx = (lambda x: pd.read_excel(x, dtype=str),)
-     csv = (lambda x: pd.read_csv(x, dtype=str),)
+     tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
+     csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)
      parquet = (lambda x: pd.read_parquet(x),)
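
Two details of `SupportedFiles` are easy to miss: the readers are wrapped in one-element tuples because a bare lambda assigned in an `Enum` body would be treated as a method rather than a member, and the reader therefore has to be recovered from the tuple. A sketch of how the uploaded extension presumably dispatches to a reader (the `.value[0]` unpacking is an assumption about `read_file`, whose body is not part of this diff):

    # hypothetical dispatch, mirroring what read_file likely does
    extension = "tsv"
    read_fn = SupportedFiles[extension].value[0]  # the tuple's single element is the reader
    df = read_fn("data/some_file.tsv")  # hypothetical path

Note the old enum had commented out `zh_core_news_sm`, which is not a real spaCy package name; the Chinese model is correctly registered here as `zh_core_web_sm`, matching requirements.txt.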
src/preprocessing.py CHANGED
@@ -3,11 +3,9 @@ import os
  import re
  import string
  from collections import OrderedDict
- from typing import Callable, List, Optional
+ from typing import Callable, List, Optional, Union

- import pandas as pd
  import spacy
- import streamlit as st
  import vaex
  from pandas.core.frame import DataFrame
  from pandas.core.series import Series
@@ -99,14 +97,10 @@ class PreprocessingPipeline:
          self.lemmatization_step = lemmatization_step
          self.post_steps = post_steps

-         self.nlp = (
-             spacy.load(Languages[language].value, disable=["parser", "ner"])
-             if self.lemmatization_step != "Disable lemmatizer"
-             else identity
-         )
-         self.pre = self.make_pipe_component(self.pre_steps)
-         self.post = self.make_pipe_component(self.post_steps)
-         self.lemma = self.lemmatization_component().get(self.lemmatization_step)
+         self.pre = self.make_pipe_component(self.pre_steps, self.language)
+         self.post = self.make_pipe_component(self.post_steps, self.language)
+         self.nlp = self.make_nlp(self.lemmatization_step, self.language)
+         self.lemma = self.make_lemma(self.lemmatization_step, self.language)

      # def apply_multiproc(fn, series):
      #     with mp.Pool(mp.cpu_count()) as pool:
@@ -148,13 +142,59 @@ class PreprocessingPipeline:

      #     return series

-     def make_pipe_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
+     @classmethod
+     def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
          if not steps:
              return identity
-         components = [self.pipeline_components()[step] for step in steps]
+
+         elif language in ("MultiLanguage", "Chinese") and "remove_non_words" in steps:
+             idx = steps.index("remove_non_words")
+             steps = (
+                 steps[:idx]
+                 + ["remove_numbers", "remove_punctuation"]
+                 + steps[idx + 1 :]
+             )
+
+         components = [cls.pipeline_components()[step] for step in steps]

          return make_pipeline(*components)

+     @staticmethod
+     def make_nlp(
+         lemmatization_step: Optional[str], language: str
+     ) -> Union[spacy.language.Language, Callable]:
+         if (
+             lemmatization_step is None
+             or lemmatization_step == "Disable lemmatizer"
+             or (
+                 lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                 and language in ("MultiLanguage", "Chinese")
+             )
+         ):
+             return identity
+         return spacy.load(Languages[language].value, disable=["parser", "ner"])
+
+     @classmethod
+     def make_lemma(cls, lemmatization_step: Optional[str], language: str) -> Callable:
+
+         if (
+             lemmatization_step is None
+             or lemmatization_step == "Disable lemmatizer"
+             or (
+                 lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                 and language in ("MultiLanguage", "Chinese")
+             )
+         ):
+             return identity
+
+         elif (
+             lemmatization_step == "Spacy lemmatizer (remove stopwords)"
+             and language in ("MultiLanguage", "Chinese")
+         ):
+             return cls.lemmatization_component().get("Remove stopwords")
+
+         return cls.lemmatization_component().get(lemmatization_step)
+
      @staticmethod
      def pipeline_components() -> "OrderedDict[str, Callable]":
          """Returns available cleaning steps in order"""
@@ -193,7 +233,7 @@ class PreprocessingPipeline:
          return OrderedDict(
              [
                  ("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
-                 ("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
+                 ("Spacy lemmatizer (remove stopwords)", lemmatize_remove_stopwords),
                  ("Disable lemmatizer", identity),
                  ("Remove stopwords", remove_stopwords),
              ]
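
Why the `remove_non_words` swap matters: judging by the notebook check below, where `_re_non_words.sub(" ", "Mimmo23")` returns `'Mimmo '`, the step keeps only Latin letters, so on Chinese or mixed-language text it would delete the CJK characters wholesale. Substituting `remove_numbers` plus `remove_punctuation` keeps the safe parts of that step. A standalone replay of the list surgery in `make_pipe_component`:

    # replay of the remove_non_words substitution for Chinese/MultiLanguage
    steps = ["normalize_useless_spaces", "remove_non_words", "lowercase"]
    language = "Chinese"

    if language in ("MultiLanguage", "Chinese") and "remove_non_words" in steps:
        idx = steps.index("remove_non_words")
        steps = steps[:idx] + ["remove_numbers", "remove_punctuation"] + steps[idx + 1 :]

    print(steps)
    # ['normalize_useless_spaces', 'remove_numbers', 'remove_punctuation', 'lowercase']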
src/utils.py CHANGED
@@ -1,12 +1,13 @@
  import base64
  from typing import List, Tuple
- from pandas.core.frame import DataFrame
+
  import streamlit as st
+ from pandas.core.frame import DataFrame
  from PIL import Image

- # import altair as alt
+ from .configs import ColumnNames, SupportedFiles

- from .configs import SupportedFiles, ColumnNames
+ # import altair as alt


  def get_col_indices(cols: List) -> Tuple[int, int]:
tests/notebook.ipynb CHANGED
@@ -21,7 +21,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "df = pd.read_csv(\"../data/test_en.csv\")"
+   "# df = pd.read_csv(\"../data/test_en.csv\")\n",
+   "df = pd.read_excel(\"../data/test_chinese.xlsx\")"
   ]
  },
  {
@@ -36,10 +37,10 @@
   "    \"normalize_bullet_points\",\n",
   "    \"normalize_hyphenated_words\",\n",
   "    \"normalize_quotation_marks\",\n",
-   "    \"normalize_useless_spaces\",\n",
+   "    \"normalize_whitespaces\",\n",
   "    \"normalize_repeating_words\",\n",
   "    \"normalize_repeating_chars\",\n",
-   "    \"normalize_whitespaces\",\n",
+   "    \"normalize_useless_spaces\",\n",
   "    # \"replace_currency_symbols\",\n",
   "    # \"replace_emails\",\n",
   "    # \"replace_emojis\",\n",
@@ -60,7 +61,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -74,8 +75,8 @@
   "    # \"replace_emojis\",\n",
   "    # \"replace_phone_numbers\",\n",
   "    # \"replace_numbers\",\n",
-   "    \"remove_html_tags\",\n",
-   "    \"remove_accents\",\n",
+   "    # \"remove_html_tags\",\n",
+   "    # \"remove_accents\",\n",
   "    # \"remove_brackets\",\n",
   "    \"remove_non_words\",\n",
   "    # \"remove_numbers\",\n",
@@ -89,13 +90,13 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 5,
+  "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe = PreprocessingPipeline(\n",
-   "    language=\"English\",\n",
-   "    lemmatization_step=\"Spacy lemmatizer (no stopwords)\", # \"Disable lemmatizer\",\n",
+   "    language=\"Chinese\",\n",
+   "    lemmatization_step=\"Spacy lemmatizer (keep stopwords)\", # \"Disable lemmatizer\",\n",
    "    pre_steps=pre_steps,\n",
    "    post_steps=post_steps,\n",
    ")"
@@ -103,218 +104,125 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-      "\"I think it's time John Rambo move on with his life and try to put Vietnam behind him. This series is getting old and Rambo is no longer a solider but a cold blooded killer. Ever time he turns up on the screen someone dies. Vietnam was not a fun place to be and frankly I am tired of Hollywood making it seem like it was. This is not the worst of the films concerning Vietnam, that honor goes to John Waynes Green Berets. In any case John Rambo carrying around a 50 cal Machine Gun taking on what seems to be half of the Viet Cong army plus a good many Russians is an insult to watch. What is worse is Rambos cheesy speech at the end.Please!! Oh yeah I heard they are making another one.\""
+      "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 手机 只能 大半天 手机游戏 最多 看个 新闻 微信 不行 手机 买手机 谈谈 通话 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 无声 加油 拿出 诚意'"
      ]
     },
-    "execution_count": 6,
+    "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "pipe.pre(df.text[0])"
+   "df.text[0]"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 7,
+  "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-      "'think time John Rambo life try Vietnam . series get old Rambo long solider cold blooded killer . time turn screen die . Vietnam fun place frankly tired Hollywood make like . bad film concern Vietnam , honor go John Waynes Green Berets . case John Rambo carry 50 cal Machine Gun take half Viet Cong army plus good Russians insult watch . bad Rambos cheesy speech end . ! ! oh yeah hear make .'"
+      "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 手机 只能 大半天 手机游戏 最多 看个 新闻 微信 不行 手机 买手机 谈谈 通话 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
      ]
     },
-    "execution_count": 7,
+    "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "pipe.lemma(pipe.nlp(pipe.pre(df.text[0])))"
+   "pipe.pre(df.text[0])"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 8,
+  "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-      "'think time john rambo life try vietnam series get old rambo long solider cold blooded killer time turn screen die vietnam fun place frankly tired hollywood make like bad film concern vietnam honor go john waynes green berets case john rambo carry cal machine gun take half viet cong army plus good russians insult watch bad rambos cheesy speech end oh yeah hear make'"
+      "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 手机 只能 大半天 手机游戏 最多 看个 新闻 微信 不行 手机 买手机 谈谈 通话 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
      ]
     },
-    "execution_count": 8,
+    "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "pipe.post(pipe.lemma(pipe.nlp(pipe.pre(df.text[0]))))"
+   "pipe.lemma(pipe.nlp(pipe.pre(df.text[0])))"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 9,
+  "execution_count": 16,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 mp 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
+     ]
+    },
+    "execution_count": 16,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
   "source": [
-   "odf = pipe.vaex_process(df, \"text\")"
+   "pipe.post(pipe.lemma(pipe.nlp(pipe.pre(df.text[0]))))"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 14,
+  "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
-     "text/html": [
-      … ≈95 lines of notebook-rendered HTML preview of the 5,000 × 3 English dataframe (label, text, processed_text), removed by this commit …
-     ],
      "text/plain": [
-      … plain-text preview of the same 5,000 × 3 dataframe, removed by this commit …
+      "Compose(<function strip at 0x7ff4894750e0>, <function normalize_useless_spaces at 0x7ff48946eef0>, <function normalize_repeating_chars at 0x7ff48946ef80>, <function normalize_repeating_words at 0x7ff4871a7170>, <function punctuation at 0x7ff48946e4d0>, <function remove_numbers at 0x7ff4894754d0>, <function lowercase at 0x7ff489475050>)"
      ]
     },
-    "execution_count": 14,
+    "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "odf"
+   "pipe.post"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "odf = pipe.vaex_process(df, \"text\")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "odf"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 15,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -325,40 +233,18 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 16,
+  "execution_count": null,
   "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/plain": [
-      "[1, 14, 2, 3, 4, 21, 23, 22, 5, 24]"
-     ]
-    },
-    "execution_count": 16,
-    "metadata": {},
-    "output_type": "execute_result"
-   }
-  ],
+  "outputs": [],
   "source": [
    "default_pre_steps_idx"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 17,
+  "execution_count": null,
   "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/plain": [
-      "[0, 17, 15, 19, 23, 22, 21, 24]"
-     ]
-    },
-    "execution_count": 17,
-    "metadata": {},
-    "output_type": "execute_result"
-   }
-  ],
+  "outputs": [],
   "source": [
    "default_post_steps_idx"
   ]
@@ -383,7 +269,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 14,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -392,7 +278,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 27,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -401,20 +287,9 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 28,
+  "execution_count": null,
   "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/plain": [
-      "'Mimmo '"
-     ]
-    },
-    "execution_count": 28,
-    "metadata": {},
-    "output_type": "execute_result"
-   }
-  ],
+  "outputs": [],
   "source": [
    "_re_non_words.sub(\" \", \"Mimmo23\")"
   ]
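
Read end to end, the notebook is a smoke test of the new Chinese path. A condensed sketch of the same check (assuming the repo layout above; note that with `Spacy lemmatizer (keep stopwords)` and Chinese, `make_nlp` and `make_lemma` both return `identity`, which is why the `pre` and `lemma` outputs in the notebook are identical):

    import pandas as pd

    from src.preprocessing import PreprocessingPipeline

    df = pd.read_excel("data/test_chinese.xlsx")

    pre_steps = ["normalize_useless_spaces"]        # any subset of pipeline_components()
    post_steps = ["remove_non_words", "lowercase"]  # remove_non_words gets swapped for Chinese

    pipe = PreprocessingPipeline(
        language="Chinese",
        lemmatization_step="Spacy lemmatizer (keep stopwords)",
        pre_steps=pre_steps,
        post_steps=post_steps,
    )
    # same call chain the notebook exercises cell by cell
    out = pipe.post(pipe.lemma(pipe.nlp(pipe.pre(df.text[0]))))
    print(out)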