Tymec committed on
Commit
1414454
1 Parent(s): a0c00be

Parallelize text cleaning

Browse files
Files changed (2) hide show
  1. app/data.py +13 -4
  2. app/utils.py +1 -1
app/data.py CHANGED
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Literal, Sequence
9
  import emoji
10
  import pandas as pd
11
  import spacy
 
12
  from tqdm import tqdm
13
 
14
  from app.constants import (
@@ -160,16 +161,24 @@ def tokenize(
160
  Returns:
161
  Tokenized text data
162
  """
163
- text_data = [
164
- _clean(text)
 
 
 
 
 
 
 
 
 
165
  for text in tqdm(
166
  text_data,
167
  desc="Cleaning",
168
  unit="doc",
169
  disable=not show_progress,
170
  )
171
- ]
172
-
173
  return pd.Series(
174
  [
175
  _lemmatize(doc, character_threshold)
 
9
  import emoji
10
  import pandas as pd
11
  import spacy
12
+ from joblib import Parallel, delayed
13
  from tqdm import tqdm
14
 
15
  from app.constants import (
 
161
  Returns:
162
  Tokenized text data
163
  """
164
+ # text_data = [
165
+ # _clean(text)
166
+ # for text in tqdm(
167
+ # text_data,
168
+ # desc="Cleaning",
169
+ # unit="doc",
170
+ # disable=not show_progress,
171
+ # )
172
+ # ]
173
+ text_data = Parallel(n_jobs=n_jobs)(
174
+ delayed(_clean)(text)
175
  for text in tqdm(
176
  text_data,
177
  desc="Cleaning",
178
  unit="doc",
179
  disable=not show_progress,
180
  )
181
+ )
 
182
  return pd.Series(
183
  [
184
  _lemmatize(doc, character_threshold)
app/utils.py CHANGED
@@ -11,7 +11,7 @@ if TYPE_CHECKING:
11
  __all__ = ["serialize", "deserialize"]
12
 
13
 
14
- def serialize(data: Sequence[str | int], path: Path, max_size: int = 100000, show_progress: bool = False) -> None:
15
  """Serialize data to a file
16
 
17
  Args:
 
11
  __all__ = ["serialize", "deserialize"]
12
 
13
 
14
+ def serialize(data: Sequence[str | int], path: Path, max_size: int = 100_000, show_progress: bool = False) -> None:
15
  """Serialize data to a file
16
 
17
  Args: