Tymec committed on
Commit
228859a
·
1 Parent(s): 71069d7

Refactor typing and update tokenization rules

Browse files
Files changed (2) hide show
  1. app/data.py +22 -22
  2. app/utils.py +3 -3
app/data.py CHANGED
@@ -1,7 +1,7 @@
1
  from __future__ import annotations
2
 
3
  import bz2
4
- from typing import TYPE_CHECKING, Literal
5
 
6
  import pandas as pd
7
  import spacy
@@ -25,17 +25,17 @@ __all__ = ["load_data", "tokenize"]
25
 
26
 
27
  try:
28
- nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "ner"])
29
  except OSError:
30
  print("Downloading spaCy model...")
31
 
32
  from spacy.cli import download as spacy_download
33
 
34
  spacy_download("en_core_web_sm")
35
- nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "ner"])
36
 
37
 
38
- def _lemmatize(doc: Doc, threshold: int = 2) -> list[str]:
39
  """Lemmatize the provided text using spaCy.
40
 
41
  Args:
@@ -43,27 +43,25 @@ def _lemmatize(doc: Doc, threshold: int = 2) -> list[str]:
43
  threshold: Minimum character length of tokens
44
 
45
  Returns:
46
- Lemmatized text
47
  """
48
  return [
49
  token.lemma_.lower().strip()
50
  for token in doc
51
- if not token.is_stop
52
- and not token.is_punct
53
- and not token.like_email
54
- and not token.like_url
55
- and not token.like_num
56
- and not (len(token.lemma_) < threshold)
57
  ]
58
 
59
 
60
  def tokenize(
61
- text_data: list[str],
62
  batch_size: int = 512,
63
  n_jobs: int = 4,
64
  character_threshold: int = 2,
65
  show_progress: bool = True,
66
- ) -> list[list[str]]:
67
  """Tokenize the provided text using spaCy.
68
 
69
  Args:
@@ -76,15 +74,17 @@ def tokenize(
76
  Returns:
77
  Tokenized text data
78
  """
79
- return [
80
- _lemmatize(doc, character_threshold)
81
- for doc in tqdm(
82
- nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
83
- total=len(text_data),
84
- disable=not show_progress,
85
- unit="doc",
86
- )
87
- ]
 
 
88
 
89
 
90
  def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
 
1
  from __future__ import annotations
2
 
3
  import bz2
4
+ from typing import TYPE_CHECKING, Literal, Sequence
5
 
6
  import pandas as pd
7
  import spacy
 
25
 
26
 
27
  try:
28
+ nlp = spacy.load("en_core_web_sm")
29
  except OSError:
30
  print("Downloading spaCy model...")
31
 
32
  from spacy.cli import download as spacy_download
33
 
34
  spacy_download("en_core_web_sm")
35
+ nlp = spacy.load("en_core_web_sm")
36
 
37
 
38
+ def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
39
  """Lemmatize the provided text using spaCy.
40
 
41
  Args:
 
43
  threshold: Minimum character length of tokens
44
 
45
  Returns:
46
+ Sequence of lemmatized tokens
47
  """
48
  return [
49
  token.lemma_.lower().strip()
50
  for token in doc
51
+ if not token.is_stop # Ignore stop words
52
+ and not token.is_punct # Ignore punctuation
53
+ and not token.is_alpha # Ignore non-alphabetic tokens
54
+ and not (len(token.lemma_) < threshold) # Ignore short tokens
 
 
55
  ]
56
 
57
 
58
  def tokenize(
59
+ text_data: Sequence[str],
60
  batch_size: int = 512,
61
  n_jobs: int = 4,
62
  character_threshold: int = 2,
63
  show_progress: bool = True,
64
+ ) -> Sequence[Sequence[str]]:
65
  """Tokenize the provided text using spaCy.
66
 
67
  Args:
 
74
  Returns:
75
  Tokenized text data
76
  """
77
+ return pd.Series(
78
+ [
79
+ _lemmatize(doc, character_threshold)
80
+ for doc in tqdm(
81
+ nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner", "tok2vec"]),
82
+ total=len(text_data),
83
+ disable=not show_progress,
84
+ unit="doc",
85
+ )
86
+ ],
87
+ )
88
 
89
 
90
  def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
app/utils.py CHANGED
@@ -1,6 +1,6 @@
1
  from __future__ import annotations
2
 
3
- from typing import TYPE_CHECKING
4
 
5
  import joblib
6
  from tqdm import tqdm
@@ -11,7 +11,7 @@ if TYPE_CHECKING:
11
  __all__ = ["serialize", "deserialize"]
12
 
13
 
14
- def serialize(data: list[list[str]], path: Path, max_size: int = 400) -> None:
15
  """Serialize data to a file
16
 
17
  Args:
@@ -26,7 +26,7 @@ def serialize(data: list[list[str]], path: Path, max_size: int = 400) -> None:
26
  joblib.dump(chunk, f, compress=3)
27
 
28
 
29
- def deserialize(path: Path) -> list[list[str]]:
30
  """Deserialize data from a file
31
 
32
  Args:
 
1
  from __future__ import annotations
2
 
3
+ from typing import TYPE_CHECKING, Sequence
4
 
5
  import joblib
6
  from tqdm import tqdm
 
11
  __all__ = ["serialize", "deserialize"]
12
 
13
 
14
+ def serialize(data: Sequence[str], path: Path, max_size: int = 100000) -> None:
15
  """Serialize data to a file
16
 
17
  Args:
 
26
  joblib.dump(chunk, f, compress=3)
27
 
28
 
29
+ def deserialize(path: Path) -> Sequence[str]:
30
  """Deserialize data from a file
31
 
32
  Args: