Dmitry Chaplinsky committed on
Commit
09b4276
1 Parent(s): 2bb3de0

Trying to enable model pipeline

Browse files
Files changed (3) hide show
  1. README.md +12 -4
  2. pipeline.py +44 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,9 +1,17 @@
1
  ---
2
  language:
3
- - uk
4
  tags:
5
- - token-classification
 
 
6
  license: mit
7
  metrics:
8
- - f1
9
- ---
 
 
 
 
 
 
 
1
  ---
2
  language:
3
+ - uk
4
  tags:
5
+ - token-classification
6
+ - punctuation prediction
7
+ - punctuation
8
  license: mit
9
  metrics:
10
+ - f1
11
+ ---
12
+
13
+ # Ukrainian model to restore punctuation and capitalization
14
+
15
+ This is a NeMo model that restores punctuation and capitalization in sentences, trained on 10M+ sentences from the (as yet unreleased) UberText 2.0 corpus.
16
+
17
+ The model restores the following punctuation marks: [? . ,]
pipeline.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ from nemo.collections.nlp.models import PunctuationCapitalizationModel
3
+
4
class PreTrainedPipeline():
    """Token-classification style wrapper around a NeMo punctuation/capitalization model.

    Calling an instance with a string returns one entry per whitespace-separated
    token that the model labelled with anything other than ``"OO"``.
    """

    def __init__(self, path=""):
        """Load the model once so that every later call only runs inference.

        Args:
            path: Accepted for interface compatibility; it is not used, because
                the model is fetched by its pretrained name.
        """
        self.model = PunctuationCapitalizationModel.from_pretrained("dchaplinsky/punctuation_uk_bert")

    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
        """Label the tokens of ``inputs`` and report the ones that need a change.

        Args:
            inputs (:obj:`str`):
                a string containing some text

        Return:
            A :obj:`list` of dicts shaped like
            ``{"entity_group": "XXX", "word": "some word", "start": 3, "end": 6, "score": 1.0}``:

            - "entity_group": the label the model assigned to the token.
            - "word": the token, a substring of ``inputs``.
            - "start": offset of the token in ``inputs``; ``inputs[start:end] == word``.
            - "end": offset just past the token in ``inputs``.
            - "score": always ``1.0``; no per-token confidence is read from the model here.

            Tokens labelled ``"OO"`` are omitted (presumably "no punctuation, no
            capitalization" -- confirm against the NeMo label scheme). An empty or
            whitespace-only input yields ``[]`` without calling the model.
        """
        tokens = inputs.split()
        if not tokens:
            # Nothing to label; skip the model call entirely.
            return []

        # The model receives the trimmed text, exactly as before; its labels are
        # returned as a single whitespace-separated string, one label per token.
        labels = self.model.add_punctuation_capitalization(
            [inputs.strip()], return_labels=True
        )[0].split()

        res: List[Dict[str, Any]] = []
        cursor = 0
        for tok, lab in zip(tokens, labels):
            # Locate the token in the ORIGINAL string instead of assuming a single
            # space between tokens: split() collapses any run of whitespace, so
            # adding len(tok) + 1 drifts after double spaces, tabs or newlines.
            # Searching from the end of the previous token always lands on the
            # right occurrence because only whitespace can sit in between.
            start = inputs.index(tok, cursor)
            end = start + len(tok)
            cursor = end

            if lab != "OO":
                res.append({
                    "entity_group": lab,
                    "word": tok,
                    "start": start,
                    "end": end,
                    "score": 1.0,
                })

        return res
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ requirements.txt