aimlnerd committed on
Commit 35f2a2f
1 Parent(s): f7558d2

remove steps

source/services/ner/steps/.gitkeep ADDED
File without changes
source/services/ner/steps/steps.py DELETED
@@ -1,106 +0,0 @@
- from typing import List
-
- from spacy.tokens import Doc
- from configuration.config import settings
- from spacy.training import docs_to_json
- import pycrfsuite
-
- import srsly
-
- nlp = settings.spacy_pretrained_model_nl_md
-
- DOC2JSON_FT = ["id", "orth", "lemma", "ner"]
-
-
- def get_entity_prediction(lst_token: List, pred: List) -> List:
-     lst = []
-     for token, pred_token in zip(lst_token[0], pred[0]):
-         if pred_token != 'O':
-             lst.append((token[1], token[2], pred_token[2:]))
-     return lst
-
-
- def format_prediction(offsets: List, text: str, **kwargs) -> List:
-     if kwargs.get('model_name', None):
-         source = kwargs['model_name']
-     else:
-         source = "crf-broker"
-
-     lst = []
-     for pred_token in offsets:
-         lst.append({
-             "text": text[pred_token[0]:pred_token[1]],
-             "start": pred_token[0],
-             "end": pred_token[1],
-             "label": pred_token[2],
-             "source": source,
-             "score": 1.0})
-     return lst
-
-
- def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
-     """
-     Convert spaCy BILOU doc2json output to the python-crfsuite format, i.e. [[(), ()...], [(), ()...]...],
-     with one tuple per token such as (10, "Belgium", "belgium", "O").
-     The model is fed at document level, not sentence level:
-     doc['sentences'] represents one document, e.g. 1 email or 1 attachment.
-     Token format example:
-     {
-         "id": 10,
-         "orth": "Belgium",
-         "space": "",
-         "tag": "SPEC|deeleigen",
-         "pos": "PROPN",
-         "morph": "",
-         "lemma": "belgium",
-         "head": -1,
-         "dep": "flat",
-         "ner": "O"
-     }
-     """
-     lst_crf_docs = []
-     for doc in doc2json['paragraphs']:
-         lst_crf_doc = []
-         for sents in doc['sentences']:
-             sentence = [tuple({k: v for k, v in token.items() if k in DOC2JSON_FT}.values())
-                         for token in sents['tokens']]
-             lst_crf_doc.extend(sentence)
-         lst_crf_docs.append(lst_crf_doc)
-     if save_path:
-         srsly.write_json(save_path, lst_crf_docs)
-     return lst_crf_docs
-
-
- def create_raw_data(input_text: str) -> tuple[List, List, Doc]:
-     doc = nlp(input_text)
-     lst_tokens = []
-     for token in doc:
-         lst_tokens.append((token.text, token.idx, token.idx + len(token.text)))
-
-     doc2json1 = docs_to_json(doc)
-     lst_data1 = convert_spacybilou_to_crfsuitebilou(doc2json=doc2json1)
-     return lst_data1, [lst_tokens], doc
-
-
- def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
-     X = [tokenfeatures.sent2features(s) for s in raw_data]
-     return X
-
-
- def load_crf_model(path: str) -> pycrfsuite.Tagger:
-     tagger = pycrfsuite.Tagger()
-     tagger.open(path)
-     return tagger
-
-
- def predictor(tagger, x: List) -> List:
-     """Runs prediction.
-     Args:
-         tagger: CRF
-         x: input data
-     Returns:
-         prediction: List
-     """
-     y_pred = [tagger.tag(xseq) for xseq in x]
-     return y_pred
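For reference, the removed helpers composed into a single CRF prediction pipeline: tokenize with spaCy, convert to crfsuite rows, featurize, tag, then map labels back to character offsets. A minimal sketch of that flow, assuming a hypothetical tokenfeatures object exposing the sent2features method the code expects and an assumed model path; only the call chain itself comes from the deleted file:

# Sketch only: tokenfeatures and the .crfsuite path are assumptions.
text = "Jan woont in Belgium."
raw_data, lst_tokens, doc = create_raw_data(text)       # spaCy doc -> CRF token rows
X = token_feature_engineering(raw_data, tokenfeatures)  # per-token feature dicts
tagger = load_crf_model("models/crf_broker.crfsuite")   # assumed model path
y_pred = predictor(tagger, X)                           # one BILOU tag per token
offsets = get_entity_prediction(lst_tokens, y_pred)     # (start, end, label) triples
entities = format_prediction(offsets, text, model_name="crf-broker")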
source/services/ner/train/train.py CHANGED
@@ -251,7 +251,7 @@ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(

  from transformers import get_scheduler

- num_train_epochs = 3
+ num_train_epochs = 6
  num_update_steps_per_epoch = len(train_dataloader)
  num_training_steps = num_train_epochs * num_update_steps_per_epoch
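In the standard Transformers/Accelerate loop this hunk belongs to, num_training_steps sizes the learning-rate schedule, so raising num_train_epochs from 3 to 6 doubles the decay horizon rather than truncating it. A sketch of how the count typically feeds get_scheduler on the following lines; the scheduler name and warmup count are assumptions, not shown in this hunk:

# Assumed continuation of the hunk above.
lr_scheduler = get_scheduler(
    "linear",                                # assumed scheduler type
    optimizer=optimizer,
    num_warmup_steps=0,                      # assumed: no warmup
    num_training_steps=num_training_steps,
)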