remove steps
source/services/ner/steps/.gitkeep
ADDED
File without changes
source/services/ner/steps/steps.py
DELETED
@@ -1,106 +0,0 @@
-from typing import List, Union
-
-from spacy.tokens import Doc
-from configuration.config import settings
-from spacy.training import docs_to_json
-import pycrfsuite
-
-import srsly
-
-nlp = settings.spacy_pretrained_model_nl_md
-
-DOC2JSON_FT = ["id", "orth", "lemma", "ner"]
-
-
-def get_entity_prediction(lst_token: List, pred: List) -> List:
-    lst = []
-    for token, pred_token in zip(lst_token[0], pred[0]):
-        if pred_token != 'O':
-            lst.append((token[1], token[2], pred_token[2:]))
-    return lst
-
-
-def format_prediction(offsets: List, text: str, **kwargs) -> List:
-    if kwargs.get('model_name', None):
-        source = kwargs['model_name']
-    else:
-        source = "crf-broker"
-
-    lst = []
-    for pred_token in offsets:
-        lst.append({
-            "text": text[pred_token[0]:pred_token[1]],
-            "start": pred_token[0],
-            "end": pred_token[1],
-            "label": pred_token[2],
-            "source": source,
-            "score": 1.0})
-    return lst
-
-
-def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
-    """
-    Convert spaCy BILOU output to crfsuite BILOU format, i.e. [[(), ()...], [(), ()...]...].
-    The model is fed at document level, not sentence level:
-    doc['sentences'] represents one document, e.g. one email or one attachment.
-    Token format example:
-    {
-        "id": 10,
-        "orth": "Belgium",
-        "space": "",
-        "tag": "SPEC|deeleigen",
-        "pos": "PROPN",
-        "morph": "",
-        "lemma": "belgium",
-        "head": -1,
-        "dep": "flat",
-        "ner": "O"
-    }
-
-    """
-    lst_crf_docs = []
-    for doc in doc2json['paragraphs']:
-        lst_crf_doc = []
-        for sents in doc['sentences']:
-            sentence = [tuple({k: v for k, v in token.items() if k in DOC2JSON_FT}.values()) for token in
-                        sents['tokens']]
-            lst_crf_doc.extend(sentence)
-        lst_crf_docs.append(lst_crf_doc)
-    if save_path:
-        srsly.write_json(save_path, lst_crf_docs)
-    return lst_crf_docs
-
-
-def create_raw_data(input_text: str) -> tuple[List, List, Union[Doc, Doc]]:
-    doc = nlp(input_text)
-    lst_tokens = []
-    for token in doc:
-        lst_tokens.append((token.text, token.idx, token.idx + len(token.text)))
-
-    doc2json1 = docs_to_json(doc)
-    lst_data1 = convert_spacybilou_to_crfsuitebilou(doc2json=doc2json1)
-    return lst_data1, [lst_tokens], doc
-
-
-def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
-    X = [tokenfeatures.sent2features(s) for s in raw_data]
-    return X
-
-
-def load_crf_model(path: str) -> pycrfsuite.Tagger:
-    tagger = pycrfsuite.Tagger()
-    tagger.open(path)
-    return tagger
-
-
-def predictor(tagger, x: List) -> List:
-    """Runs prediction.
-    Args:
-        tagger: CRF tagger
-        x: input data
-    Returns:
-        prediction: List
-    """
-    y_pred = [tagger.tag(xseq) for xseq in x]
-    return y_pred
source/services/ner/train/train.py
CHANGED
@@ -251,7 +251,7 @@ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
 
 from transformers import get_scheduler
 
-num_train_epochs =
+num_train_epochs = 6
 num_update_steps_per_epoch = len(train_dataloader)
 num_training_steps = num_train_epochs * num_update_steps_per_epoch
 
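
The old line left num_train_epochs without a right-hand side, which is a Python syntax error; the change pins it to 6 epochs. The num_training_steps value computed from it is what get_scheduler (imported just above) expects downstream. A minimal usage sketch, assuming a linear schedule with no warmup; neither choice is visible in this hunk.

from transformers import get_scheduler

num_train_epochs = 6
num_update_steps_per_epoch = len(train_dataloader)  # one optimizer step per batch
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Schedule name and warmup count are assumptions, not shown in the diff.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)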