aimlnerd committed on
Commit
1d6faef
•
1 Parent(s): b9d5349
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the docs: https://huggingface.co/docs/hub/spaces-sdks-docker
# You will also find guides there on how best to write your Dockerfile.

# Base image: the official Python image, pinned to 3.12.1.
FROM python:3.12.1

# Relative paths in the instructions below resolve against /code.
WORKDIR /code

# Copy only the dependency manifest first, so the pip layer below is rebuilt
# when requirements.txt changes rather than on every source edit.
COPY ./requirements.txt /code/requirements.txt

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the build context into the working directory.
COPY . .

# Serve the ASGI object `app` from module `app.main` on all interfaces, port 7860.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,10 @@
1
  ---
2
- title: Legal Entity Ner Transformers
3
- emoji: 🌖
4
- colorFrom: green
5
- colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Legal Entity Ner Crf
3
+ emoji: 🏆
4
+ colorFrom: red
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
configuration/.gitkeep ADDED
File without changes
configuration/__init__.py ADDED
File without changes
configuration/config.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from functools import lru_cache
3
+ from pydantic import BaseSettings, Field
4
+ from source.datamodel.common import CountryCode, LineOfBusiness
5
+ from source.datamodel.annotation_ranking import Weights, WeightCatalog
6
+
7
+
8
class Settings(BaseSettings):
    """Central configuration of the service.

    Declared as a pydantic ``BaseSettings`` subclass; every attribute below
    carries its default value.
    """

    # --- Server ---------------------------------------------------------
    SERVER_HOST: str = '0.0.0.0'
    PORT: int = 3000
    # NOTE(review): the next two attributes have no type annotation, unlike
    # the other scalar settings in this class -- confirm this is intentional.
    STOP_TIMEOUT = 120
    SLEEP_DURATION = 1e-4  # 0.1 ms sleep
    APP_NAME: str = "MIRA MODELS"

    # --- Model locations ------------------------------------------------
    # All paths below are relative strings; presumably resolved against the
    # process working directory -- verify against the code that opens them.
    MIRA_MODELS_BLOB_PATH: str = "Mira/ml_models"
    LOCAL_MIRA_MODELS: str = "ml_models"
    MIRA_INTENT_MODEL: str = "ml_models/intent_classifier/2021-04-09"
    MARINE_NL_NER_MODEL: str = "ml_models/ner_marine_nl/2021-04-09"
    MARINE_NL_RB_MODEL: str = "ml_models/ner_marine_nl/rule_based_annotator/rb_annotator.pkl"
    PROPERTY_NL_NER_MODEL: str = "ml_models/ner_property_nl/ner_v10"
    PROPERTY_BE_NER_MODEL: str = "ml_models/ner_property_be/ner_v10"
    # These two are the only settings that name their environment variable
    # explicitly through ``Field(..., env=...)``.
    PROPERTY_BE_UW_MODEL: str = Field("ml_models/ner_property_be/uw_property_be_dev", env='PROPERTY_BE_UW_MODEL')
    PROPERTY_NL_UW_MODEL: str = Field("ml_models/ner_property_nl/uw_property_nl_dev", env='PROPERTY_NL_UW_MODEL')
    ADDRESS_DETECTION_LAXONS: str = "ml_models/address_detection/laxons.json"
    ADDRESS_DETECTION_TERMS: str = "ml_models/address_detection/terms.json"
    ADDRESS_DETECTION_BROKER_ADDRESSES: str = "ml_models/address_detection/broker_addresses.json"
    LAYOUTLM_MODEL: str = "ml_models/layoutlm/layoutlm_model.pth"
    LAYOUTLM_LABEL_MAPPING: str = "ml_models/layoutlm/labels_mapping.json"
    LAYOUTLM_TOKENIZER: str = "ml_models/layoutlm/tokenizer"

    # --- Thresholds and limits (units are not stated in this file) -------
    ADDRESS_DETECTION_MAX_LEN: int = 60
    ADDRESS_INDEX_MIN: int = 40
    DEEPPARSE_ROOT_DIR: str = "ml_models/deepparse"
    TSI_THRESHOLD: int = 100000

    # --- CRF models -------------------------------------------------------
    # The *_POSITION values are presumably indexes into the per-token tuples
    # fed to the CRF feature extractor -- verify against the caller.
    BROKER_MODEL: dict = {
        'CRF_BROKER_MODEL_PATH': r"source/services/ner_crf/model/crf/30_Nov_2023-14h-broker_pycrf.crfsuite",
        'WORD_POSITION': 1,
        #'POS_POSITION': 2,
        'LEMMA_POSITION': 2,
        #'NER_POSITION': 3
    }
    si_model: dict = {
        'CRF_SI_MODEL_PATH': r"ml_models/si/crf_23_Jun_2022-11h_inclu_lemma_n_amount_with_eur_gt10k_amount.joblib",
        'WORD_POSITION': 1,
        'LEMMA_POSITION': 2,
        'NER_POSITION': 3,
        'POS_POSITION': 4
    }

    # --- spaCy pipeline ---------------------------------------------------
    # NOTE(review): this call runs while the class body is executed, i.e. when
    # this module is imported, and it is an un-annotated class attribute --
    # confirm pydantic handles it as intended.
    #spacy_pretrained_model_nl_sm = spacy.load('nl_core_news_sm')
    spacy_pretrained_model_nl_md = spacy.load('nl_core_news_md')

    # --- LayoutLM -----------------------------------------------------------
    layoutlm_config: dict = {'local_rank': -1,
                             'overwrite_cache': True,
                             'max_seq_length': 512,
                             'model_type': 'layoutlm',
                             'cls_token_box': [0, 0, 0, 0],
                             'sep_token_box': [1000, 1000, 1000, 1000],
                             'pad_token_box': [0, 0, 0, 0]}
56
+
57
+
58
def loss_ratio_params():
    """Return the connection parameters for the claim-experience-risk-level endpoint.

    Each value can be overridden through an environment variable
    (``LOSS_RATIO_URL``, ``LOSS_RATIO_LOGIN``, ``LOSS_RATIO_PASSWORD``) so that
    real credentials do not have to live in source control. When a variable is
    unset, the previously hard-coded value is returned, so existing callers
    keep working unchanged.

    Returns:
        tuple: ``(url, login, pw)`` as three strings.
    """
    import os  # local import keeps this function self-contained

    # SECURITY: the fallback login/password are development defaults that are
    # checked into the repository; override them via the environment in any
    # real deployment.
    url = os.environ.get("LOSS_RATIO_URL", "http://0.0.0.0:3000/claim-experience-risk-level/")
    login = os.environ.get("LOSS_RATIO_LOGIN", "clerk")
    pw = os.environ.get("LOSS_RATIO_PASSWORD", "asdfgh")
    return url, login, pw
63
+
64
+
65
@lru_cache()
def get_weight_catalog():
    """Build the weight catalog; the result is cached after the first call.

    For the property line of business, registers one ``Weights`` entry per
    entity label ('POLICYHOLDER', 'BROKER') for Belgium and for the
    Netherlands.

    Returns:
        The populated ``WeightCatalog``. Because of ``lru_cache`` every call
        returns the same instance.
    """
    # Both countries use the same weights per entity label.
    # NOTE(review): the BROKER weights add up to 0.9 whereas the POLICYHOLDER
    # weights add up to 1.0 -- confirm this is intended.
    weights_by_label = (
        ('POLICYHOLDER', {'subject': 0.7, 'body': 0.2, 'attachment': 0.1}),
        ('BROKER', {'subject': 0.1, 'body': 0.6, 'attachment': 0.2}),
    )

    catalog = WeightCatalog()
    for country in (CountryCode.belgium, CountryCode.netherlands):
        for label, parts in weights_by_label:
            # A fresh Weights object is built for every catalog entry.
            catalog.set_weights(LineOfBusiness.property, country, label, Weights(**parts))
    return catalog
86
+
87
+
88
@lru_cache()
def get_settings():
    """Return the shared ``Settings`` instance.

    ``lru_cache`` on this zero-argument function means ``Settings()`` is
    constructed on the first call only; later calls return that same object.
    """
    cached_settings = Settings()
    return cached_settings
91
+
92
+
93
+ # Instantiate the settings
94
+ settings = get_settings()
source/services/ner/__init__.py ADDED
File without changes
source/services/ner/model/hf_tokenclassification/.gitkeep ADDED
File without changes
source/services/ner/steps/steps.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ from spacy.tokens import Doc
4
+ from configuration.config import settings
5
+ from spacy.training import docs_to_json
6
+ import pycrfsuite
7
+
8
+ import srsly
9
+
10
+ nlp = settings.spacy_pretrained_model_nl_md
11
+
12
+ DOC2JSON_FT = ["id", "orth", "lemma", "ner"]
13
+
14
+
15
def get_entity_prediction(lst_token: List, pred: List) -> List:
    """Pair the tokens of the first document with their predicted tags.

    Only index 0 of both arguments is read, i.e. a single document is
    processed per call.

    Args:
        lst_token: list of documents; each document is a list of token tuples
            whose items 1 and 2 are copied to the output (assumed to be the
            start and end character offsets -- verify against the caller).
        pred: list of tag sequences aligned with ``lst_token``; a tag of
            ``'O'`` marks a token without an entity.

    Returns:
        A list of ``(token[1], token[2], label)`` tuples for every token whose
        tag is not ``'O'``. ``label`` is the tag minus its first two
        characters (e.g. ``'B-ORG'`` -> ``'ORG'``). Empty input yields ``[]``.
    """
    # Guard against empty input instead of failing with IndexError on [0].
    if not lst_token or not pred:
        return []
    return [
        (token[1], token[2], tag[2:])
        for token, tag in zip(lst_token[0], pred[0])
        if tag != 'O'
    ]
21
+
22
+
23
def format_prediction(offsets: List, text: str, **kwargs) -> List:
    """Turn entity offsets into the dictionaries returned to the client.

    Args:
        offsets: iterable of sequences whose first three items are
            ``(start, end, label)``.
        text: the text the offsets index into.
        **kwargs: optional ``model_name``; when present and truthy it is used
            as the ``"source"`` of every entry, otherwise ``"crf-broker"``.

    Returns:
        One dict per offset with keys ``text``, ``start``, ``end``, ``label``,
        ``source`` and ``score`` (always ``1.0``).
    """
    # A missing or falsy model_name falls back to the default source name.
    source = kwargs.get('model_name') or "crf-broker"

    return [
        {
            "text": text[span[0]:span[1]],
            "start": span[0],
            "end": span[1],
            "label": span[2],
            "source": source,
            "score": 1.0,
        }
        for span in offsets
    ]
39
+
40
+
41
def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
    """Convert spaCy ``docs_to_json`` output into per-document token tuples.

    The result has the shape ``[[(), (), ...], [(), (), ...], ...]``: one inner
    list per entry of ``doc2json['paragraphs']``, holding one tuple per token.
    The model is fed at document level, not sentence level, so the tokens of
    all sentences of a paragraph are concatenated (a paragraph represents one
    document, e.g. one email or one attachment).

    Each tuple holds the token's values for the keys listed in
    ``DOC2JSON_FT``, in the order of ``DOC2JSON_FT``; keys absent from a token
    are skipped. Example token as found in ``sentence['tokens']``::

        {
            "id": 10,
            "orth": "Belgium",
            "space": "",
            "tag": "SPEC|deeleigen",
            "pos": "PROPN",
            "morph": "",
            "lemma": "belgium",
            "head": -1,
            "dep": "flat",
            "ner": "O"
        }

    Args:
        doc2json: dict with a ``'paragraphs'`` list, each paragraph having a
            ``'sentences'`` list, each sentence a ``'tokens'`` list of dicts.
        save_path: when truthy, the result is also written there as JSON via
            ``srsly.write_json``.

    Returns:
        The list of per-document token-tuple lists.
    """
    lst_crf_docs = []
    for paragraph in doc2json['paragraphs']:
        crf_doc = []
        for sentence in paragraph['sentences']:
            # Pick the fields in DOC2JSON_FT order explicitly: positions inside
            # the tuple matter downstream, so they must not depend on the key
            # order of the incoming token dict.
            crf_doc.extend(
                tuple(token[field] for field in DOC2JSON_FT if field in token)
                for token in sentence['tokens']
            )
        lst_crf_docs.append(crf_doc)
    if save_path:
        srsly.write_json(save_path, lst_crf_docs)
    return lst_crf_docs
72
+
73
+
74
def create_raw_data(input_text: str) -> tuple[List, List, Doc]:
    """Run the spaCy pipeline on ``input_text`` and prepare the CRF inputs.

    Args:
        input_text: raw text of one document.

    Returns:
        A 3-tuple ``(lst_data, [lst_tokens], doc)`` where
        ``lst_data`` is the output of ``convert_spacybilou_to_crfsuitebilou``
        for this document, ``lst_tokens`` is a list of
        ``(text, start_char, end_char)`` per token (wrapped in a one-element
        list), and ``doc`` is the spaCy ``Doc``.
    """
    doc = nlp(input_text)
    # token.idx is the token's character offset in the original text.
    lst_tokens = [
        (token.text, token.idx, token.idx + len(token.text))
        for token in doc
    ]

    doc_as_json = docs_to_json(doc)
    lst_data = convert_spacybilou_to_crfsuitebilou(doc2json=doc_as_json)
    return lst_data, [lst_tokens], doc
83
+
84
+
85
def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
    """Map every sequence in ``raw_data`` through ``tokenfeatures.sent2features``.

    Args:
        raw_data: list of token sequences.
        tokenfeatures: object exposing ``sent2features(sequence)``.

    Returns:
        A list with one ``sent2features`` result per input sequence.
    """
    return [tokenfeatures.sent2features(sequence) for sequence in raw_data]
88
+
89
+
90
def load_crf_model(path: str) -> pycrfsuite.Tagger:
    """Create a ``pycrfsuite.Tagger`` and open the model file at ``path``.

    Args:
        path: location of the CRFsuite model file.

    Returns:
        The tagger with the model opened.
    """
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open(path)
    return crf_tagger
94
+
95
+
96
def predictor(tagger, x: List) -> List:
    """Tag every feature sequence in ``x``.

    Args:
        tagger: CRF tagger exposing ``tag(sequence)``.
        x: list of feature sequences.

    Returns:
        List with one ``tagger.tag`` result per input sequence.
    """
    return [tagger.tag(sequence) for sequence in x]
source/services/ner/train/__init__.py ADDED
File without changes
source/services/ner/utils/__init__.py ADDED
File without changes
tests/.gitkeep ADDED
File without changes
tests/ner/.gitkeep ADDED
File without changes