Update README.md
Browse files
README.md
CHANGED
@@ -15,7 +15,7 @@ This is a reduced version of the Portuguese capitalisation and punctuation resto
|
|
15 |
|
16 |
You can try the model in the following [SPACE](https://huggingface.co/spaces/VOCALINLP/punctuation_and_capitalization_restoration_sanivert)
|
17 |
## Details of the dataset
|
18 |
-
|
19 |
|
20 |
## Evaluation Metrics
|
21 |
|
@@ -23,6 +23,9 @@ You can try the model in the following [SPACE](https://huggingface.co/spaces/VOC
|
|
23 |
This work was funded by the Spanish Government, the Spanish Ministry of Economy and Digital Transformation through the "Recovery, Transformation and Resilience Plan", and also by the European Union NextGenerationEU/PRTR through the research project 2021/C005/0015007.
|
24 |
|
25 |
## How to use the model
|
|
|
|
|
|
|
26 |
|
27 |
```py
|
28 |
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
@@ -30,31 +33,38 @@ import torch
|
|
30 |
|
31 |
def get_result_text_es_pt (list_entity, text, lang):
|
32 |
result_words = []
|
|
|
33 |
if lang == "es":
|
34 |
punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
|
35 |
else:
|
36 |
punc_tags = ['?', '!', ',', '.', ':']
|
37 |
-
|
38 |
-
for entity in list_entity:
|
39 |
tag = entity["entity"]
|
40 |
word = entity["word"]
|
41 |
start = entity["start"]
|
42 |
end = entity["end"]
|
43 |
-
|
44 |
# check punctuation
|
45 |
punc_in = next((p for p in punc_tags if p in tag), "")
|
46 |
-
|
47 |
subword = False
|
48 |
# check subwords
|
49 |
-
if word[0] == "#":
|
50 |
subword = True
|
51 |
-
if
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
word =
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
elif tag == "u":
|
59 |
word = word.capitalize()
|
60 |
# case with punctuation
|
@@ -62,9 +72,12 @@ def get_result_text_es_pt (list_entity, text, lang):
|
|
62 |
if tag[-1] == "l":
|
63 |
word = (punc_in + word) if punc_in in ["¿", "¡"] else (word + punc_in)
|
64 |
elif tag[-1] == "u":
|
65 |
-
word = (punc_in + word.capitalize()) if punc_in in ["¿", "¡"] else (word.capitalize() + punc_in)
|
66 |
-
|
67 |
-
if
|
|
|
|
|
|
|
68 |
result_words[-1] = word
|
69 |
else:
|
70 |
result_words.append(word)
|
|
|
15 |
|
16 |
You can try the model in the following [SPACE](https://huggingface.co/spaces/VOCALINLP/punctuation_and_capitalization_restoration_sanivert)
|
17 |
## Details of the dataset
|
18 |
+
The model was fine-tuned for punctuation restoration using clinical reports and the OpusParaCrawl dataset.
|
19 |
|
20 |
## Evaluation Metrics
|
21 |
|
|
|
23 |
This work was funded by the Spanish Government, the Spanish Ministry of Economy and Digital Transformation through the "Recovery, Transformation and Resilience Plan", and also by the European Union NextGenerationEU/PRTR through the research project 2021/C005/0015007.
|
24 |
|
25 |
## How to use the model
|
26 |
+
The metrics used to evaluate the model are the Macro and Weighted F1 scores.
|
27 |
+
|
28 |
+
|
29 |
|
30 |
```py
|
31 |
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
|
|
33 |
|
34 |
def get_result_text_es_pt (list_entity, text, lang):
|
35 |
result_words = []
|
36 |
+
tmp_word = ""
|
37 |
if lang == "es":
|
38 |
punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
|
39 |
else:
|
40 |
punc_tags = ['?', '!', ',', '.', ':']
|
41 |
+
|
42 |
+
for idx, entity in enumerate(list_entity):
|
43 |
tag = entity["entity"]
|
44 |
word = entity["word"]
|
45 |
start = entity["start"]
|
46 |
end = entity["end"]
|
47 |
+
|
48 |
# check punctuation
|
49 |
punc_in = next((p for p in punc_tags if p in tag), "")
|
50 |
+
|
51 |
subword = False
|
52 |
# check subwords
|
53 |
+
if word[0] == "#":
|
54 |
subword = True
|
55 |
+
if tmp_word == "":
|
56 |
+
p_s = list_entity[idx-1]["start"]
|
57 |
+
p_e = list_entity[idx-1]["end"]
|
58 |
+
tmp_word = text[p_s:p_e] + text[start:end]
|
59 |
+
else:
|
60 |
+
tmp_word = tmp_word + text[start:end]
|
61 |
+
word = tmp_word
|
62 |
+
else:
|
63 |
+
tmp_word = ""
|
64 |
+
word = text[start:end]
|
65 |
+
|
66 |
+
if tag == "l":
|
67 |
+
word = word
|
68 |
elif tag == "u":
|
69 |
word = word.capitalize()
|
70 |
# case with punctuation
|
|
|
72 |
if tag[-1] == "l":
|
73 |
word = (punc_in + word) if punc_in in ["¿", "¡"] else (word + punc_in)
|
74 |
elif tag[-1] == "u":
|
75 |
+
word = (punc_in + word.capitalize()) if punc_in in ["¿", "¡"] else (word.capitalize() + punc_in)
|
76 |
+
|
77 |
+
if tag != "l":
|
78 |
+
word = '<span style="font-weight:bold; color:rgb(142, 208, 129);">' + word + '</span>'
|
79 |
+
|
80 |
+
if subword == True:
|
81 |
result_words[-1] = word
|
82 |
else:
|
83 |
result_words.append(word)
|