felmateos committed
Commit
b2dc20d
1 Parent(s): 027387e

Fixed contraction format and added jupyter tagger version

Files changed (12)
  1. .dockerignore +24 -0
  2. .env.example +7 -0
  3. .gitattributes +1 -1
  4. .gitignore +5 -1
  5. Dockerfile +21 -0
  6. README.md +2 -2
  7. app.py +36 -71
  8. main.ipynb +0 -0
  9. main.py +132 -0
  10. preprocessing.py +57 -1
  11. requirements.txt +2 -0
  12. top.html +1 -1
.dockerignore ADDED
@@ -0,0 +1,24 @@
+ **/.classpath
+ **/.dockerignore
+ **/.git
+ **/.gitignore
+ **/.project
+ **/.settings
+ **/.toolstarget
+ **/.vs
+ **/.vscode
+ **/*.*proj.user
+ **/*.dbmdl
+ **/*.jfm
+ **/bin
+ **/charts
+ **/docker-compose*
+ **/compose*
+ **/Dockerfile*
+ **/node_modules
+ **/npm-debug.log
+ **/obj
+ **/secrets.dev.yaml
+ **/values.dev.yaml
+ LICENSE
+ README.md
.env.example ADDED
@@ -0,0 +1,7 @@
+ DEFAULT_MODEL=Model name here (News / Tweets (stock market) / Oil and Gas (academic texts) / Multigenre)
+ ID_COLUMN=name of the column holding the tweet ids
+ CONTENT_COLUMN=name of the column holding the tweet content
+ PREFIX=prefix to prepend to each tweet id
+ DATA_PATH=path to the tweets .csv file
+ OUTPUT_PATH=path to the output .conllu file
+ KEEP_REPLACE_CONTRACTION=whether the original form of the contractions should be kept (True/False)
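
For illustration, a hypothetical filled-in .env for a local run (every value is a placeholder: the model name must be one of the DEFAULT_MODEL options listed above, the column names must match the input CSV, and the paths mirror the /data and /output folders added to .gitignore below):

DEFAULT_MODEL=Tweets (stock market)
ID_COLUMN=tweet_id
CONTENT_COLUMN=content
PREFIX=dante_02
DATA_PATH=data/tweets.csv
OUTPUT_PATH=output/output.conllu
KEEP_REPLACE_CONTRACTION=True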
.gitattributes CHANGED
@@ -31,4 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -159,4 +159,8 @@ cython_debug/
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
  #.idea/
 
- *.conllu
+ *.conllu
+
+ /data
+ /output
+ .env
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ # Use the official Python 3.10 image as the base
+ FROM python:3.10-slim
+
+ # Set the working directory to /app
+ WORKDIR /app
+
+ # Copy the requirements file into the working directory
+ COPY requirements.txt .
+
+ # Install the Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the project files into the working directory
+ COPY . .
+
+ # Default command run when the container starts
+ CMD ["python", "main.py"]
+
+ # Run the following commands in order:
+ # docker build -t porttaggerdante .
+ # docker run -v "path to the output folder:/app/output" porttaggerdante
README.md CHANGED
@@ -1,7 +1,7 @@
  ---
- title: Porttagger
+ title: Porttagger-DANTE
  emoji: ✍️
- colorFrom: purple
+ colorFrom: orange
  colorTo: purple
  sdk: gradio
  sdk_version: 3.9.1
app.py CHANGED
@@ -7,10 +7,12 @@ import gradio as gr
  import pandas as pd
  import spacy
  import torch
+
  from dante_tokenizer import DanteTokenizer
+
  from transformers import AutoModelForTokenClassification, AutoTokenizer
 
- from preprocessing import expand_contractions
+ from preprocessing import *
 
  try:
      nlp = spacy.load("pt_core_news_sm")
@@ -19,7 +21,7 @@ except Exception:
      nlp = spacy.load("pt_core_news_sm")
  dt_tokenizer = DanteTokenizer()
 
- default_model = "News"
+ default_model = "Tweets (stock market)"
  model_choices = {
      "News": "Emanuel/porttagger-news-base",
      "Tweets (stock market)": "Emanuel/porttagger-tweets-base",
@@ -90,59 +92,37 @@ def predict(text, logger=None) -> Tuple[List[str], List[str]]:
      return tokens, labels, scores
 
 
- def text_analysis(text):
-     text = expand_contractions(text)
-     tokens, labels, scores = predict(text, logger)
-     if len(labels) != len(tokens):
-         m = len(tokens) - len(labels)
-         labels += [None] * m
-         scores += [0] * m
-     pos_count = pd.DataFrame(
-         {
-             "token": tokens,
-             "tag": labels,
-             "confidence": scores,
-         }
-     )
-     pos_tokens = []
-     for token, label in zip(tokens, labels):
-         pos_tokens.extend([(token, label), (" ", None)])
-
-     output_highlighted.update(visible=True)
-     output_df.update(visible=True)
-
-     return {
-         output_highlighted: output_highlighted.update(visible=True, value=(pos_tokens)),
-         output_df: output_df.update(visible=True, value=pos_count),
-     }
-
-
- def batch_analysis(input_file):
-     text = open(input_file.name, encoding="utf-8").read()
-     text = text.split("\n")
-     name = Path(input_file.name).stem
-     sents = []
-     for sent in text:
-         sub_sents = nlp(sent).sents
-         sub_sents = [str(_sent).strip() for _sent in sub_sents]
-         sents += sub_sents
+ def batch_analysis_csv(input_file, id_column: str='tweet_id', content_column: str='content', prefix: str='dante_02', keep_replace_contraction=True):
+     df = pd.read_csv(input_file.name, encoding='utf-8')
+     ids = df[id_column]
+     texts = df[content_column]
+     texts = texts.replace(r'\\n', ' ', regex=True)
+     texts = texts.apply(lambda x : x.strip())
      conllu_output = []
 
-     for i, sent in enumerate(sents):
-         sent = expand_contractions(sent)
-         conllu_output.append("# sent_id = {}-{}\n".format(name, i + 1))
+     for id, sent in zip(ids, texts):
+         conllu_output.append("# sent_id = {}_{}\n".format(prefix, id))
          conllu_output.append("# text = {}\n".format(sent))
-         tokens, labels, scores = predict(sent, logger)
-         for j, (token, label) in enumerate(zip(tokens, labels)):
-             conllu_output.append(
-                 "{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 5 + "\n"
-             )
+         tokens, labels, _ = predict(sent, logger)
+         tokens_labels = list(zip(tokens, labels))
+
+         for j, (token, label) in enumerate(tokens_labels):
+             try:
+                 contr = tokens_labels[j][0] + ' ' + tokens_labels[j+1][0]
+                 for expansion in expansions.keys():
+                     replace_str = expansions[expansion]
+                     match = re.match(expansion, contr, re.I)
+                     expansion = replace_keep_case(expansion, replace_str, contr)
+                     if match is not None:
+                         conllu_output.append("{}\t{}".format(str(j+1)+'-'+str(j+2), expansion) + "\t_" * 8 + "\n")
+                         break
+                 conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
+             except IndexError:
+                 conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
          conllu_output.append("\n")
-
      output_filename = "output.conllu"
-     with open(output_filename, "w") as out_f:
+     with open(output_filename, "w", encoding='utf-8') as out_f:
          out_f.writelines(conllu_output)
-
      return {output_file: output_file.update(visible=True, value=output_filename)}
 
 
@@ -154,26 +134,11 @@ with gr.Blocks(css=css) as demo:
      gr.HTML(top_html)
      select_model = gr.Dropdown(choices=list(model_choices.keys()), label="Tagger model", value=default_model)
      select_model.change(myapp.load_model, inputs=[select_model])
-     with gr.Tab("Single sentence"):
-         text = gr.Textbox(placeholder="Enter your text here...", label="Input")
-         examples = gr.Examples(
-             examples=[
-                 [
-                     "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
-                 ],
-                 [
-                     "Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."
-                 ],
-             ],
-             inputs=[text],
-             label="Select an example",
-         )
-         output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
-         output_df = gr.Dataframe(label="Tabular output", visible=False)
-         submit_btn = gr.Button("Tag it")
-         submit_btn.click(
-             fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
-         )
+
+     id_column = gr.Textbox(placeholder='tweet_id', label='Id column')
+     content_column = gr.Textbox(placeholder='content', label='Content column')
+     label_prefix = gr.Textbox(placeholder='dante_02', label='Label prefix')
+
      with gr.Tab("Multiple sentences"):
          gr.HTML(
              """
@@ -197,10 +162,10 @@
      output_file = gr.File(label="Tagged file", visible=False)
      submit_btn_batch = gr.Button("Tag it")
      submit_btn_batch.click(
-         fn=batch_analysis, inputs=input_file, outputs=output_file
+         fn=batch_analysis_csv, inputs=[input_file, id_column], outputs=output_file
      )
 
      gr.HTML(bottom_html)
 
 
- demo.launch(debug=True)
+ demo.launch(debug=True)
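
To make the new contraction handling concrete, here is a minimal, self-contained sketch (not part of the commit) of the inner loop of batch_analysis_csv: a token pair such as "de" + "o" is matched against the expansions table added to preprocessing.py further down and re-emitted as a CoNLL-U multiword-token range line. The sample tokens and tags are hypothetical, and re.sub stands in for replace_keep_case, which additionally preserves the original casing.

import re

# Two entries excerpted from the expansions table in preprocessing.py (see below).
expansions = {
    r'^de o(s)?$': r'do\g<1>',
    r'^em a(s)?$': r'na\g<1>',
}

# Hypothetical tagger output for the already-expanded phrase "de o carro".
tokens_labels = [("de", "ADP"), ("o", "DET"), ("carro", "NOUN")]

lines = []
for j, (token, label) in enumerate(tokens_labels):
    # Look one token ahead and test the pair against every expansion pattern.
    if j + 1 < len(tokens_labels):
        pair = token + " " + tokens_labels[j + 1][0]
        for pattern, contracted in expansions.items():
            if re.match(pattern, pair, re.I):
                # Emit the multiword-token range line (e.g. "1-2" with surface form "do")
                # before the two component tokens.
                surface = re.sub(pattern, contracted, pair, flags=re.I)
                lines.append("{}-{}\t{}".format(j + 1, j + 2, surface) + "\t_" * 8)
                break
    lines.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6)

print("\n".join(lines))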
main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,132 @@
+ import logging
+ import os
+ from typing import List, Tuple
+ import pandas as pd
+ import spacy
+ import torch
+ from dante_tokenizer import DanteTokenizer
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
+ from dotenv import dotenv_values
+
+ from dante_tokenizer.data.preprocessing import split_monetary_tokens, normalize_text, split_enclisis
+ from preprocessing import *
+
+ try:
+     nlp = spacy.load("pt_core_news_sm")
+ except Exception:
+     os.system("python -m spacy download pt_core_news_sm")
+     nlp = spacy.load("pt_core_news_sm")
+ dt_tokenizer = DanteTokenizer()
+
+ model_choices = {
+     "News": "Emanuel/porttagger-news-base",
+     "Tweets (stock market)": "Emanuel/porttagger-tweets-base",
+     "Oil and Gas (academic texts)": "Emanuel/porttagger-oilgas-base",
+     "Multigenre": "Emanuel/porttagger-base",
+ }
+ pre_tokenizers = {
+     "News": nlp,
+     "Tweets (stock market)": dt_tokenizer.tokenize,
+     "Oil and Gas (academic texts)": nlp,
+     "Multigenre": nlp,
+ }
+
+ env_vars = dotenv_values('.env')
+
+ for key, value in env_vars.items():
+     globals()[key] = value
+
+ logger = logging.getLogger()
+ logger.setLevel(logging.DEBUG)
+
+ class MyApp:
+     def __init__(self) -> None:
+         self.model = None
+         self.tokenizer = None
+         self.pre_tokenizer = None
+         self.load_model()
+
+     def load_model(self, model_name: str = DEFAULT_MODEL):
+         if model_name not in model_choices.keys():
+             logger.error("Selected model is not supported, resetting to the default model.")
+             model_name = DEFAULT_MODEL
+         self.model = AutoModelForTokenClassification.from_pretrained(model_choices[model_name])
+         self.tokenizer = AutoTokenizer.from_pretrained(model_choices[model_name])
+         self.pre_tokenizer = pre_tokenizers[model_name]
+
+ myapp = MyApp()
+
+ def predict(text, logger=None) -> Tuple[List[str], List[str]]:
+     doc = myapp.pre_tokenizer(text)
+     tokens = [token.text if not isinstance(token, str) else token for token in doc]
+
+     logger.info("Starting predictions for sentence: {}".format(text))
+     print("Using model {}".format(myapp.model.config.__dict__["_name_or_path"]))
+
+     input_tokens = myapp.tokenizer(
+         tokens,
+         return_tensors="pt",
+         is_split_into_words=True,
+         return_offsets_mapping=True,
+         return_special_tokens_mask=True,
+
+     )
+     output = myapp.model(input_tokens["input_ids"])
+
+     i_token = 0
+     labels = []
+     scores = []
+     for off, is_special_token, pred in zip(
+         input_tokens["offset_mapping"][0],
+         input_tokens["special_tokens_mask"][0],
+         output.logits[0],
+     ):
+         if is_special_token or off[0] > 0:
+             continue
+         label = myapp.model.config.__dict__["id2label"][int(pred.argmax(axis=-1))]
+         if logger is not None:
+             logger.info("{}, {}, {}".format(off, tokens[i_token], label))
+         labels.append(label)
+         scores.append(
+             "{:.2f}".format(100 * float(torch.softmax(pred, dim=-1).detach().max()))
+         )
+         i_token += 1
+
+     return tokens, labels, scores
+
+ def batch_analysis_csv(ID_COLUMN: str, CONTENT_COLUMN: str, DATA_PATH: str, PREFIX:str, OUTPUT_PATH: str, KEEP_REPLACE_CONTRACTION: bool):
+     df = pd.read_csv(DATA_PATH)
+     ids = df[ID_COLUMN]
+     texts = df[CONTENT_COLUMN]
+     texts = texts.replace(r'\\n', ' ', regex=True)  # replace literal '\n' markers with a space
+     texts = texts.apply(lambda x : x.strip())  # strip extra surrounding whitespace
+     conllu_output = []
+
+     for id, sent in zip(ids, texts):
+         conllu_output.append("# sent_id = {}_{}\n".format(PREFIX, id))
+         conllu_output.append("# text = {}\n".format(sent))
+         tokens, labels, _ = predict(sent, logger)
+         tokens_labels = list(zip(tokens, labels))
+
+         for j, (token, label) in enumerate(tokens_labels):
+             try:
+                 contr = tokens_labels[j][0] + ' ' + tokens_labels[j+1][0]
+                 for expansion in expansions.keys():
+                     replace_str = expansions[expansion]
+                     match = re.match(expansion, contr, re.IGNORECASE)
+                     expansion = replace_keep_case(expansion, replace_str, contr)
+                     if match is not None:
+                         conllu_output.append("{}\t{}".format(str(j+1)+'-'+str(j+2), expansion) + "\t_" * 8 + "\n")
+                         break
+                 conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
+             except IndexError:
+                 conllu_output.append("{}\t{}\t_\t{}".format(j + 1, token, label) + "\t_" * 6 + "\n")
+         conllu_output.append("\n")
+     with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_f:
+         out_f.writelines(conllu_output)
+
+ def main():
+     batch_analysis_csv(ID_COLUMN, CONTENT_COLUMN, DATA_PATH, PREFIX, OUTPUT_PATH, KEEP_REPLACE_CONTRACTION)
+
+ if __name__ == '__main__':
+     main()
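
For reference, a hypothetical fragment of the .conllu file that batch_analysis_csv writes to OUTPUT_PATH for a single (already expanded) tweet; the id, text, and tags are made up, and only the column layout follows the format strings above, with the 3-4 range line restoring the contracted surface form "da":

# sent_id = dante_02_1234567890
# text = Comprei ações de a Petrobras
1	Comprei	_	VERB	_	_	_	_	_	_
2	ações	_	NOUN	_	_	_	_	_	_
3-4	da	_	_	_	_	_	_	_	_
3	de	_	ADP	_	_	_	_	_	_
4	a	_	DET	_	_	_	_	_	_
5	Petrobras	_	PROPN	_	_	_	_	_	_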
preprocessing.py CHANGED
@@ -46,7 +46,7 @@ contractions = {
      r"(?<![\w.])aonde(?![$\w])": r"a onde",
      r"(?<![\w.])àquela(s)?(?![$\w])": r"a aquela\g<1>",
      r"(?<![\w.])àquele(s)?(?![$\w])": r"a aquele\g<1>",
-     r"(?<![\w.])àquilo(?![$\w])": r"a aquelo",
+     r"(?<![\w.])àquilo(?![$\w])": r"a aquilo",
      r"(?<![\w.])contigo(?![$\w])": r"com ti",
      r"(?<![\w.])né(?![$\w])": r"não é",
      r"(?<![\w.])comigo(?![$\w])": r"com mim",
@@ -58,6 +58,60 @@ contractions = {
  }
 
 
+ expansions = {
+     r'^em o(s)?$': r'no\g<1>',
+     r'^em a(s)?$': r'na\g<1>',
+     r'^de a(s)?$': r'da\g<1>',
+     r'^de o(s)?$': r'do\g<1>',
+     r'^a o(s)?$': r'ao\g<1>',
+     r'^a a(s)?$': r'à\g<1>',
+     r'^por a(s)?$': r'pela\g<1>',
+     r'^por o(s)?$': r'pelo\g<1>',
+     r'^em esta(s)?$': r'nesta\g<1>',
+     r'^em este(s)?$': r'neste\g<1>',
+     r'^em essa(s)?$': r'nessa\g<1>',
+     r'^em esse(s)?$': r'nesse\g<1>',
+     r'^em um$': r'num',
+     r'^em uns$': r'nuns',
+     r'^em uma(s)?$': r'numa\g<1>',
+     r'^em isso$': r'nisso',
+     r'^em aquele(s)?$': r'naquele\g<1>',
+     r'^em aquela(s)?$': r'naquela\g<1>',
+     r'^em aquilo$': r'naquilo',
+     r'^de uma(s)?$': r'duma\g<1>',
+     r'^de aqui$': r'daqui',
+     r'^de ali$': r'dali',
+     r'^de aquele(s)?$': r'daquele\g<1>',
+     r'^de aquela(s)?$': r'daquela\g<1>',
+     r'^de este(s)?$': r'deste\g<1>',
+     r'^de esta(s)?$': r'desta\g<1>',
+     r'^de esse(s)?$': r'desse\g<1>',
+     r'^de essa(s)?$': r'dessa\g<1>',
+     r'^de aí$': r'daí',
+     r'^de um$': r'dum',
+     r'^de onde$': r'donde',
+     r'^de isto$': r'disto',
+     r'^de isso$': r'disso',
+     r'^de aquilo$': r'daquilo',
+     r'^de ela(s)?$': r"dela\g<1>",
+     r'^de ele(s)?$': r"dele\g<1>",
+     r'^em isto$': r'nisto',
+     r'^em ele(s)?$': r'nele\g<1>',
+     r'^em ela(s)?$': r'nela\g<1>',
+     r'^em outro(s)?$': r'noutro\g<1>',
+     r'^a onde$': r'aonde',
+     r'^a aquela(s)?$': r'àquela\g<1>',
+     r'^a aquele(s)?$': r'àquele\g<1>',
+     r'^a aquilo$': r'àquilo',
+     r'^com ti$': r'contigo',
+     r'^não é$': r'né',
+     r'^com mim$': r'comigo',
+     r'^com nós$': r'conosco',
+     r'^com si$': r'consigo',
+     r'^para a$': r'pra',
+     r'^para o$': r'pro'
+ }
+
  def replace_keep_case(word, replacement, text):
      """
      Custom function for replace keeping the original case.
@@ -84,6 +138,8 @@ def replace_keep_case(word, replacement, text):
              return repl.capitalize()
          if g.isupper():
              return repl.upper()
+         if g[0].isupper():
+             return repl[0].upper() + repl[1:]
          return repl
 
      return re.sub(word, func, text, flags=re.I)
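
A small sanity-check sketch (not part of the commit) for the new case-preservation branch above; the inputs are illustrative, and the expected outputs follow from the branches visible in the diff:

from preprocessing import replace_keep_case

# The new `g[0].isupper()` branch keeps a leading capital when only the first
# letter of the matched span is upper-case.
print(replace_keep_case(r'^de aqui$', r'daqui', 'de aqui'))  # expected: daqui
print(replace_keep_case(r'^de aqui$', r'daqui', 'De aqui'))  # expected: Daqui (new branch)
print(replace_keep_case(r'^de aqui$', r'daqui', 'DE AQUI'))  # expected: DAQUI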
requirements.txt CHANGED
@@ -57,6 +57,7 @@ pydub==0.25.1
  PyNaCl==1.5.0
  pyparsing==3.0.9
  python-dateutil==2.8.2
+ python-dotenv==1.0.1
  python-multipart==0.0.5
  pytz==2022.6
  PyYAML==6.0
@@ -84,3 +85,4 @@ uvicorn==0.19.0
  wasabi==0.10.1
  websockets==10.4
  yarl==1.8.1
+ option==2.1.0
top.html CHANGED
@@ -1,7 +1,7 @@
  <div style="text-align: center; max-width: 650px; margin: 0 auto;">
    <div>
      <h1 style="font-weight: 900; font-size: 3rem; margin: 20px;">
-       Porttagger
+       PorttaggerDANTE
      </h1>
      <p class="slogan">A Brazilian Portuguese part of speech tagger according to the <a
        href="https://universaldependencies.org/">Universal Dependencies</a> model