fnavales committed on
Commit
2a04af3
1 Parent(s): c239532

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -5,9 +5,10 @@ from transformers import BertTokenizerFast as BertTokenizer, BertModel
5
  import pytorch_lightning as pl
6
 
7
 
8
- BERT_MODEL_NAME = 'bert-base-cased'
9
  tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
10
  LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
 
11
 
12
 
13
  class ToxicCommentTagger(pl.LightningModule):
@@ -36,7 +37,7 @@ def predict(model, tokenizer, sentence):
36
  encoding = tokenizer.encode_plus(
37
  sentence,
38
  add_special_tokens=False,
39
- max_length=510,
40
  return_token_type_ids=False,
41
  padding="max_length",
42
  return_attention_mask=True,
@@ -44,7 +45,7 @@ def predict(model, tokenizer, sentence):
44
  )
45
 
46
  # define target chunksize
47
- chunksize = 512
48
 
49
  # split into chunks of 510 tokens, we also convert to list (default is tuple which is immutable)
50
  input_id_chunks = list(encoding['input_ids'][0].split(chunksize - 2))
 
5
  import pytorch_lightning as pl
6
 
7
 
8
+ BERT_MODEL_NAME = 'bert-base-uncased'
9
  tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
10
  LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
11
+ MAX_TOKEN_COUNT = 300
12
 
13
 
14
  class ToxicCommentTagger(pl.LightningModule):
 
37
  encoding = tokenizer.encode_plus(
38
  sentence,
39
  add_special_tokens=False,
40
+ max_length=MAX_TOKEN_COUNT,
41
  return_token_type_ids=False,
42
  padding="max_length",
43
  return_attention_mask=True,
 
45
  )
46
 
47
  # define target chunksize
48
+ chunksize = MAX_TOKEN_COUNT
49
 
50
  # split into chunks of MAX_TOKEN_COUNT - 2 tokens, we also convert to list (default is tuple which is immutable)
51
  input_id_chunks = list(encoding['input_ids'][0].split(chunksize - 2))