PereLluis13 committed
Commit eda7f18
1 Parent(s): 6ab8d79

Update README.md

Files changed (1):
  1. README.md +10 -7
README.md CHANGED
@@ -24,8 +24,7 @@ widget:
     example_title: English
 inference:
   parameters:
-    decoder_start_token_id: 250058
-    src_lang: en_XX
+    src_lang: __en__
     tgt_lang: <triplet>
 tags:
 - seq2seq
@@ -51,9 +50,9 @@ mREBEL is introduced in the ACL 2023 paper [RED^{FM}: a Filtered and Multilingua
     url = "https://arxiv.org/abs/2306.09802",
 }
 
-The original repository for the paper can be found [here](https://github.com/Babelscape/rebel)
+The original repository for the paper can be found [here](https://github.com/Babelscape/rebel#REDFM).
 
-Be aware that the inference widget at the right does not output special tokens, which are necessary to distinguish the subject, object and relation types. For a demo of REBEL and its pre-training dataset check the [Spaces demo](https://huggingface.co/spaces/Babelscape/rebel-demo).
+Be aware that the inference widget at the right does not output special tokens, which are necessary to distinguish the subject, object and relation types. For a demo of mREBEL and its pre-training dataset, check the [Spaces demo](https://huggingface.co/spaces/Babelscape/mrebel-demo).
 
 ## Pipeline usage
 
@@ -62,7 +61,7 @@ from transformers import pipeline
 
 triplet_extractor = pipeline('translation_xx_to_yy', model='Babelscape/mrebel-base', tokenizer='Babelscape/mrebel-base')
 # We need to use the tokenizer manually since we need special tokens.
-extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor("The Red Hot Chili Peppers were formed in Los Angeles by Kiedis, Flea, guitarist Hillel Slovak and drummer Jack Irons.", decoder_start_token_id=250058, src_lang="en_XX", tgt_lang="<triplet>", return_tensors=True, return_text=False)[0]["translation_token_ids"]]) # change en_XX for the language of the source.
+extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor("The Red Hot Chili Peppers were formed in Los Angeles by Kiedis, Flea, guitarist Hillel Slovak and drummer Jack Irons.", decoder_start_token_id=triplet_extractor.tokenizer.convert_tokens_to_ids("tp_XX"), src_lang="__en__", tgt_lang="<triplet>", return_tensors=True, return_text=False)[0]["translation_token_ids"]]) # change __en__ to the language of the source.
 print(extracted_text[0])
 # Function to parse the generated text and extract the triplets
 def extract_triplets_typed(text):
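
As a minimal sketch, assuming the `extract_triplets_typed` helper defined just below, the decoded string (which keeps the special tokens) can then be parsed into typed triplets:

```python
# Minimal sketch: parse the decoded pipeline output with the helper defined below.
for triplet in extract_triplets_typed(extracted_text[0]):
    print(triplet)
```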
@@ -146,7 +145,11 @@ def extract_triplets_typed(text):
     return triplets
 
 # Load model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-base", src_lang="en_XX", "tgt_lang": "tp_XX") # Here we set English as source language. To change the source language just change it here or swap the first token of the input for your desired language
+tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-base", src_lang="en_XX", tgt_lang="tp_XX")
+# Here we set English ("en_XX") as the source language. To change the source language, swap the first token of the input for your desired language or change it to another supported language. For Catalan ("ca_XX") or Greek ("el_EL") (not included in mBART pretraining) you need a workaround:
+# tokenizer._src_lang = "ca_XX"
+# tokenizer.cur_lang_code_id = tokenizer.convert_tokens_to_ids("ca_XX")
+# tokenizer.set_src_lang_special_tokens("ca_XX")
 model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-base")
 gen_kwargs = {
     "max_length": 256,
@@ -166,7 +169,7 @@ model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, re
 generated_tokens = model.generate(
     model_inputs["input_ids"].to(model.device),
     attention_mask=model_inputs["attention_mask"].to(model.device),
-    decoder_start_token_id = self.tokenizer.convert_tokens_to_ids("tp_XX"),
+    decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
     **gen_kwargs,
 )
 
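
A minimal sketch of the decoding step that follows generation (assuming the remainder of the README's example): the generated ids are decoded with the special tokens kept so `extract_triplets_typed` can split them into typed triplets.

```python
# Minimal sketch: decode without stripping special tokens, then parse each prediction.
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
for sentence in decoded_preds:
    print(extract_triplets_typed(sentence))
```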
 
 