Update README.md
Browse files
README.md
CHANGED
@@ -11,7 +11,7 @@ pipeline_tag: text2text-generation
|
|
11 |
This Natural Language Processing (NLP) model is made available under the Apache License, Version 2.0. You are free to use, modify, and distribute this software according to the terms and conditions of the Apache 2.0 License. For the full license text, please refer to the Apache 2.0 License.
|
12 |
# Usage and Specific Capabilities
|
13 |
## Text Length Limitation
|
14 |
-
The model is optimized to analyze texts containing up to 2048 tokens. If your text exceeds this limit, we recommend splitting it into smaller chunks, each containing no more than
|
15 |
## Supported Languages
|
16 |
Bulgarian, Chinese, Czech, Dutch, English, Estonian, Finnish, French, German, Greek, Indonesian, Italian, Japanese, Korean, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Turkish
|
17 |
|
@@ -26,18 +26,18 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
26 |
tokenizer = AutoTokenizer.from_pretrained("metricspace/EntityAnonymization-3B-V0.9")
|
27 |
model = AutoModelForCausalLM.from_pretrained("metricspace/EntityAnonymization-3B-V0.9", torch_dtype=torch.bfloat16)
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
# Extract text after each occurrence of "ASSISTANT:"
|
34 |
-
assistant_responses = []
|
35 |
-
for match in matches:
|
36 |
-
start_index = match.end() # Get the index where "ASSISTANT:" ends
|
37 |
-
response = input_text[start_index:].strip()
|
38 |
-
assistant_responses.append(response)
|
39 |
|
40 |
-
return assistant_responses
|
41 |
|
42 |
|
43 |
text_to_anonymize = "Sophia had always been enchanted by Venice, a historic city nestled in the heart of the Venetian lagoon. She had explored Venice on numerous occasions, each visit revealing hidden treasures in the enchanting city. On her latest trip, Sophia met Marco, a local historian, who shared captivating stories about the history of Venice.""
|
@@ -49,7 +49,7 @@ output_entities = model.generate(inputs.input_ids, max_new_tokens=250, do_sample
|
|
49 |
output_entities_text = tokenizer.decode(output_entities[0], skip_special_tokens=True)
|
50 |
|
51 |
# extracting entities text from assistant response
|
52 |
-
generated_part = extract_assistant_response(output_text_1)
|
53 |
|
54 |
prompt_2 = f"USER: Rephrase with {generated_part}: {text_to_anonymize}\n\nASSISTANT:"
|
55 |
inputs = tokenizer(prompt_2, return_tensors='pt').to('cuda')
|
|
|
11 |
This Natural Language Processing (NLP) model is made available under the Apache License, Version 2.0. You are free to use, modify, and distribute this software according to the terms and conditions of the Apache 2.0 License. For the full license text, please refer to the Apache 2.0 License.
|
12 |
# Usage and Specific Capabilities
|
13 |
## Text Length Limitation
|
14 |
+
The model is optimized to analyze texts containing up to 2048 tokens. If your text exceeds this limit, we recommend splitting it into smaller chunks, each containing no more than 2048 tokens. Each chunk can then be processed separately.
|
15 |
## Supported Languages
|
16 |
Bulgarian, Chinese, Czech, Dutch, English, Estonian, Finnish, French, German, Greek, Indonesian, Italian, Japanese, Korean, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Turkish
|
17 |
|
|
|
26 |
# Load the tokenizer published under the "metricspace/EntityAnonymization-3B-V0.9" identifier.
tokenizer = AutoTokenizer.from_pretrained("metricspace/EntityAnonymization-3B-V0.9")
|
27 |
# Load the causal language model from the same identifier, requesting bfloat16 as the torch dtype.
model = AutoModelForCausalLM.from_pretrained("metricspace/EntityAnonymization-3B-V0.9", torch_dtype=torch.bfloat16)
|
28 |
|
29 |
+
import re
|
30 |
+
|
31 |
+
def extract_last_assistant_response(input_text):
    """Return the text that follows the last "ASSISTANT:" marker.

    Intended for decoded generations whose prompt ends with "ASSISTANT:",
    so that everything after the final marker is the answer.

    Args:
        input_text: Text expected to contain at least one "ASSISTANT:".

    Returns:
        The text after the last "ASSISTANT:" with surrounding whitespace
        removed, or an empty string when the marker does not occur.
    """
    # rpartition splits on the LAST occurrence of the separator, which is
    # what the function name promises (re.search would return the first).
    _, marker, response = input_text.rpartition("ASSISTANT:")
    if not marker:
        # No marker present: nothing can be attributed to the assistant.
        return ""
    return response.strip()
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
|
|
41 |
|
42 |
|
43 |
# Sample passage used as the input text of the example.
text_to_anonymize = "Sophia had always been enchanted by Venice, a historic city nestled in the heart of the Venetian lagoon. She had explored Venice on numerous occasions, each visit revealing hidden treasures in the enchanting city. On her latest trip, Sophia met Marco, a local historian, who shared captivating stories about the history of Venice."
|
|
|
49 |
# Decode the first generated sequence back into text, leaving out special tokens.
output_entities_text = tokenizer.decode(output_entities[0], skip_special_tokens=True)
|
50 |
|
51 |
# extracting entities text from assistant response
|
52 |
+
# Keep only the text that follows the "ASSISTANT:" marker in the decoded output.
generated_part = extract_last_assistant_response(output_entities_text)
|
53 |
|
54 |
# Second prompt: ask for a rephrasing of the original text using the previously extracted part.
prompt_2 = f"USER: Rephrase with {generated_part}: {text_to_anonymize}\n\nASSISTANT:"
|
55 |
# Tokenize the second prompt into PyTorch tensors and move them to the CUDA device.
inputs = tokenizer(prompt_2, return_tensors='pt').to('cuda')
|