|
|
|
|
|
from transformers import GPT2Tokenizer |
|
from arabert.preprocess import ArabertPreprocessor |
|
from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel |
|
from pyarabic.araby import strip_tashkeel |
|
import pyarabic.trans |
|
# Model / tokenizer identifier: defined once so the two loads below cannot
# drift apart (original repeated the literal and left this variable unused).
model_name = 'alsubari/aragpt2-mega-pos-msa'

# Load the fine-tuned AraGPT2 POS / i'rab checkpoint and its tokenizer.
# NOTE(review): hard-coded "cuda" — fails on CPU-only hosts; confirm target env.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to("cuda")

# Preprocessor for the base model this checkpoint was fine-tuned from.
arabert_prep = ArabertPreprocessor(model_name='aubmindlab/aragpt2-mega')

# Instruction prompts: prml[0] = parse the sentence (i'rab),
#                      prml[1] = classify (POS-tag) the words of the sentence.
prml = ['اعراب الجملة :', ' صنف الكلمات من الجملة :']

text = 'تعلَّمْ من أخطائِكَ'
# Strip diacritics (tashkeel) first, then apply AraBERT preprocessing.
text = arabert_prep.preprocess(strip_tashkeel(text))
|
# Decoding settings shared by both prompts below.
# NOTE(review): do_sample=True combined with num_beams=20 selects beam-sample
# mode (sampling within beam search) — confirm that mix is intentional.
generation_args = dict(
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
    max_length=256,
    num_beams=20,
    no_repeat_ngram_size=3,
    top_k=20,
    top_p=0.1,
    do_sample=True,
    repetition_penalty=2.0,
)
|
def generate_answer(instruction: str, prompt_text: str) -> str:
    """Run one instruction-formatted generation and return the answer part.

    Builds the '<|startoftext|>Instruction: ... <|pad|>Answer:' prompt the
    model was fine-tuned on, generates on CUDA with the shared
    ``generation_args``, and returns everything after the first 'Answer:'.
    """
    prompt = f'<|startoftext|>Instruction: {instruction} {prompt_text}<|pad|>Answer:'
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to("cuda")
    output_ids = model.generate(input_ids=input_ids, **generation_args)
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # maxsplit=1: keep the whole remainder even if 'Answer:' reoccurs in the
    # generated text (unbounded split would truncate at the second occurrence).
    return decoded.split('Answer:', 1)[1]


# POS-tagging prompt (prml[1]); wrap Latin-script tag tokens for display.
output_text = generate_answer(prml[1], text)
answer_pose = pyarabic.trans.delimite_language(output_text, start="<token>", end="</token>")
print(answer_pose)

# Sentence-parsing (i'rab) prompt (prml[0]).
output_text = generate_answer(prml[0], text)
print(output_text)
|
|