"""Streamlit demo that runs user-entered text through four NLP steps.

The page shows, in order: Mishkal diacritization, MarianMT translation,
ar_corrector contextual correction, and summarization with an mBERT
encoder-decoder model.
"""

import mishkal.tashkeel
import streamlit as st
from ar_corrector.corrector import Corrector
from arabert.preprocess import ArabertPreprocessor
from transformers import (
    AutoModelForSeq2SeqLM,
    BertTokenizer,
    MarianMTModel,
    MarianTokenizer,
    pipeline,
)

# Streamlit re-executes this script on every widget interaction, so the
# expensive model loading below is wrapped in a resource cache. Older
# Streamlit releases do not provide st.cache_resource; fall back to an
# identity decorator there so the script behaves exactly as before.
_cache_resource = getattr(st, "cache_resource", lambda func: func)

# NOTE(review): the model name suggests English -> Arabic translation, while
# the UI asks for Arabic text -- confirm which direction is intended.
mname = "marefa-nlp/marefa-mt-en-ar"
model_name = "malmarjeh/mbert2mbert-arabic-text-summarization"


@_cache_resource
def _load_vocalizer():
    """Create the Mishkal vocalizer used for the diacritization section."""
    return mishkal.tashkeel.TashkeelClass()


@_cache_resource
def _load_translation():
    """Load the Marian tokenizer and model used for the translation section."""
    marian_tokenizer = MarianTokenizer.from_pretrained(mname)
    marian_model = MarianMTModel.from_pretrained(mname)
    return marian_tokenizer, marian_model


@_cache_resource
def _load_summarization():
    """Load the preprocessor, tokenizer, model and pipeline for summarization."""
    # NOTE(review): an empty model_name presumably makes ArabertPreprocessor
    # use its default configuration -- confirm against the arabert package.
    arabert_preprocessor = ArabertPreprocessor(model_name="")
    bert_tokenizer = BertTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    text2text = pipeline(
        "text2text-generation",
        model=seq2seq_model,
        tokenizer=bert_tokenizer,
    )
    return arabert_preprocessor, bert_tokenizer, seq2seq_model, text2text


@_cache_resource
def _load_corrector():
    """Create the ar_corrector Corrector used for the correction section."""
    return Corrector()


# Initialize Mishkal vocalizer
vocalizer = _load_vocalizer()

# Initialize Marian tokenizer and model for translation
tokenizer, model = _load_translation()

# Initialize BERT tokenizer and model for summarization
(
    preprocessor,
    tokenizer_summarization,
    model_summarization,
    pipeline_summarization,
) = _load_summarization()

corr = Corrector() if _cache_resource is None else _load_corrector()


def _translate(text: str) -> str:
    """Translate *text* with the Marian model and return the decoded string."""
    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch(); padding/truncation mirror its defaults.
    batch = tokenizer(
        [text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    generated = model.generate(**batch)
    # Exactly one input was submitted, so exactly one sequence comes back.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def _summarize(text: str) -> str:
    """Preprocess *text* and return the summarization pipeline's output text."""
    preprocessed_text = preprocessor.preprocess(text)
    outputs = pipeline_summarization(
        preprocessed_text,
        pad_token_id=tokenizer_summarization.eos_token_id,
        num_beams=3,
        repetition_penalty=3.0,
        max_length=200,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
    )
    return outputs[0]["generated_text"]


def main():
    """Render the demo page and run every section on the entered text.

    All four section headers are always shown; their results are rendered
    only when the text area contains something other than whitespace.
    """
    st.title("U3reb Demo")

    # Text Input
    input_text = st.text_area("Enter Arabic Text:")
    # Blank or whitespace-only input is not worth sending to the models.
    has_text = bool(input_text and input_text.strip())

    # Tokenization
    st.subheader("Tokenization (Mishkal)")
    if has_text:
        text_mishkal = vocalizer.tashkeel(input_text)
        st.write("Tokenized Text (with diacritics):", text_mishkal)

    # Translation
    st.subheader("Translation")
    if has_text:
        st.write("Translated Text:", _translate(input_text))

    # Arabic Text Correction
    st.subheader("Arabic Text Correction (ar_correct)")
    if has_text:
        corrected_text = corr.contextual_correct(input_text)
        st.write("Corrected Text:", corrected_text)

    # Text Summarization
    st.subheader("Text Summarization")
    if has_text:
        st.write("Summarized Text:", _summarize(input_text))


if __name__ == "__main__":
    main()