"""Speech transcription and summarization (English / Vietnamese).

At import time this module loads:
  * a Whisper "base" multilingual ASR model,
  * a BART summarizer for English (facebook/bart-large-cnn),
  * a ViT5 summarizer for Vietnamese (VietAI/vit5-large-vietnews-summarization),

then exposes helpers to transcribe an audio file and summarize the
resulting text in the detected language.
"""

import datetime

import transformers
import whisper

# Migrate any legacy transformers cache layout before loading models.
transformers.utils.move_cache()

# ====================================
# Speech recognition model (Whisper "base", multilingual).
speech_recognition_model = whisper.load_model("base")

# ====================================
# English text summarization model (BART fine-tuned on CNN/DailyMail).
tokenizer_En = transformers.AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
text_summarization_model_En = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/bart-large-cnn"
)

# ====================================
# Vietnamese text summarization model (ViT5 fine-tuned on vietnews).
tokenizer_Vi = transformers.AutoTokenizer.from_pretrained(
    "VietAI/vit5-large-vietnews-summarization"
)
text_summarization_model_Vi = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    "VietAI/vit5-large-vietnews-summarization"
)


def asr_transcript(input_file):
    """Transcribe an audio file with Whisper.

    Parameters
    ----------
    input_file : str
        Path to the audio file to transcribe.

    Returns
    -------
    tuple[str, str, str]
        ``(text, lang, detail)`` where ``text`` is the full transcript,
        ``lang`` is ``"Vietnamese"`` when Whisper detects ``'vi'`` and
        ``"English"`` otherwise (any other detected language falls back
        to ``"English"``, matching the summarizer dispatch), and
        ``detail`` is a per-segment transcript with one
        ``H:MM:SS-H:MM:SS <segment text>`` line per segment.
    """
    audio = whisper.load_audio(input_file)
    output = speech_recognition_model.transcribe(audio)
    text = output['text']

    # Map Whisper's ISO code to the label text_summarize() expects.
    lang = "Vietnamese" if output["language"] == 'vi' else "English"

    # Build the timestamped per-segment view; join once instead of
    # repeated string concatenation (avoids quadratic behavior).
    detail_lines = []
    for segment in output['segments']:
        start = str(datetime.timedelta(seconds=round(segment['start'])))
        end = str(datetime.timedelta(seconds=round(segment['end'])))
        small_text = segment['text']
        detail_lines.append(start + "-" + end + " " + small_text + "\n")
    detail = "".join(detail_lines)

    return text, lang, detail


def _summarize(text_input, tokenizer, model):
    """Shared seq2seq summarization helper.

    Tokenizes ``text_input`` (truncated to the model's maximum input
    length), generates a summary of at most 256 tokens, and decodes the
    generated sequence(s) into a single string.
    """
    encoding = tokenizer(text_input, truncation=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=256,
        early_stopping=True,
    )
    return "".join(
        tokenizer.decode(
            output, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        for output in outputs
    )


def text_summarize_en(text_input):
    """Summarize English text with the BART model."""
    return _summarize(text_input, tokenizer_En, text_summarization_model_En)


def text_summarize_vi(text_input):
    """Summarize Vietnamese text with the ViT5 model."""
    return _summarize(text_input, tokenizer_Vi, text_summarization_model_Vi)


def text_summarize(text_input, lang):
    """Dispatch summarization by language label.

    Parameters
    ----------
    text_input : str
        The text to summarize.
    lang : str
        ``"English"`` or ``"Vietnamese"`` (as returned by
        :func:`asr_transcript`).

    Returns
    -------
    str
        The summary, or ``""`` for any unsupported language label.
    """
    if lang == 'English':
        return text_summarize_en(text_input)
    if lang == 'Vietnamese':
        return text_summarize_vi(text_input)
    return ""