# -*- coding: utf-8 -*-
"""
Speech-to-text transcription, English->Indic translation and truecasing helpers.

Created on Mon Mar 28 01:07:44 2022

@author: adeep
"""
import numpy as np
import pandas as pd
from sklearn.metrics import label_ranking_average_precision_score
import streamlit as st
import joblib
import os
from translate import Translator
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence
import transformers
from transformers import pipeline
import nltk
# NOTE: the original file downloaded these twice; once is enough.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import sent_tokenize
import re
import stanfordnlp


def welcome():
    """Return a static greeting string (simple health-check endpoint)."""
    return "Welcome All"


def get_large_audio_transcription(path):
    """Transcribe a long WAV file by splitting it on silence.

    The audio is cut into chunks wherever there is >= 500 ms of silence
    (threshold relative to the clip's own average loudness), each chunk is
    sent to the Google Web Speech API, and the recognized pieces are joined
    into one string.

    Parameters
    ----------
    path : str
        Path to a WAV file readable by pydub.

    Returns
    -------
    str
        Concatenated, sentence-capitalized transcription. Chunks that could
        not be recognized are skipped.
    """
    r = sr.Recognizer()
    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )
    whole_text = ""
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export the chunk to a temporary file so SpeechRecognition can read it.
        chunk_filename = f"chunk{i}.wav"
        audio_chunk.export(chunk_filename, format="wav")
        try:
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = r.record(source)
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                # BUGFIX: the original appended `text` unconditionally, so a
                # failed chunk re-appended the previous chunk's text (or raised
                # NameError on the first chunk). Now unrecognized chunks are
                # simply skipped.
                print("Error:", str(e))
            else:
                whole_text += f"{text.capitalize()}. \n"
        finally:
            # Don't leave temporary chunk files behind (the original leaked them).
            try:
                os.remove(chunk_filename)
            except OSError:
                pass
    return whole_text


def get_translation(source, dest, text):
    """Translate English text into an Indian language, sentence by sentence.

    Parameters
    ----------
    source : str
        Source-language name. Currently unused: the source is hard-coded to
        English ('en'); kept for interface compatibility with callers.
    dest : str
        Destination-language display name; must be a key of the mapping below.
    text : str
        English text to translate.

    Returns
    -------
    str
        The translated sentences joined with single spaces.

    Raises
    ------
    KeyError
        If `dest` is not one of the supported language names.
    """
    lang_dict = {
        'Hindi': 'hi',
        'Malayalam': 'ml',
        'Marathi': 'mr',
        'Kannada': 'kn',
        'Telugu': 'te',
        'Tamil': 'ta',
        'Oriya': 'or',
        'Bengali': 'bn',
        'Gujarati': 'gu',
        'Urdu': 'ur',
    }
    dst = lang_dict[dest]
    translator = Translator(from_lang='en', to_lang=dst)
    # Translate sentence-by-sentence: the backend handles short inputs better
    # than one long document.
    sentences = nltk.tokenize.sent_tokenize(text)
    return ' '.join(translator.translate(sentence) for sentence in sentences)


def truecasing_by_sentence_segmentation(input_text):
    """Capitalize the first letter of each sentence in `input_text`.

    Sentences are detected with NLTK's Punkt tokenizer, each sentence is
    `str.capitalize()`d (note: this also lowercases the rest of the
    sentence, matching the original behavior), and the space that joining
    leaves before punctuation is removed.

    Parameters
    ----------
    input_text : str
        Free-form text.

    Returns
    -------
    str
        The truecased text.
    """
    sentences = sent_tokenize(input_text, language='english')
    sentences_capitalized = [s.capitalize() for s in sentences]
    # Raw string fixes the invalid `\.` escape of the original; `.` needs no
    # escaping inside a character class, so behavior is unchanged.
    text_truecase = re.sub(r" (?=[.,'!?:;])", "", ' '.join(sentences_capitalized))
    return text_truecase