SuMmeet / utils.py
AdWeeb's picture
Update utils.py
362b303
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 28 01:07:44 2022
@author: adeep
"""
import numpy as np
import pandas as pd
from sklearn.metrics import label_ranking_average_precision_score
import streamlit as st
import joblib
import os
from translate import Translator
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence
import transformers
from transformers import pipeline
import nltk
# Make sure the NLTK data packages are present when this module is imported.
# 'punkt' is the sentence-tokenizer model behind sent_tokenize; the tagger
# model is not referenced by the functions visible in this file and is
# presumably needed elsewhere -- TODO confirm before removing.
# Each resource is requested once; repeating the calls only repeats the check.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import sent_tokenize
import re
import stanfordnlp
def welcome():
    """Return the fixed greeting string."""
    greeting = "Welcome All"
    return greeting
def get_large_audio_transcription(path):
    """Transcribe a WAV file by splitting it on silence and recognizing each chunk.

    The audio at *path* is loaded with pydub and split wherever
    ``split_on_silence`` finds silence (``min_silence_len=500``, threshold
    14 dB below the clip's ``dBFS``, ``keep_silence=500``).  Every chunk is
    exported to a WAV file, read back with ``speech_recognition`` and sent
    to ``recognize_google``.

    Chunk files are written to a private temporary directory that is removed
    when the function returns, so nothing is left behind in the working
    directory and concurrent calls cannot overwrite each other's chunks.

    Args:
        path: Path of the WAV file to transcribe.

    Returns:
        The recognized chunks, each capitalized and terminated with ". ",
        concatenated in order.  Chunks that could not be understood are
        skipped; an empty string is returned when nothing was recognized.

    Raises:
        Only ``sr.UnknownValueError`` is handled here (it is printed and the
        chunk is skipped).  Any other exception raised while loading,
        exporting or recognizing the audio propagates to the caller.
    """
    import tempfile  # function-scope import: only needed for the chunk files

    r = sr.Recognizer()
    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )

    recognized = []
    with tempfile.TemporaryDirectory() as chunk_dir:
        for i, audio_chunk in enumerate(chunks, start=1):
            chunk_filename = os.path.join(chunk_dir, f"chunk{i}.wav")
            # export() hands back the open output handle; close it right away
            # so the file can be reopened and later deleted on every platform.
            audio_chunk.export(chunk_filename, format="wav").close()
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = r.record(source)
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                # Unintelligible chunk: report it and carry on with the rest.
                print("Error:", str(e))
            else:
                recognized.append(f"{text.capitalize()}. ")
    return "".join(recognized)
def get_translation(source, dest, text):
    """Translate English *text* into the language named by *dest*.

    The text is split into sentences, each sentence is translated on its
    own, and the translated sentences are joined with single spaces.

    Args:
        source: Accepted for interface compatibility but not used; the
            source language is always passed to the translator as 'en'.
        dest: Display name of the target language; must be one of the keys
            of the table below (e.g. 'Hindi', 'Tamil').
        text: The text to translate.

    Returns:
        The translated sentences joined by a single space.

    Raises:
        KeyError: If *dest* is not a key of the language table.
    """
    # Display name -> language code handed to Translator.  'English' is not
    # listed, so it cannot be chosen as a destination.
    codes_by_language = {
        'Hindi': 'hi',
        'Malayalam': 'ml',
        'Marathi': 'mr',
        'Kannada': 'kn',
        'Telugu': 'te',
        'Tamil': 'ta',
        'Oriya': 'or',
        'Bengali': 'bn',
        'Gujarati': 'gu',
        'Urdu': 'ur',
    }
    target_code = codes_by_language[dest]
    engine = Translator(from_lang='en', to_lang=target_code)
    sentences = nltk.tokenize.sent_tokenize(text)
    return ' '.join(engine.translate(sentence) for sentence in sentences)
def truecasing_by_sentence_segmentation(input_text):
    """Capitalize the first letter of every sentence in *input_text*.

    The text is split into sentences with ``sent_tokenize``, each sentence is
    passed through ``str.capitalize`` (which upper-cases the first character
    and lower-cases the rest), and the sentences are re-joined with single
    spaces.  Any space that ends up directly in front of one of the
    punctuation marks ``. , ' ! ? : ;`` is then removed.

    Args:
        input_text: The text to re-case.

    Returns:
        The re-cased text as a single string.
    """
    sentences = sent_tokenize(input_text, language='english')
    joined = ' '.join(sentence.capitalize() for sentence in sentences)
    # Raw string: the previous non-raw literal contained "\.", an invalid
    # escape sequence.  Inside a character class the dot needs no escaping,
    # so this pattern matches exactly the same text.
    return re.sub(r" (?=[.,'!?:;])", "", joined)