Create utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --- Imports: stdlib first, then third-party, then NLTK setup. ---
# (Original had `import nltk` and both download calls duplicated verbatim;
# the duplicates are removed — every imported name is still imported.)
import os
import re

import joblib
import numpy as np
import pandas as pd
import speech_recognition as sr
import stanfordnlp
import streamlit as st
import transformers
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from pydub.silence import split_on_silence
from sklearn.metrics import label_ranking_average_precision_score
from transformers import pipeline
from translate import Translator

import nltk
from nltk.tokenize import sent_tokenize

# Fetch the models NLTK needs at runtime. nltk.download() is effectively
# idempotent (a no-op when the resource is already present), so once is enough.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
23 |
+
def welcome():
    """Return the application's greeting string."""
    greeting = "Welcome All"
    return greeting
|
25 |
+
|
26 |
+
def get_large_audio_transcription(path):
    """Transcribe a long WAV file by splitting it on silence.

    Splits the audio at silent gaps, runs Google's speech recognizer on each
    chunk, and concatenates the recognized pieces as capitalized sentences.

    Args:
        path: Path to a WAV file readable by pydub.

    Returns:
        The concatenated transcription. Recognition is best-effort: chunks
        that cannot be recognized are skipped, so the result may be partial
        (or "" if nothing was recognized).
    """
    import tempfile  # local import: only needed for the scratch chunk files

    recognizer = sr.Recognizer()
    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(
        sound,
        min_silence_len=500,             # a gap >= 500 ms counts as silence
        silence_thresh=sound.dBFS - 14,  # threshold relative to clip loudness
        keep_silence=500,                # keep 500 ms padding around speech
    )
    whole_text = ""
    # Write chunk files into a temporary directory so they are removed when
    # done (the original leaked chunk{i}.wav files into the CWD).
    with tempfile.TemporaryDirectory() as tmpdir:
        for i, audio_chunk in enumerate(chunks, start=1):
            chunk_filename = os.path.join(tmpdir, f"chunk{i}.wav")
            audio_chunk.export(chunk_filename, format="wav")
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                # Best-effort: skip unintelligible chunks rather than abort.
                print("Error:", str(e))
            else:
                whole_text += f"{text.capitalize()}. "
    return whole_text
|
48 |
+
|
49 |
+
def get_translation(source, dest, text):
    """Translate English *text* into the Indian language *dest*.

    Args:
        source: Source language display name. Currently ignored — the source
            language is hard-coded to English ('en'); the parameter is kept
            for interface compatibility with existing callers.
        dest: Target language display name (e.g. 'Hindi'). Must be one of
            the keys of the supported-language table below.
        text: English text; it is translated sentence by sentence.

    Returns:
        The translated sentences joined by single spaces.

    Raises:
        KeyError: If *dest* is not a supported target language.
    """
    # Display name -> ISO 639-1 code for the supported target languages.
    lang_dict = {
        'Hindi': 'hi',
        'Malayalam': 'ml',
        'Marathi': 'mr',
        'Kannada': 'kn',
        'Telugu': 'te',
        'Tamil': 'ta',
        'Oriya': 'or',
        'Bengali': 'bn',
        'Gujarati': 'gu',
        'Urdu': 'ur',
    }
    try:
        dst = lang_dict[dest]
    except KeyError:
        # Same exception type as before, but with an actionable message.
        raise KeyError(
            f"Unsupported target language: {dest!r}. "
            f"Supported: {sorted(lang_dict)}"
        )

    translator = Translator(from_lang='en', to_lang=dst)
    # Translate sentence-by-sentence: the backend handles short inputs better
    # than one large blob.
    sentences = nltk.tokenize.sent_tokenize(text)
    translated = [translator.translate(sentence) for sentence in sentences]
    return ' '.join(translated)
|
83 |
+
|
84 |
+
|
85 |
+
def truecasing_by_sentence_segmentation(input_text):
    """Capitalize the first letter of each sentence in *input_text*.

    Sentences are found with NLTK's sentence tokenizer, capitalized
    individually, and re-joined; the space the join would leave before
    punctuation is stripped.

    Args:
        input_text: Free-form English text.

    Returns:
        The text with each sentence capitalized.
    """
    # Split the text into sentences.
    sentences = sent_tokenize(input_text, language='english')
    # Capitalize each sentence.
    sentences_capitalized = [s.capitalize() for s in sentences]
    # Re-join, dropping any space before closing punctuation. The raw string
    # fixes the original's invalid "\." escape in a non-raw string (a
    # SyntaxWarning on modern Python); "." needs no escape inside [...].
    text_truecase = re.sub(r" (?=[.,'!?:;])", "", ' '.join(sentences_capitalized))
    return text_truecase
|