karthik11 committed on
Commit
f889b9f
1 Parent(s): ed13421

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +92 -0
utils.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import label_ranking_average_precision_score
4
+ import streamlit as st
5
+ import joblib
6
+ import os
7
+ from translate import Translator
8
+ from moviepy.editor import VideoFileClip
9
+ import speech_recognition as sr
10
+ from pydub import AudioSegment
11
+ from pydub.silence import split_on_silence
12
+ import transformers
13
+ from transformers import pipeline
14
import nltk

# Fetch the NLTK data packages used by this module. The calls run once, at
# import time; the original repeated this import/download pair twice.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
20
+ from nltk.tokenize import sent_tokenize
21
+ import re
22
+ import stanfordnlp
23
def welcome() -> str:
    """Return the fixed greeting string."""
    greeting = "Welcome All"
    return greeting
25
+
26
def get_large_audio_transcription(path):
    """Transcribe a WAV file by splitting it on silence, chunk by chunk.

    The audio is split with ``split_on_silence``, each chunk is exported to
    a temporary WAV file, read back through ``speech_recognition`` and passed
    to ``Recognizer.recognize_google``. Recognised chunks are capitalised,
    terminated with ``". "`` and concatenated in order.

    Args:
        path: Path of the WAV file to transcribe.

    Returns:
        The concatenated transcription. Chunks that raise
        ``sr.UnknownValueError`` are reported on stdout and skipped, so the
        result is an empty string when nothing was recognised.

    Raises:
        Any exception other than ``sr.UnknownValueError`` raised while
        loading, exporting or recognising audio propagates to the caller.
    """
    import tempfile

    recognizer = sr.Recognizer()
    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        # Threshold is relative to this clip's own level: 14 dB below it.
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )

    sentences = []
    # Chunk files live in a private temporary directory so they never collide
    # with another concurrent call and are removed even if recognition fails.
    with tempfile.TemporaryDirectory() as chunk_dir:
        for i, audio_chunk in enumerate(chunks, start=1):
            chunk_filename = os.path.join(chunk_dir, f"chunk{i}.wav")
            # export() hands back the open output file; close it so the data
            # is flushed before it is read again and the handle is not leaked.
            audio_chunk.export(chunk_filename, format="wav").close()
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                # Best effort: an unintelligible chunk is reported and skipped.
                print("Error:", str(e))
            else:
                sentences.append(f"{text.capitalize()}. ")
    return "".join(sentences)
48
+
49
# Display name of each supported target language -> code passed to Translator.
_LANG_CODES = {
    'Hindi': 'hi',
    'Malayalam': 'ml',
    'Marathi': 'mr',
    'Kannada': 'kn',
    'Telugu': 'te',
    'Tamil': 'ta',
    'Oriya': 'or',
    'Bengali': 'bn',
    'Gujarati': 'gu',
    'Urdu': 'ur',
}


def get_translation(source, dest, text):
    """Translate English *text* into the language named by *dest*.

    The text is split into sentences with NLTK, each sentence is translated
    separately, and the translated sentences are joined with single spaces.

    Args:
        source: Currently ignored; the translator is always created with
            ``from_lang='en'``. Kept so existing callers keep working.
        dest: Display name of the target language, e.g. ``'Hindi'``. Must be
            one of the keys of ``_LANG_CODES``.
        text: The text to translate.

    Returns:
        The translated text, or ``""`` when *text* is empty.

    Raises:
        KeyError: If *dest* is not a supported language name.
    """
    try:
        dst = _LANG_CODES[dest]
    except KeyError:
        supported = ", ".join(sorted(_LANG_CODES))
        # Still a KeyError (as before), but one that says what went wrong.
        raise KeyError(
            f"Unsupported target language {dest!r}; choose one of: {supported}"
        ) from None

    if not text:
        # Nothing to translate: skip building a translator altogether.
        return ""

    translator = Translator(from_lang='en', to_lang=dst)
    sentences = nltk.tokenize.sent_tokenize(text)
    return ' '.join(translator.translate(sentence) for sentence in sentences)
83
+
84
+
85
def truecasing_by_sentence_segmentation(input_text):
    """Capitalise the first letter of every sentence in *input_text*.

    The text is split into sentences with NLTK's English sentence tokenizer,
    each sentence is passed through ``str.capitalize()``, and the sentences
    are re-joined with single spaces. Any space that directly precedes one of
    ``. , ' ! ? : ;`` is then removed.

    Note that ``str.capitalize()`` also lower-cases the remainder of each
    sentence, so capitals inside a sentence are not preserved.

    Args:
        input_text: The text to re-case.

    Returns:
        The re-cased text, or ``""`` when *input_text* is empty.
    """
    if not input_text:
        return ""
    # Split the text into sentences.
    sentences = sent_tokenize(input_text, language='english')
    # Capitalise each sentence.
    sentences_capitalized = [s.capitalize() for s in sentences]
    # Raw string: the original non-raw "\." was an invalid escape sequence.
    return re.sub(r" (?=[\.,'!?:;])", "", ' '.join(sentences_capitalized))