AdWeeb committed on
Commit
5df7b7a
1 Parent(s): bac278b

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +96 -0
utils.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Created on Mon Mar 28 01:07:44 2022
3
+ @author: adeep
4
+ """
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.metrics import label_ranking_average_precision_score
8
+ import streamlit as st
9
+ import joblib
10
+ import os
11
+ from translate import Translator
12
+ from moviepy.editor import VideoFileClip
13
+ import speech_recognition as sr
14
+ from pydub import AudioSegment
15
+ from pydub.silence import split_on_silence
16
+ import transformers
17
+ from transformers import pipeline
18
+ import nltk
19
+ nltk.download('punkt')
20
+ nltk.download('averaged_perceptron_tagger')
21
+ import nltk
22
+ nltk.download('punkt')
23
+ nltk.download('averaged_perceptron_tagger')
24
+ from nltk.tokenize import sent_tokenize
25
+ import re
26
+ import stanfordnlp
27
def welcome():
    """Return a fixed greeting string."""
    greeting = "Welcome All"
    return greeting
29
+
30
def get_large_audio_transcription(path):
    """
    Transcribe a long WAV file by splitting it on silence and running
    Google speech recognition on each chunk.

    Parameters
    ----------
    path : str
        Path to a WAV audio file readable by pydub.

    Returns
    -------
    str
        Concatenation of the per-chunk transcriptions, each capitalized
        and terminated with ". ". Chunks the recognizer cannot
        understand are skipped (the error is printed, best-effort).
    """
    recognizer = sr.Recognizer()
    sound = AudioSegment.from_wav(path)
    # Split where silence lasts >= 500 ms; the threshold is relative to
    # the clip's average loudness; keep 500 ms of silence at chunk edges
    # so words are not clipped.
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )
    whole_text = ""
    for i, audio_chunk in enumerate(chunks, start=1):
        # NOTE: single-argument os.path.join was a no-op; plain filename used.
        chunk_filename = f"chunk{i}.wav"
        audio_chunk.export(chunk_filename, format="wav")
        try:
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = recognizer.record(source)
            text = recognizer.recognize_google(audio_listened)
        except sr.UnknownValueError as e:
            # Best-effort: skip unintelligible chunks instead of failing.
            print("Error:", str(e))
        else:
            whole_text += f"{text.capitalize()}. "
        finally:
            # BUGFIX: the original leaked one temporary WAV per chunk
            # into the current working directory.
            if os.path.exists(chunk_filename):
                os.remove(chunk_filename)
    return whole_text
52
+
53
def get_translation(source, dest, text):
    """
    Translate English text into an Indian language, sentence by sentence.

    Parameters
    ----------
    source : str
        Source language name. Currently unused — translation is always
        performed from English ('en'); kept for interface compatibility.
    dest : str
        Target language display name; must be one of the keys of the
        mapping below (e.g. 'Hindi', 'Tamil').
    text : str
        English text to translate.

    Returns
    -------
    str
        The translated sentences joined with single spaces.

    Raises
    ------
    KeyError
        If ``dest`` is not a supported language name.
    """
    lang_dict = {
        'Hindi': 'hi',
        'Malayalam': 'ml',
        'Marathi': 'mr',
        'Kannada': 'kn',
        'Telugu': 'te',
        'Tamil': 'ta',
        'Oriya': 'or',
        'Bengali': 'bn',
        'Gujarati': 'gu',
        'Urdu': 'ur',
    }
    dst = lang_dict[dest]
    translator = Translator(from_lang='en', to_lang=dst)
    # Translate one sentence at a time — NOTE(review): presumably this
    # keeps each request within the backing service's length limit;
    # confirm against the `translate` package's constraints.
    sentences = nltk.tokenize.sent_tokenize(text)
    translated = [translator.translate(sentence) for sentence in sentences]
    return ' '.join(translated)
87
+
88
+
89
def truecasing_by_sentence_segmentation(input_text):
    """
    Capitalize the first letter of every sentence in ``input_text``.

    The text is split into sentences with NLTK's ``sent_tokenize``,
    each sentence is ``str.capitalize``-d (note: this also lowercases
    the rest of the sentence), the sentences are re-joined with single
    spaces, and any space left immediately before punctuation is
    removed.

    Parameters
    ----------
    input_text : str
        Text to true-case.

    Returns
    -------
    str
        The re-capitalized text.
    """
    # Split the text into sentences.
    sentences = sent_tokenize(input_text, language='english')
    # Capitalize each sentence.
    sentences_capitalized = [s.capitalize() for s in sentences]
    # BUGFIX: raw string for the regex — "\." inside a normal string
    # literal is an invalid escape sequence (SyntaxWarning on modern
    # CPython). The pattern bytes are unchanged.
    text_truecase = re.sub(r" (?=[\.,'!?:;])", "", ' '.join(sentences_capitalized))
    return text_truecase