AdWeeb committed on
Commit
260ca9e
1 Parent(s): 4b3f952

create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +87 -0
utils.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Mar 28 01:07:44 2022
4
+
5
+ @author: adeep
6
+ """
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.metrics import label_ranking_average_precision_score
10
+ import streamlit as st
11
+ import joblib
12
+ import os
13
+ from translate import Translator
14
+ from moviepy.editor import VideoFileClip
15
+ import speech_recognition as sr
16
+ from pydub import AudioSegment
17
+ from pydub.silence import split_on_silence
18
+ import transformers
19
+ from transformers import pipeline
20
+ import nltk
21
+ nltk.download('punkt')
22
+ nltk.download('averaged_perceptron_tagger')
23
+ import nltk
24
+ nltk.download('punkt')
25
+ nltk.download('averaged_perceptron_tagger')
26
+ from nltk.tokenize import sent_tokenize
27
+ import re
28
def welcome():
    """Return the fixed greeting string."""
    greeting = "Welcome All"
    return greeting
30
+
31
def get_large_audio_transcription(path):
    """Transcribe a WAV file chunk by chunk and return the combined text.

    The audio is split with ``split_on_silence`` (``min_silence_len=500``,
    ``keep_silence=500``, silence threshold 14 below ``sound.dBFS``). Each
    chunk is exported to a temporary WAV file, read back through
    ``sr.AudioFile`` and passed to ``Recognizer.recognize_google``.

    Args:
        path: Path of the WAV file to transcribe.

    Returns:
        The recognised text of every chunk, each capitalised and followed by
        ". ", concatenated in chunk order. Chunks for which the recogniser
        raises ``sr.UnknownValueError`` are reported on stdout and skipped.
        An empty string is returned when nothing was recognised.

    Note:
        Only ``sr.UnknownValueError`` is handled here; any other exception
        raised by the recogniser propagates to the caller.
    """
    import tempfile  # local import: only this function needs it

    recognizer = sr.Recognizer()
    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )

    pieces = []
    # Write chunk files into a private temporary directory instead of the
    # current working directory: they are removed automatically afterwards
    # and concurrent calls can no longer overwrite each other's chunkN.wav.
    with tempfile.TemporaryDirectory() as chunk_dir:
        for i, audio_chunk in enumerate(chunks, start=1):
            chunk_filename = os.path.join(chunk_dir, f"chunk{i}.wav")
            # export() hands back the output file handle; close it so the
            # file is flushed and can be deleted with the directory.
            audio_chunk.export(chunk_filename, format="wav").close()
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                # Unintelligible chunk: report it and carry on with the rest.
                print("Error:", str(e))
            else:
                pieces.append(f"{text.capitalize()}. ")
    return "".join(pieces)
53
+
54
# Display name of each supported target language -> code handed to
# ``Translator(to_lang=...)``. English is intentionally absent: it is the
# fixed source language.
LANG_CODES = {
    'Hindi': 'hi',
    'Malayalam': 'ml',
    'Marathi': 'mr',
    'Kannada': 'kn',
    'Telugu': 'te',  # was 'ta' (Tamil), which sent Telugu requests to Tamil
    'Tamil': 'ta',
    'Oriya': 'or',
    'Bengali': 'bn',
    'Gujarati': 'gu',
    'Urdu': 'ur',
}


def get_translation(source, dest, text):
    """Translate English ``text`` sentence by sentence into ``dest``.

    Args:
        source: Unused; kept for interface compatibility. The translator is
            always created with ``from_lang='en'``.
        dest: Display name of the target language; must be a key of
            ``LANG_CODES`` (e.g. ``'Hindi'``, ``'Telugu'``).
        text: The text to translate.

    Returns:
        The translated sentences joined with single spaces, or ``""`` when
        ``text`` is empty.

    Raises:
        KeyError: If ``dest`` is not a supported language name.
    """
    dst = LANG_CODES[dest]

    # Nothing to translate: skip building a translator and tokenising.
    if not text:
        return ""

    translator = Translator(from_lang='en', to_lang=dst)
    # Translate one sentence at a time rather than the whole text at once.
    sentences = nltk.tokenize.sent_tokenize(text)
    return ' '.join(translator.translate(sentence) for sentence in sentences)