import numpy as np
import pandas as pd
import streamlit as st
import joblib
import os
import re
from translate import Translator
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence
import nltk
from nltk.tokenize import sent_tokenize
import stanfordnlp

# Download the NLTK resources needed for sentence tokenization and POS tagging.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def welcome():
    return "Welcome All"

def get_large_audio_transcription(path):
    """Split a large WAV file on silence and transcribe each chunk with the Google Web Speech API."""
    r = sr.Recognizer()
    sound = AudioSegment.from_wav(path)
    # Split where silence lasts at least 500 ms, treating anything 14 dBFS
    # below the clip's average loudness as silence; keep 500 ms of padding
    # around each chunk so words at the boundaries are not clipped.
    chunks = split_on_silence(sound,
        min_silence_len=500,
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )
    whole_text = ""
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export each chunk to a temporary WAV file for recognition.
        chunk_filename = f"chunk{i}.wav"
        audio_chunk.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                # Skip chunks the recognizer could not understand.
                print("Error:", str(e))
            else:
                whole_text += f"{text.capitalize()}. "
    return whole_text

def get_translation(source, dest, text):
    """Translate English text into the selected Indian language, sentence by sentence.

    The source language is currently fixed to English; `source` is accepted
    for interface symmetry, but only `dest` is looked up.
    """
    lang_dict = {
        'Hindi': 'hi',
        'Malayalam': 'ml',
        'Marathi': 'mr',
        'Kannada': 'kn',
        'Telugu': 'te',
        'Tamil': 'ta',
        'Oriya': 'or',
        'Bengali': 'bn',
        'Gujarati': 'gu',
        'Urdu': 'ur'
    }
    dst = lang_dict[dest]

    # An alternative backend would be a Helsinki-NLP/opus-mt-{src}-{dst}
    # model loaded with transformers.pipeline(f"translation_{src}_to_{dst}").
    translator = Translator(from_lang='en', to_lang=dst)

    # Translate one sentence at a time to keep each request short.
    sentences = sent_tokenize(text)
    trans = [translator.translate(s) for s in sentences]
    return ' '.join(trans)
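
# Usage note (hypothetical example):
#   get_translation('English', 'Hindi', "Hello there. How are you?")
# tokenizes the text into two sentences, translates each one to Hindi,
# and returns the translations joined by spaces.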
    

def truecasing_by_sentence_segmentation(input_text):
    """Capitalize the first letter of each sentence in the input text."""
    # Split the text into sentences.
    sentences = sent_tokenize(input_text, language='english')
    # Capitalize each sentence (note: str.capitalize() also lowercases
    # the rest of the sentence).
    sentences_capitalized = [s.capitalize() for s in sentences]
    # Rejoin the sentences and strip any stray space left before punctuation.
    text_truecase = re.sub(r" (?=[.,'!?:;])", "", ' '.join(sentences_capitalized))
    return text_truecase
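

# Usage sketch: one plausible way the helpers above compose end to end.
# The file names below ("input.mp4", "input.wav") are placeholder
# assumptions; moviepy's VideoFileClip (imported above) extracts the audio
# track into the WAV form that get_large_audio_transcription() expects.
if __name__ == "__main__":
    clip = VideoFileClip("input.mp4")        # assumed sample video
    clip.audio.write_audiofile("input.wav")  # save its audio track as WAV
    transcript = get_large_audio_transcription("input.wav")
    transcript = truecasing_by_sentence_segmentation(transcript)
    print(get_translation('English', 'Hindi', transcript))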