File size: 2,685 Bytes
2faf743
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import logging
import os

import openai
import requests
import torch as th
import whisper
from tenacity import retry, wait_random
from whisper.audio import SAMPLE_RATE

# os.environ['OPENAI_API_KEY'] = "sk-<API KEY>"

class WHISPERModel:
    """Speech-to-text wrapper around OpenAI Whisper with optional GPT translation.

    Transcribes audio fetched from a URL; non-English audio is translated to
    English via Whisper's built-in `translate` task, optionally refined through
    the OpenAI completions API when `openai_flag` is set.
    """

    def __init__(self, model_name='small', device='cuda', openai_flag=False):
        """Load a Whisper checkpoint onto `device`.

        Args:
            model_name: Whisper checkpoint name (e.g. 'tiny', 'small', 'large').
            device: torch device string the model is loaded on.
            openai_flag: when True, pass non-English transcripts through the
                OpenAI completions API for an additional translation pass.
        """
        self.device = device
        self.openai_flag = openai_flag
        # Fix: speech_to_text() used self.logger, but no logger was ever
        # created, so the first call raised AttributeError.
        self.logger = logging.getLogger(__name__)
        self.model = whisper.load_model(model_name, device=self.device)

    def get_info(self, audio_data, conv_duration=30):
        """Detect the spoken language from the first `conv_duration` seconds.

        Args:
            audio_data: waveform array as returned by whisper.load_audio.
            conv_duration: number of seconds used for language detection.

        Returns:
            ISO language code reported by Whisper (e.g. 'en', 'ar').
        """
        clip_audio = whisper.pad_or_trim(audio_data, length=SAMPLE_RATE * conv_duration)
        result = self.model.transcribe(clip_audio)
        return result['language']

    def speech_to_text(self, audio_path):
        """Fetch `audio_path` (a URL), transcribe it, translating to English if needed.

        Args:
            audio_path: URL of the audio recording.

        Returns:
            dict with keys 'text' (English transcript), 'duration' (seconds),
            and 'language' (detected ISO code).

        Raises:
            IOError: if the audio cannot be loaded/decoded.
            ConnectionError: if the URL does not answer with HTTP 200.
        """
        self.logger.info("Reading url %s", audio_path)
        text_data = dict()
        # Probe reachability with a streamed request so the body is not
        # downloaded twice (whisper.load_audio fetches/decodes it itself).
        r = requests.get(audio_path, stream=True)
        if r.status_code != 200:
            # Fix: original `raise("...")` raised a str, which is itself a
            # TypeError ("exceptions must derive from BaseException").
            raise ConnectionError("Unable to reach for URL {}".format(audio_path))
        r.close()
        try:
            audio = whisper.load_audio(audio_path)
            conv_language = self.get_info(audio)
            if conv_language != 'en':
                # Whisper's translate task already outputs English text.
                res = self.model.transcribe(audio, task='translate')
                if self.openai_flag:
                    # Parameter name 'orginal_text' (sic) kept for
                    # backward compatibility with existing callers.
                    res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
            else:
                res = self.model.transcribe(audio)
        except IOError as err:
            # Fix: original raised an f-string (a str, so TypeError);
            # raise a real exception and chain the original cause.
            raise IOError("Issue in loading audio {}".format(audio_path)) from err
        text_data['text'] = res['text']
        text_data['duration'] = audio.shape[0] / SAMPLE_RATE
        text_data['language'] = conv_language
        return text_data

    @retry(wait=wait_random(min=5, max=10))
    def translate_text(self, text, orginal_text='ar', convert_to='english'):
        """Translate `text` using the OpenAI completions API (retried on failure).

        Args:
            text: source-language text to translate.
            orginal_text: source language code/name (typo'd name preserved so
                keyword callers keep working).
            convert_to: target language name.

        Returns:
            The translated text, stripped of surrounding whitespace.
        """
        # Fix: the trailing '\n{convert_to}:' was a plain (non-f) string, so
        # the literal braces were sent to the API; interpolate it properly.
        prompt = (
            f'Translate the following {orginal_text} text to {convert_to}:\n\n'
            f'{orginal_text}: {text}\n{convert_to}:'
        )
        # Generate response using the completions endpoint.
        response = openai.Completion.create(
            engine='text-davinci-003',
            prompt=prompt,
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0.7,
        )
        # Extract the translated text from the first choice.
        return response.choices[0].text.strip()

if __name__ == '__main__':
    # Ad-hoc smoke test: transcribe one sample recording from its URL.
    recording_url = "https://prypto-api.aswat.co/surveillance/recordings/5f53c28b-3504-4b8b-9db5-0c8b69a96233.mp3"
    transcriber = WHISPERModel()
    transcript = transcriber.speech_to_text(recording_url)