Samarth991 committed
Commit
fe5b216
1 Parent(s): 2144d43

Resolving audio path for file and url

Files changed (2):
  app.py +2 -0
  whisper_app.py +27 -20
app.py CHANGED
@@ -61,8 +61,10 @@ def audio_processor(wav_file,API_key,wav_model='small',llm='HuggingFace',tempera
 
     metadata = {"source": f"{wav_file}","duration":text_info['duration'],"language":text_info['language']}
     document = [Document(page_content=text_info['text'], metadata=metadata)]
+
     logger.info("Document",document)
     logging.info("Loading General Text Embeddings (GTE) model{}".format('thenlper/gte-large'))
+
     embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large',model_kwargs={"device": device})
     texts = process_documents(documents=document)
     global vector_db
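A side note on this hunk: logger.info("Document",document) passes document as a %-style formatting argument, but the message string contains no placeholder, so the stdlib logging machinery hits a formatting error when the record is emitted. A minimal sketch of the conventional lazy-formatting form, assuming logger is a standard logging.Logger (the stand-in document value below is illustrative only):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

document = ["stand-in for the Document list built above"]  # illustrative placeholder

# %-style placeholders are filled by the logging framework only when the
# record is actually emitted, so no formatting work is wasted on filtered logs.
logger.info("Document: %s", document)
logger.info("Loading General Text Embeddings (GTE) model %s", "thenlper/gte-large")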
whisper_app.py CHANGED
@@ -17,30 +17,37 @@ class WHISPERModel:
         clip_audio = whisper.pad_or_trim(audio_data, length=SAMPLE_RATE * conv_duration)
         result = self.model.transcribe(clip_audio)
         return result['language']
-
+
+    def read_audio(self,audio_path):
+        audio = None
+        try:
+            audio = whisper.load_audio(audio_path)
+        except IOError as err:
+            raise err
+        return audio
+
     def speech_to_text(self, audio_path):
         text_data = dict()
         audio_duration = 0
         conv_language = ""
-        r = requests.get(audio_path)
-        if r.status_code == 200:
-            try:
-                audio = whisper.load_audio(audio_path)
-                conv_language = self.get_info(audio)
-                if conv_language !='en':
-                    res = self.model.transcribe(audio,task='translate')
-                    if self.openai_flag:
-                        res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
-                else:
-                    res = self.model.transcribe(audio)
-                audio_duration = audio.shape[0] / SAMPLE_RATE
-                text_data['text'] = res['text']
-                text_data['duration'] = audio_duration
-                text_data['language'] = conv_language
-            except IOError as err:
-                raise f"Issue in loading audio {audio_path}"
-        else:
-            raise("Unable to reach for URL {}".format(audio_path))
+        if audio_path.startswith('http'):
+            r = requests.get(audio_path)
+            if r.status_code == 200:
+                audio = self.read_audio(audio_path)
+            else:
+                raise("Unable to reach for URL {}".format(audio_path))
+        if audio :
+            conv_language = self.get_info(audio)
+            if conv_language !='en':
+                res = self.model.transcribe(audio,task='translate')
+                if self.openai_flag:
+                    res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
+            else:
+                res = self.model.transcribe(audio)
+            audio_duration = audio.shape[0] / SAMPLE_RATE
+            text_data['text'] = res['text']
+            text_data['duration'] = audio_duration
+            text_data['language'] = conv_language
         return text_data
 
 
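As committed, the new speech_to_text still has a few pitfalls. For a local file the startswith('http') branch is skipped and audio is never assigned, so the following if audio : raises NameError; even when it is assigned, whisper.load_audio returns a NumPy array, whose truth value in a bare if is ambiguous and raises ValueError; raise("...") raises a plain string, which Python 3 rejects (exceptions must derive from BaseException); and requests.get downloads the whole response body only to discard it. Below is a minimal sketch of the same flow with those points addressed. It keeps the class's own helpers (read_audio, get_info, translate_text, including the orginal_text keyword spelling as defined elsewhere in the class) and is a suggestion, not part of the commit:

def speech_to_text(self, audio_path):
    text_data = dict()
    if audio_path.startswith('http'):
        # A HEAD request checks reachability without downloading the body;
        # whisper.load_audio hands the URL to ffmpeg, which streams it itself.
        r = requests.head(audio_path, allow_redirects=True)
        if r.status_code != 200:
            raise IOError("Unable to reach URL {}".format(audio_path))
    audio = self.read_audio(audio_path)  # handles both local paths and URLs
    if audio is not None:  # a bare `if audio:` is ambiguous for NumPy arrays
        conv_language = self.get_info(audio)
        if conv_language != 'en':
            res = self.model.transcribe(audio, task='translate')
            if self.openai_flag:
                res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
        else:
            res = self.model.transcribe(audio)
        text_data['text'] = res['text']
        text_data['duration'] = audio.shape[0] / SAMPLE_RATE
        text_data['language'] = conv_language
    return text_data

With that shape, both call styles the commit message describes behave the same way, e.g. model.speech_to_text('sample.wav') and model.speech_to_text('https://example.com/sample.wav').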