Samarth991 committed
Commit
fe5b216
1 Parent(s): 2144d43

Resolving audio path for file and url

Files changed (2):
  app.py +2 -0
  whisper_app.py +27 -20
app.py CHANGED
@@ -61,8 +61,10 @@ def audio_processor(wav_file,API_key,wav_model='small',llm='HuggingFace',tempera
 
     metadata = {"source": f"{wav_file}","duration":text_info['duration'],"language":text_info['language']}
     document = [Document(page_content=text_info['text'], metadata=metadata)]
+
     logger.info("Document",document)
     logging.info("Loading General Text Embeddings (GTE) model{}".format('thenlper/gte-large'))
+
     embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large',model_kwargs={"device": device})
     texts = process_documents(documents=document)
     global vector_db
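A side note on this hunk: logger.info("Document",document) passes document as a %-style formatting argument, but the message string contains no placeholder, so the stdlib logging machinery hits a formatting error when the record is emitted. A minimal sketch of the conventional lazy-formatting form, assuming logger is a standard logging.Logger (the stand-in document value below is illustrative only):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

document = ["stand-in for the Document list built above"]  # illustrative placeholder

# %-style placeholders are filled by the logging framework only when the
# record is actually emitted, so no formatting work is wasted on filtered logs.
logger.info("Document: %s", document)
logger.info("Loading General Text Embeddings (GTE) model %s", "thenlper/gte-large")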
whisper_app.py CHANGED
@@ -17,30 +17,37 @@ class WHISPERModel:
         clip_audio = whisper.pad_or_trim(audio_data, length=SAMPLE_RATE * conv_duration)
         result = self.model.transcribe(clip_audio)
         return result['language']
-
+
+    def read_audio(self,audio_path):
+        audio = None
+        try:
+            audio = whisper.load_audio(audio_path)
+        except IOError as err:
+            raise err
+        return audio
+
     def speech_to_text(self, audio_path):
         text_data = dict()
         audio_duration = 0
         conv_language = ""
-        r = requests.get(audio_path)
-        if r.status_code == 200:
-            try:
-                audio = whisper.load_audio(audio_path)
-                conv_language = self.get_info(audio)
-                if conv_language !='en':
-                    res = self.model.transcribe(audio,task='translate')
-                    if self.openai_flag:
-                        res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
-                else:
-                    res = self.model.transcribe(audio)
-                audio_duration = audio.shape[0] / SAMPLE_RATE
-                text_data['text'] = res['text']
-                text_data['duration'] = audio_duration
-                text_data['language'] = conv_language
-            except IOError as err:
-                raise f"Issue in loading audio {audio_path}"
-        else:
-            raise("Unable to reach for URL {}".format(audio_path))
+        if audio_path.startswith('http'):
+            r = requests.get(audio_path)
+            if r.status_code == 200:
+                audio = self.read_audio(audio_path)
+            else:
+                raise("Unable to reach for URL {}".format(audio_path))
+        if audio :
+            conv_language = self.get_info(audio)
+            if conv_language !='en':
+                res = self.model.transcribe(audio,task='translate')
+                if self.openai_flag:
+                    res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
+            else:
+                res = self.model.transcribe(audio)
+            audio_duration = audio.shape[0] / SAMPLE_RATE
+            text_data['text'] = res['text']
+            text_data['duration'] = audio_duration
+            text_data['language'] = conv_language
         return text_data
 
 
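As committed, the new speech_to_text still has a few pitfalls. For a local file the startswith('http') branch is skipped and audio is never assigned, so the following if audio : raises NameError; even when it is assigned, whisper.load_audio returns a NumPy array, whose truth value in a bare if is ambiguous and raises ValueError; raise("...") raises a plain string, which Python 3 rejects (exceptions must derive from BaseException); and requests.get downloads the whole response body only to discard it. Below is a minimal sketch of the same flow with those points addressed. It keeps the class's own helpers (read_audio, get_info, translate_text, including the orginal_text keyword spelling as defined elsewhere in the class) and is a suggestion, not part of the commit:

def speech_to_text(self, audio_path):
    text_data = dict()
    if audio_path.startswith('http'):
        # A HEAD request checks reachability without downloading the body;
        # whisper.load_audio hands the URL to ffmpeg, which streams it itself.
        r = requests.head(audio_path, allow_redirects=True)
        if r.status_code != 200:
            raise IOError("Unable to reach URL {}".format(audio_path))
    audio = self.read_audio(audio_path)  # handles both local paths and URLs
    if audio is not None:  # a bare `if audio:` is ambiguous for NumPy arrays
        conv_language = self.get_info(audio)
        if conv_language != 'en':
            res = self.model.transcribe(audio, task='translate')
            if self.openai_flag:
                res['text'] = self.translate_text(res['text'], orginal_text=conv_language, convert_to='English')
        else:
            res = self.model.transcribe(audio)
        text_data['text'] = res['text']
        text_data['duration'] = audio.shape[0] / SAMPLE_RATE
        text_data['language'] = conv_language
    return text_data

With that shape, both call styles the commit message describes behave the same way, e.g. model.speech_to_text('sample.wav') and model.speech_to_text('https://example.com/sample.wav').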