dioarafl committed on
Commit
f2fc28a
1 Parent(s): bf15ff0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -24
app.py CHANGED
@@ -1,13 +1,11 @@
1
- import cv2
2
  import gradio as gr
3
- import tempfile
 
4
  import torch
5
  import torchaudio
6
  from torchvision.models.detection import fasterrcnn_resnet50_fpn
7
  import torchvision.transforms as transforms
8
  from PIL import Image
9
- import numpy as np
10
- import soundfile as sf
11
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
12
 
13
  class FasterRCNNDetector:
@@ -54,23 +52,9 @@ class JarvisModels:
54
  self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
55
 
56
  async def generate_response(self, prompt):
57
- # Logic for generating a response: stream tokens from the text-generation client and join them
58
- generate_kwargs = dict(
59
- temperature=0.6,
60
- max_new_tokens=256,
61
- top_p=0.95,
62
- repetition_penalty=1,
63
- do_sample=True,
64
- seed=42,
65
- )
66
- formatted_prompt = system_instructions1 + prompt + "[JARVIS]"  # NOTE(review): system_instructions1 is not defined in the visible hunks - confirm it exists
67
- stream = self.client1.text_generation(  # NOTE(review): self.client1 is not assigned in the visible hunks - confirm it exists
68
- formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True)
69
- output = ""
70
- for response in stream:
71
- output += response.token.text  # append the text of each streamed token
72
-
73
- return output
74
 
75
  async def transcribe_audio(self, audio_file):
76
  input_audio, _ = torchaudio.load(audio_file)
@@ -80,12 +64,36 @@ class JarvisModels:
80
  transcription = self.processor.batch_decode(predicted_ids)
81
  return transcription[0]
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  detector = FasterRCNNDetector()
84
 
85
  iface = gr.Interface(
86
- fn=[detector.detect_objects, JarvisModels().transcribe_audio],
87
- inputs=gr.inputs.Video(label="Webcam", parameters={"fps": 30}),
88
- outputs=[gr.outputs.Image(), "text"],
 
 
 
 
 
 
 
 
89
  title="Vision and Speech Interface",
90
  description="This interface detects objects in the webcam feed and transcribes speech recorded through the microphone."
91
  )
 
 
1
  import gradio as gr
2
+ import subprocess
3
+ import cv2
4
  import torch
5
  import torchaudio
6
  from torchvision.models.detection import fasterrcnn_resnet50_fpn
7
  import torchvision.transforms as transforms
8
  from PIL import Image
 
 
9
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
10
 
11
  class FasterRCNNDetector:
 
52
  self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
53
 
54
  async def generate_response(self, prompt):
55
+ # Logic for generating a response: load the "models/openai-community/gpt2" interface and pass the prompt to it
56
+ response = gr.Interface.load("models/openai-community/gpt2").process(prompt)  # NOTE(review): confirm the loaded interface exposes .process() and what it returns
57
+ return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  async def transcribe_audio(self, audio_file):
60
  input_audio, _ = torchaudio.load(audio_file)
 
64
  transcription = self.processor.batch_decode(predicted_ids)
65
  return transcription[0]
66
 
67
+ def transcribe(audio):
68
+ global messages  # NOTE(review): messages is not read or assigned anywhere in this function
69
+
70
+ audio_file = open(audio, "rb")  # NOTE(review): opened but never read or closed in this function
71
+ # Transcribe the audio locally (placeholder text; add real transcription logic as needed)
72
+ transcript = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
73
+
74
+ # Response logic (placeholder text; add logic to generate a real response as needed)
75
+ system_message = {"role": "system", "content": "Lorem ipsum dolor sit amet, consectetur adipiscing elit."}
76
+
77
+ subprocess.call(["say", system_message['content']])  # runs the external "say" command - presumably the macOS text-to-speech CLI; verify it exists on the host
78
+
79
+ chat_transcript = "User: " + transcript + "\n\n" + "System: " + system_message['content'] + "\n\n"
80
+
81
+ return chat_transcript
82
+
83
  detector = FasterRCNNDetector()
84
 
85
  iface = gr.Interface(
86
+ fn=[detector.detect_objects, JarvisModels().transcribe_audio, JarvisModels().generate_response, transcribe],  # NOTE(review): fn is a list of four callables and JarvisModels() is constructed twice - confirm gr.Interface accepts a list here
87
+ inputs=[
88
+ gr.inputs.Video(label="Webcam", parameters={"fps": 30}),
89
+ gr.inputs.Audio(source="microphone", type="filepath")
90
+ ],
91
+ outputs=[
92
+ gr.outputs.Image(),
93
+ "text",
94
+ "text",
95
+ "text"
96
+ ],
97
  title="Vision and Speech Interface",
98
  description="This interface detects objects in the webcam feed and transcribes speech recorded through the microphone."
99
  )