change whisper
Browse files
serve.py
CHANGED
@@ -7,13 +7,17 @@ import io
|
|
7 |
from pydub import AudioSegment
|
8 |
import time
|
9 |
import logging
|
|
|
10 |
|
11 |
logging.basicConfig(level=logging.INFO)
|
12 |
logger = logging.getLogger(__name__)
|
13 |
|
14 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
15 |
|
16 |
-
model = load_model(device)
|
|
|
|
|
|
|
17 |
|
18 |
def transcribe_audio(audio_data_bytes):
|
19 |
try:
|
@@ -33,6 +37,35 @@ def transcribe_audio(audio_data_bytes):
|
|
33 |
return result
|
34 |
except Exception as e:
|
35 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
app = FastAPI()
|
38 |
|
|
|
7 |
from pydub import AudioSegment
|
8 |
import time
|
9 |
import logging
|
10 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
11 |
|
12 |
logging.basicConfig(level=logging.INFO)
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
16 |
|
17 |
+
# Load the Japanese fine-tuned Whisper checkpoint once at import time.
# NOTE: WhisperProcessor is a plain (feature-extractor + tokenizer) wrapper,
# not an nn.Module — it has no .to(); calling .to(device) on it raises
# AttributeError. Only the model is moved to the target device.
processor = WhisperProcessor.from_pretrained("Ivydata/whisper-small-japanese")
model = WhisperForConditionalGeneration.from_pretrained("Ivydata/whisper-small-japanese").to(device)
|
21 |
|
22 |
def transcribe_audio(audio_data_bytes):
|
23 |
try:
|
|
|
37 |
return result
|
38 |
except Exception as e:
|
39 |
raise HTTPException(status_code=500, detail=str(e))
|
40 |
+
|
41 |
+
def transcribe_whisper(audio_data_bytes):
    """Transcribe MP3 audio bytes to text with the Whisper model.

    Args:
        audio_data_bytes: Raw bytes of an MP3 file.

    Returns:
        dict: ``{"text": <transcribed text>}``.

    Raises:
        HTTPException: status 500 carrying the underlying error message on
            any decoding/inference failure.
    """
    try:
        start_time = time.time()
        audio_segment = AudioSegment.from_mp3(io.BytesIO(audio_data_bytes))

        # Whisper's feature extractor expects 16 kHz mono input; resample and
        # downmix so MP3s at arbitrary rates/channel counts are handled
        # correctly (stereo would otherwise interleave channels into one array).
        audio_segment = audio_segment.set_frame_rate(16000).set_channels(1)

        # Samples arrive as int16; normalize to float32 in [-1, 1].
        audio_data_int16 = np.array(audio_segment.get_array_of_samples())
        audio_data_float32 = audio_data_int16.astype(np.float32) / 32768.0

        # Extract log-mel features and move them to the model's device.
        input_features = processor(audio=audio_data_float32,
                                   sampling_rate=audio_segment.frame_rate,
                                   return_tensors="pt").input_features.to(device)

        predicted_ids = model.generate(input_features=input_features)

        decoded = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        # batch_decode returns a list (one entry per batch item); fall back to
        # str() defensively if the shape is unexpected.
        result_text = decoded[0] if isinstance(decoded, list) and len(decoded) > 0 else str(decoded)

        end_time = time.time()
        # Use the module logger (configured at file top) instead of print.
        logger.info("Time taken: %s seconds", end_time - start_time)
        return {"text": result_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
68 |
+
|
69 |
|
70 |
app = FastAPI()
|
71 |
|
uv.lock
CHANGED
The diff for this file is too large to render.
See raw diff
|
|