Aditi Tewari committed · Commit ee1587c · 1 Parent(s): fd31c98

Add files
Browse files:
- Dockerfile +24 -0
- requirements.txt +7 -0
- run.sh +1 -0
- soundscripter.html +147 -0
- soundscripter_flaskAPI.py +99 -0
- ss_flaskAPI.py +92 -0
- wave.png +0 -0
Dockerfile
ADDED
@@ -0,0 +1,24 @@
+# Use an official Python runtime as a parent image
+FROM python:3.8-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the current directory contents into the container at /app
+COPY . /app
+
+# Install ffmpeg and any needed packages specified in requirements.txt
+RUN apt-get update && \
+    apt-get install -y ffmpeg && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --no-cache-dir -r requirements.txt && \
+    pip install python-multipart
+
+# Make port 7860 available to the world outside this container
+EXPOSE 7860
+
+# Define environment variable for FastAPI
+ENV PYTHONUNBUFFERED=1
+
+# Command to run the application (the app is defined in soundscripter_flaskAPI.py, per run.sh)
+CMD ["uvicorn", "soundscripter_flaskAPI:app", "--host", "0.0.0.0", "--port", "7860"]
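The image bundles ffmpeg (needed by pydub for decoding) and serves the FastAPI app on port 7860. Below is a minimal client-side smoke test sketch; the image tag, sample.wav, and the requests package are all assumptions for illustration, not part of the commit.

# Hedged sketch: smoke-testing a locally running container, assuming it was
# started with something like:
#   docker build -t soundscripter . && docker run -p 7860:7860 soundscripter
# sample.wav is a hypothetical local recording; requests is a client-side extra.
import requests

# The root route should return soundscripter.html
resp = requests.get("http://localhost:7860/")
print(resp.status_code)  # expect 200

# POST a recording to the /asr endpoint; the multipart field is named "file"
with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/asr",
        files={"file": ("sample.wav", f, "audio/wav")},
    )
print(resp.json())  # {"text": "..."} on success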
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+fastapi
+uvicorn
+pydub
+SpeechRecognition
+numpy
+librosa
+sounddevice
run.sh
ADDED
@@ -0,0 +1 @@
+python -m uvicorn soundscripter_flaskAPI:app --reload
soundscripter.html
ADDED
@@ -0,0 +1,147 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>SoundScripter</title>
+    <!-- Add Bootstrap CSS link here -->
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/5.3.0/css/bootstrap.min.css">
+    <style>
+        * {
+            margin: 10px;
+        }
+    </style>
+</head>
+<body style="background-color:#daeaf7;">
+
+    <nav class="navbar" style="background-color:whitesmoke; height:98px; border-radius:30px">
+        <div>
+            <img src="wave.png" alt="SoundScripter" width="60" height="70" style="margin-top:-5px">
+            <span class="navbar-brand" style="font-size:35px;"><b>SoundScripter</b></span>
+        </div>
+        <h5>Automatic Speech Recognition</h5>
+    </nav>
+
+    <div class="container mt-5">
+        <div class="form-group">
+            <input type="file" class="form-control" id="audioUpload" style="height: 40px; width: 50%; max-width: 300px;">
+
+            <div><h6><b>OR</b></h6></div>
+            <button class="btn" id="recordButton" style="background-color: white; border-radius:30px; height:50px; width:150px;"><h3>Record</h3></button>
+        </div>
+        <div class="form-group" style="text-align: center;">
+            <button class="btn btn-danger" id="submitButton" style="border-radius:30px;"><b>UPLOAD</b></button>
+        </div>
+        <div class="form-group">
+            <textarea class="form-control" id="outputText" rows="5" style="border-radius:15px; margin-top:5%" readonly></textarea>
+        </div>
+    </div>
+
+    <!-- Add Bootstrap JS link and any other required scripts here -->
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.16.0/umd/popper.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/5.3.0/js/bootstrap.min.js"></script>
+    <script>
+        let isRecording = false;
+        let recordedChunks = [];
+        let uploadedFile;
+        let uploadedArrayBuffer;
+        let recordedAudioBlob;
+        let mediaRecorder;
+
+        const recordButton = document.getElementById("recordButton");
+        const submitButton = document.getElementById("submitButton");
+        const outputText = document.getElementById("outputText");
+        const audioUpload = document.getElementById("audioUpload");
+
+        let mediaStream; // Store the media stream to stop it later
+
+        recordButton.addEventListener("click", function () {
+            if (!isRecording) {
+                navigator.mediaDevices.getUserMedia({ audio: true })
+                    .then(function (stream) {
+                        mediaStream = stream; // Save the stream for stopping later
+                        mediaRecorder = new MediaRecorder(stream);
+                        recordedChunks = [];
+
+                        mediaRecorder.ondataavailable = function (e) {
+                            if (e.data.size > 0) {
+                                recordedChunks.push(e.data);
+                            }
+                        };
+
+                        mediaRecorder.onstop = function () {
+                            recordedAudioBlob = new Blob(recordedChunks, { type: "audio/wav" });
+                        };
+
+                        mediaRecorder.start();
+                        isRecording = true;
+                        recordButton.innerText = "Stop Recording";
+                    })
+                    .catch(function (err) {
+                        console.error("Error accessing microphone", err);
+                    });
+            } else {
+                mediaRecorder.stop();
+                mediaStream.getTracks().forEach(track => track.stop()); // Stop the media stream
+                isRecording = false;
+                recordButton.innerText = "RECORD";
+            }
+        });
+
+        audioUpload.addEventListener("change", function (event) {
+            uploadedFile = event.target.files[0];
+            if (uploadedFile) {
+                console.log("File uploading");
+                const fileReader = new FileReader();
+                fileReader.onload = function () {
+                    uploadedArrayBuffer = fileReader.result;
+                };
+
+                fileReader.readAsArrayBuffer(uploadedFile);
+            }
+        });
+
+        submitButton.addEventListener("click", function () {
+            if (recordedAudioBlob || uploadedFile) {
+                const formData = new FormData();
+
+                // The field name must match the "file" parameter of the /asr endpoint
+                if (recordedAudioBlob) {
+                    formData.append("file", recordedAudioBlob, "recorded_audio.wav");
+                } else if (uploadedFile) {
+                    formData.append("file", uploadedFile, uploadedFile.name);
+                }
+
+                fetch("http://localhost:8000/asr", {
+                    method: "POST",
+                    body: formData,
+                })
+                    .then((response) => response.json())
+                    .then((data) => {
+                        if (data.text) {
+                            outputText.value = data.text;
+                        } else {
+                            outputText.value = "No text recognized.";
+                        }
+                    })
+                    .catch((error) => {
+                        console.error("Error while connecting with backend", error);
+                        outputText.value = "Backend communication failed.";
+                    });
+
+                if (recordedAudioBlob) {
+                    const downloadLink = document.createElement("a");
+                    downloadLink.href = URL.createObjectURL(recordedAudioBlob);
+                    downloadLink.download = "recorded_audio.wav";
+                    downloadLink.style.display = "none";
+                    document.body.appendChild(downloadLink);
+                    downloadLink.click();
+                    URL.revokeObjectURL(downloadLink.href);
+                    document.body.removeChild(downloadLink);
+                }
+            } else {
+                outputText.value = "Please upload an audio file or record.";
+            }
+        });
+    </script>
+</body>
+</html>
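One caveat worth noting: most browsers' MediaRecorder emits WebM/Opus (or Ogg) rather than WAV, even though the blob above is named recorded_audio.wav, and the backend picks its pydub decoder from the filename extension. A hedged sketch of decoding by the real container format instead; the WebM assumption, the saved recorded_audio.wav file, and an ffmpeg install for pydub are all assumptions.

# Sketch: decode a browser recording by its actual container format (WebM is
# an assumption here) rather than trusting the .wav filename extension.
import io
from pydub import AudioSegment

with open("recorded_audio.wav", "rb") as f:  # hypothetical download from the page
    raw = f.read()

audio = AudioSegment.from_file(io.BytesIO(raw), format="webm")  # decode as WebM
wav_bytes = audio.export(format="wav").read()                   # re-encode as WAV
print(f"{len(wav_bytes)} bytes of WAV audio")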
soundscripter_flaskAPI.py
ADDED
@@ -0,0 +1,99 @@
+import io
+import os
+import uvicorn
+import sounddevice as sd
+import numpy as np
+import speech_recognition as sr
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.responses import HTMLResponse
+from pydub import AudioSegment
+import librosa
+from fastapi.middleware.cors import CORSMiddleware
+
+
+app = FastAPI()
+recognizer = sr.Recognizer()
+
+origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+@app.get("/", response_class=HTMLResponse)
+async def read_root():
+    # Serve the HTML file containing the front-end code
+    with open("soundscripter.html", "r") as file:
+        html_content = file.read()
+    return HTMLResponse(content=html_content)
+
+
+def convert_audio_format(input_data, input_format, output_format='wav'):
+    # Convert audio data to WAV format
+    audio = AudioSegment.from_file(io.BytesIO(input_data), format=input_format)
+    output_data = audio.export(format=output_format).read()
+    return output_data
+
+def recognize_speech(audio_data, language="hi-IN"):
+    with io.BytesIO(audio_data) as audio_io:
+        with sr.AudioFile(audio_io) as source:
+            audio = recognizer.record(source)
+    try:
+        text = recognizer.recognize_google(audio, language=language)
+        return text
+    except sr.UnknownValueError:
+        return "Speech not recognized."
+    except sr.RequestError as e:
+        return f"API request failed: {e}"
+
+@app.post("/asr")
+async def transcribe_audio(file: UploadFile = File(...)):
+    contents = await file.read()
+
+    # Determine the input audio format (assumes the format is part of the file name)
+    input_format = file.filename.split('.')[-1].lower()
+
+    # Convert the audio to WAV format
+    wav_data = convert_audio_format(contents, input_format)
+
+    # Save the received audio in WAV format for future analysis (optional)
+    wav_file_path = "received_audio.wav"
+    with open(wav_file_path, "wb") as f:
+        f.write(wav_data)
+
+    # Transcribe the audio
+    result = recognize_speech(wav_data)
+
+    return JSONResponse(content={"text": result})
+
+# @app.post("/asr/live")
+# async def transcribe_live_audio():
+#     fs = 16000  # Target sample rate
+#     duration = 3  # seconds
+#     chunks = int(fs * duration)
+
+#     # Record live audio
+#     audio_data = sd.rec(chunks, samplerate=fs, channels=1, dtype=np.float32)
+#     sd.wait()
+
+#     # Resample the audio data to the target sample rate
+#     audio_data_resampled = librosa.resample(audio_data.flatten(), orig_sr=fs, target_sr=16000)
+
+#     # Convert audio data to bytes (use np.int16)
+#     audio_bytes = audio_data_resampled.astype(np.int16).tobytes()
+
+#     # Transcribe the audio
+#     result = recognize_speech(audio_bytes)
+
+#     return {"Text": result}
+
+# Run the FastAPI app
+# if __name__ == "__main__":
+#     uvicorn.run(app, host="127.0.0.1", port=8000)
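The /asr route can also be exercised in-process, without a running server, via FastAPI's TestClient. A minimal sketch, assuming the httpx package is installed (TestClient depends on it) and that sample.wav is a hypothetical local WAV file:

# Hedged sketch of an in-process test for the /asr endpoint; sample.wav and
# the httpx dependency are assumptions, not part of the commit.
from fastapi.testclient import TestClient
from soundscripter_flaskAPI import app

client = TestClient(app)

with open("sample.wav", "rb") as f:
    response = client.post("/asr", files={"file": ("sample.wav", f, "audio/wav")})

print(response.status_code)  # expect 200
print(response.json())       # {"text": "<transcript>"} on success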
ss_flaskAPI.py
ADDED
@@ -0,0 +1,92 @@
+import os
+import shutil
+import subprocess
+import tempfile
+from fastapi import FastAPI, File, UploadFile, Form
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.responses import HTMLResponse
+from pydub import AudioSegment
+import speech_recognition as sr
+
+r = sr.Recognizer()
+
+app = FastAPI()
+
+def resample_audio(input_path, output_path, target_sample_rate):
+    ffmpeg_cmd = [
+        "ffmpeg",
+        "-i", input_path,
+        "-ar", str(target_sample_rate),
+        output_path
+    ]
+    subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+@app.get("/", response_class=HTMLResponse)
+async def read_root():
+    # Serve the HTML file containing the front-end code
+    with open("soundscripter.html", "r") as file:
+        html_content = file.read()
+    return html_content
+
+def get_sampling_rate(audio_file_path):
+    audio = AudioSegment.from_file(audio_file_path)
+    return audio.frame_rate
+
+@app.post("/process_audio")
+async def process_audio(audio: UploadFile = File(...), language: str = Form(...)):
+    if not audio or not language:
+        return JSONResponse(content={"success": False}, status_code=400)
+
+    # Check that the uploaded file is in WAV format
+    if audio.content_type != "audio/wav":
+        return JSONResponse(content={"success": False, "message": "Audio must be in WAV format."}, status_code=400)
+
+    try:
+        # Save the received audio to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            shutil.copyfileobj(audio.file, temp_file)
+
+        # Print the file path for debugging
+        print(temp_file_path)
+        output_path = tempfile.mktemp(suffix=".wav")
+
+        # Resample the audio to 16000 Hz
+        resample_audio(temp_file_path, output_path, target_sample_rate=16000)
+        print(output_path)
+
+        # Get the sampling rate of the resampled audio
+        sampling_rate = get_sampling_rate(output_path)
+
+        # Verify that the resampling produced 16 kHz audio
+        if sampling_rate != 16000:
+            return JSONResponse(content={"success": False, "message": "Sample rate is not 16000 Hz."}, status_code=500)
+
+    except Exception as e:
+        print("Error processing audio:", e)
+        return JSONResponse(content={"success": False, "message": "Error processing audio."}, status_code=500)
+    # finally:
+    #     # Cleanup: remove the temporary received audio file
+    #     if os.path.exists(temp_file_path):
+    #         os.remove(temp_file_path)
+    return JSONResponse(content={"success": True, "language": calling_asr(output_path, "hi-IN")})
+
+def calling_asr(wav_file, lid):
+    AUDIO_FILE = wav_file
+    # aud_name = AUDIO_FILE.split('/')[-1].split('.')[0]
+    # file = open(wav_file + ".txt", "w")
+    text = "can't read wav file"
+    try:
+        with sr.AudioFile(AUDIO_FILE) as source:
+            audio = r.record(source)
+            text = r.recognize_google(audio, language=lid)
+            # file.write(aud_name + "\t" + text)
+            return text
+    except Exception:
+        # file.write(" " + "Error in segment" + " ")
+        return text
+    # file.close()
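Unlike /asr, the /process_audio route takes two multipart fields: the audio file (which must be posted as audio/wav) and a language form value, although calling_asr is currently invoked with a hard-coded "hi-IN". A minimal client sketch; the port, URL, sample.wav, and the requests package are assumptions, and it presumes this module is served with uvicorn (e.g. ss_flaskAPI:app) on its default port 8000:

# Hedged sketch of posting to /process_audio; localhost:8000 and sample.wav
# are assumptions for illustration.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/process_audio",
        files={"audio": ("sample.wav", f, "audio/wav")},  # content type must be audio/wav
        data={"language": "hi-IN"},
    )
print(resp.json())  # {"success": true, "language": "<transcript>"} on success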
wave.png
ADDED