Aditi Tewari committed
Commit ee1587c · 1 Parent(s): fd31c98
Files changed (7)
  1. Dockerfile +24 -0
  2. requirements.txt +7 -0
  3. run.sh +1 -0
  4. soundscripter.html +147 -0
  5. soundscripter_flaskAPI.py +99 -0
  6. ss_flaskAPI.py +92 -0
  7. wave.png +0 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.8-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the current directory contents into the container at /app
+ COPY . /app
+
+ # Install the ffmpeg system package (needed for audio conversion) and Python dependencies
+ RUN apt-get update && \
+     apt-get install -y ffmpeg && \
+     rm -rf /var/lib/apt/lists/* && \
+     pip install --no-cache-dir -r requirements.txt && \
+     pip install --no-cache-dir python-multipart
+
+ # Make port 7860 available to the world outside this container
+ EXPOSE 7860
+
+ # Send Python output straight to the container log, unbuffered
+ ENV PYTHONUNBUFFERED=1
+
+ # Command to run the application (the FastAPI app lives in soundscripter_flaskAPI.py)
+ CMD ["uvicorn", "soundscripter_flaskAPI:app", "--host", "0.0.0.0", "--port", "7860"]
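One way to sanity-check the built image (not part of this commit) is to start a container with port 7860 published and request the root route, which serves soundscripter.html. A minimal sketch using only the standard library; the image tag soundscripter and the docker commands in the comments are illustrative assumptions:

# smoke_test.py — assumes the container is already running, e.g.:
#   docker build -t soundscripter .
#   docker run -p 7860:7860 soundscripter
import urllib.request

with urllib.request.urlopen("http://localhost:7860/", timeout=10) as resp:
    body = resp.read().decode("utf-8")
    # The root route returns the front-end page, so the app title should appear
    assert resp.status == 200

assert "SoundScripter" in body
print("Container is serving the front-end page.")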
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi
+ uvicorn
+ pydub
+ SpeechRecognition
+ numpy
+ librosa
+ sounddevice
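Two runtime dependencies are not in this list: ffmpeg, a system package that pydub and the ffmpeg subprocess call in ss_flaskAPI.py rely on, and python-multipart, which FastAPI requires to parse UploadFile and Form fields. Both are installed by the Dockerfile's RUN step instead.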
run.sh ADDED
@@ -0,0 +1 @@
+ python -m uvicorn soundscripter_flaskAPI:app --reload
soundscripter.html ADDED
@@ -0,0 +1,147 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>SoundScripter</title>
+     <!-- Add Bootstrap CSS link here -->
+     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/5.3.0/css/bootstrap.min.css">
+     <style>
+         * {
+             margin: 10px;
+         }
+     </style>
+ </head>
+ <body style="background-color:#daeaf7;">
+
+ <nav class="navbar" style="background-color:whitesmoke; height:98px; border-radius:30px">
+     <div>
+         <img src="wave.png" alt="SoundScripter" width="60" height="70" style="margin-top:-5px">
+         <span class="navbar-brand" style="font-size:35px;"><b>SoundScripter</b></span>
+     </div>
+     <h5>Automatic Speech Recognition</h5>
+ </nav>
+
+ <div class="container mt-5">
+     <div class="form-group">
+         <input type="file" class="form-control" id="audioUpload" style="height: 40px; width: 50%; max-width: 300px;">
+         <div><h6><b>OR</b></h6></div>
+         <button class="btn" id="recordButton" style="background-color: white; border-radius:30px; height:50px; width:150px;"><h3>Record</h3></button>
+     </div>
+     <div class="form-group" style="text-align: center;">
+         <button class="btn btn-danger" id="submitButton" style="border-radius:30px;"><b>UPLOAD</b></button>
+     </div>
+     <div class="form-group">
+         <textarea class="form-control" id="outputText" rows="5" style="border-radius:15px; margin-top:5%" readonly></textarea>
+     </div>
+ </div>
+
+ <!-- Add Bootstrap JS link and any other required scripts here -->
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.16.0/umd/popper.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/5.3.0/js/bootstrap.min.js"></script>
+ <script>
+     let isRecording = false;
+     let recordedChunks = [];
+     let uploadedFile;
+     let uploadedArrayBuffer;
+     let recordedAudioBlob;
+     let mediaRecorder;
+     let mediaStream; // Store the media stream to stop it later
+
+     const recordButton = document.getElementById("recordButton");
+     const submitButton = document.getElementById("submitButton");
+     const outputText = document.getElementById("outputText");
+     const audioUpload = document.getElementById("audioUpload");
+
+     recordButton.addEventListener("click", function () {
+         if (!isRecording) {
+             navigator.mediaDevices.getUserMedia({ audio: true })
+                 .then(function (stream) {
+                     mediaStream = stream; // Save the stream for stopping later
+                     mediaRecorder = new MediaRecorder(stream);
+                     recordedChunks = [];
+
+                     mediaRecorder.ondataavailable = function (e) {
+                         if (e.data.size > 0) {
+                             recordedChunks.push(e.data);
+                         }
+                     };
+
+                     mediaRecorder.onstop = function () {
+                         // Note: most browsers record WebM/Ogg; the blob is merely labeled WAV here
+                         recordedAudioBlob = new Blob(recordedChunks, { type: "audio/wav" });
+                     };
+
+                     mediaRecorder.start();
+                     isRecording = true;
+                     recordButton.innerText = "Stop Recording";
+                 })
+                 .catch(function (err) {
+                     console.error("Error accessing microphone", err);
+                 });
+         } else {
+             mediaRecorder.stop();
+             mediaStream.getTracks().forEach(track => track.stop()); // Stop the media stream
+             isRecording = false;
+             recordButton.innerText = "Record";
+         }
+     });
+
+     audioUpload.addEventListener("change", function (event) {
+         uploadedFile = event.target.files[0];
+         if (uploadedFile) {
+             console.log("File selected");
+             const fileReader = new FileReader();
+             fileReader.onload = function () {
+                 uploadedArrayBuffer = fileReader.result;
+             };
+             fileReader.readAsArrayBuffer(uploadedFile);
+         }
+     });
+
+     submitButton.addEventListener("click", function () {
+         if (recordedAudioBlob || uploadedFile) {
+             const formData = new FormData();
+
+             // Field name must match the FastAPI parameter ("file" in transcribe_audio)
+             if (recordedAudioBlob) {
+                 formData.append("file", recordedAudioBlob, "recorded_audio.wav");
+             } else if (uploadedFile) {
+                 formData.append("file", uploadedFile, uploadedFile.name);
+             }
+
+             fetch("http://localhost:8000/asr", {
+                 method: "POST",
+                 body: formData,
+             })
+                 .then((response) => response.json())
+                 .then((data) => {
+                     // The backend responds with {"text": "..."}
+                     if (data.text) {
+                         outputText.value = data.text;
+                     } else {
+                         outputText.value = "No text recognized.";
+                     }
+                 })
+                 .catch((error) => {
+                     console.error("Error while connecting with backend", error);
+                     outputText.value = "Backend communication failed.";
+                 });
+
+             // Offer the recorded audio to the user as a download
+             if (recordedAudioBlob) {
+                 const downloadLink = document.createElement("a");
+                 downloadLink.href = URL.createObjectURL(recordedAudioBlob);
+                 downloadLink.download = "recorded_audio.wav";
+                 downloadLink.style.display = "none";
+                 document.body.appendChild(downloadLink);
+                 downloadLink.click();
+                 URL.revokeObjectURL(downloadLink.href);
+                 document.body.removeChild(downloadLink);
+             }
+         } else {
+             outputText.value = "Please upload an audio file or record one.";
+         }
+     });
+ </script>
+ </body>
+ </html>
soundscripter_flaskAPI.py ADDED
@@ -0,0 +1,99 @@
+ import io
+ import os
+ import uvicorn
+ import sounddevice as sd
+ import numpy as np
+ import speech_recognition as sr
+ from fastapi import FastAPI, File, UploadFile
+ from fastapi.responses import JSONResponse, HTMLResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydub import AudioSegment
+ import librosa
+
+
+ app = FastAPI()
+ recognizer = sr.Recognizer()
+
+ origins = ["*"]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.get("/", response_class=HTMLResponse)
+ async def read_root():
+     # Serve the HTML file containing the front-end code
+     with open("soundscripter.html", "r") as file:
+         html_content = file.read()
+     return HTMLResponse(content=html_content)
+
+
+ def convert_audio_format(input_data, input_format, output_format="wav"):
+     # Convert the raw audio bytes to WAV using pydub (requires ffmpeg)
+     audio = AudioSegment.from_file(io.BytesIO(input_data), format=input_format)
+     output_data = audio.export(format=output_format).read()
+     return output_data
+
+ def recognize_speech(audio_data, language="hi-IN"):
+     # Run Google Web Speech recognition on in-memory WAV bytes
+     with io.BytesIO(audio_data) as audio_io:
+         with sr.AudioFile(audio_io) as source:
+             audio = recognizer.record(source)
+     try:
+         text = recognizer.recognize_google(audio, language=language)
+         return text
+     except sr.UnknownValueError:
+         return "Speech not recognized."
+     except sr.RequestError as e:
+         return f"API request failed: {e}"
+
+ @app.post("/asr")
+ async def transcribe_audio(file: UploadFile = File(...)):
+     contents = await file.read()
+
+     # Determine the input audio format from the file extension
+     input_format = file.filename.split(".")[-1].lower()
+
+     # Convert the audio to WAV format
+     wav_data = convert_audio_format(contents, input_format)
+
+     # Save the received audio in WAV format for future analysis (optional)
+     wav_file_path = "received_audio.wav"
+     with open(wav_file_path, "wb") as f:
+         f.write(wav_data)
+
+     # Transcribe the audio
+     result = recognize_speech(wav_data)
+
+     return JSONResponse(content={"text": result})
+
+ # @app.post("/asr/live")
+ # async def transcribe_live_audio():
+ #     fs = 16000  # Target sample rate
+ #     duration = 3  # seconds
+ #     chunks = int(fs * duration)
+
+ #     # Record live audio
+ #     audio_data = sd.rec(chunks, samplerate=fs, channels=1, dtype=np.float32)
+ #     sd.wait()
+
+ #     # Resample the audio data to the target sample rate
+ #     audio_data_resampled = librosa.resample(audio_data.flatten(), orig_sr=fs, target_sr=16000)
+
+ #     # Convert audio data to bytes (use np.int16)
+ #     audio_bytes = audio_data_resampled.astype(np.int16).tobytes()
+
+ #     # Transcribe the audio
+ #     result = recognize_speech(audio_bytes)
+
+ #     return {"Text": result}
+
+ # Run the FastAPI app locally
+ # if __name__ == "__main__":
+ #     uvicorn.run(app, host="127.0.0.1", port=8000)
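To exercise the /asr endpoint outside the browser, a small client sketch can help. It assumes the server was started via run.sh (uvicorn's default http://127.0.0.1:8000), that a WAV file exists at the placeholder path sample.wav, and that the requests package is installed (it is not in requirements.txt):

# asr_client.py — example client for the /asr endpoint
import requests  # assumption: installed separately

# "file" must match the UploadFile parameter name in transcribe_audio()
with open("sample.wav", "rb") as f:  # placeholder path
    resp = requests.post(
        "http://127.0.0.1:8000/asr",
        files={"file": ("sample.wav", f, "audio/wav")},
    )

resp.raise_for_status()
print(resp.json()["text"])  # the endpoint returns {"text": "<transcription>"}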
ss_flaskAPI.py ADDED
@@ -0,0 +1,92 @@
+ import os
+ import shutil
+ import subprocess
+ import tempfile
+ import speech_recognition as sr
+ from fastapi import FastAPI, File, UploadFile, Form
+ from fastapi.responses import JSONResponse, HTMLResponse
+ from pydub import AudioSegment
+
+ r = sr.Recognizer()
+
+ app = FastAPI()
+
+ def resample_audio(input_path, output_path, target_sample_rate):
+     # Use ffmpeg to resample the audio file to the target sample rate
+     ffmpeg_cmd = [
+         "ffmpeg",
+         "-i", input_path,
+         "-ar", str(target_sample_rate),
+         output_path
+     ]
+     subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ @app.get("/", response_class=HTMLResponse)
+ async def read_root():
+     # Serve the HTML file containing the front-end code
+     with open("soundscripter.html", "r") as file:
+         html_content = file.read()
+     return html_content
+
+ def get_sampling_rate(audio_file_path):
+     audio = AudioSegment.from_file(audio_file_path)
+     return audio.frame_rate
+
+ @app.post("/process_audio")
+ async def process_audio(audio: UploadFile = File(...), language: str = Form(...)):
+     if not audio or not language:
+         return JSONResponse(content={"success": False}, status_code=400)
+
+     # Check if the uploaded file is in WAV format
+     if audio.content_type != "audio/wav":
+         return JSONResponse(content={"success": False, "message": "Audio must be in WAV format."}, status_code=400)
+
+     try:
+         # Save the received audio to a temporary file
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+             temp_file_path = temp_file.name
+             shutil.copyfileobj(audio.file, temp_file)
+
+         # Print the file path for debugging
+         print(temp_file_path)
+         output_path = tempfile.mktemp(suffix=".wav")
+
+         # Resample the audio to 16000 Hz
+         resample_audio(temp_file_path, output_path, target_sample_rate=16000)
+         print(output_path)
+
+         # Get the sampling rate of the resampled audio
+         sampling_rate = get_sampling_rate(output_path)
+
+         # Verify that resampling actually produced 16 kHz audio
+         if sampling_rate != 16000:
+             return JSONResponse(content={"success": False, "message": "Sample rate is not 16000 Hz."}, status_code=500)
+
+     except Exception as e:
+         print("Error processing audio:", e)
+         return JSONResponse(content={"success": False, "message": "Error processing audio."}, status_code=500)
+     # finally:
+     #     # Cleanup: remove the temporary received audio file
+     #     if os.path.exists(temp_file_path):
+     #         os.remove(temp_file_path)
+     return JSONResponse(content={"success": True, "language": calling_asr(output_path, "hi-IN")})
+
+ def calling_asr(wav_file, lid):
+     # Transcribe a WAV file with the Google Web Speech API
+     AUDIO_FILE = wav_file
+     text = "can't read wav file"
+     try:
+         with sr.AudioFile(AUDIO_FILE) as source:
+             audio = r.record(source)
+         text = r.recognize_google(audio, language=lid)
+         return text
+     except Exception:
+         # Error in segment; return the fallback text
+         return text
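The /process_audio endpoint expects two multipart fields, the WAV file under audio and a language code under language, and replies with {"success": true, "language": "<transcription>"}; note that the transcription call is currently hard-coded to hi-IN regardless of the submitted language. A client sketch under the same assumptions as the earlier example (requests installed, placeholder file path), additionally assuming this app is served with python -m uvicorn ss_flaskAPI:app:

# process_audio_client.py — example client for the /process_audio endpoint
import requests  # assumption: installed separately

with open("sample.wav", "rb") as f:  # placeholder path; must be a WAV file
    resp = requests.post(
        "http://127.0.0.1:8000/process_audio",
        # the content type must be audio/wav or the endpoint rejects the upload
        files={"audio": ("sample.wav", f, "audio/wav")},
        data={"language": "hi-IN"},
    )

print(resp.json())  # e.g. {"success": True, "language": "<recognized text>"}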
wave.png ADDED