benjolo committed
Commit 9680844
1 Parent(s): f9913e7

adding updates to monolingual transcript functionality

Files changed (45)
  1. backend/.DS_Store +0 -0
  2. backend/.env +2 -0
  3. backend/.gitignore +2 -0
  4. backend/Client.py +81 -0
  5. backend/__pycache__/Client.cpython-310.pyc +0 -0
  6. backend/__pycache__/main.cpython-310.pyc +0 -0
  7. backend/logging.yaml +22 -0
  8. backend/main.py +343 -0
  9. backend/models/Seamless/vad_s2st_sc_24khz_main.yaml +25 -0
  10. backend/models/SeamlessStreaming/vad_s2st_sc_main.yaml +21 -0
  11. backend/mongodb/endpoints/__pycache__/calls.cpython-310.pyc +0 -0
  12. backend/mongodb/endpoints/__pycache__/users.cpython-310.pyc +0 -0
  13. backend/mongodb/endpoints/calls.py +96 -0
  14. backend/mongodb/endpoints/users.py +53 -0
  15. backend/mongodb/models/__pycache__/calls.cpython-310.pyc +0 -0
  16. backend/mongodb/models/__pycache__/users.cpython-310.pyc +0 -0
  17. backend/mongodb/models/calls.py +75 -0
  18. backend/mongodb/models/users.py +44 -0
  19. backend/mongodb/operations/__pycache__/calls.cpython-310.pyc +0 -0
  20. backend/mongodb/operations/__pycache__/users.cpython-310.pyc +0 -0
  21. backend/mongodb/operations/calls.py +274 -0
  22. backend/mongodb/operations/users.py +77 -0
  23. backend/pcmToWav.py +34 -0
  24. backend/preprocess_wav.py +65 -0
  25. backend/requirements.txt +28 -0
  26. backend/routes/__init__.py +1 -0
  27. backend/routes/__pycache__/__init__.cpython-310.pyc +0 -0
  28. backend/routes/__pycache__/routing.cpython-310.pyc +0 -0
  29. backend/routes/routing.py +9 -0
  30. backend/seamless/__init__.py +0 -0
  31. backend/seamless/room.py +64 -0
  32. backend/seamless/simuleval_agent_directory.py +171 -0
  33. backend/seamless/simuleval_transcoder.py +428 -0
  34. backend/seamless/speech_and_text_output.py +15 -0
  35. backend/seamless/transcoder_helpers.py +43 -0
  36. backend/seamless_utils.py +210 -0
  37. backend/tests/__pycache__/test_client.cpython-310-pytest-8.1.1.pyc +0 -0
  38. backend/tests/__pycache__/test_main.cpython-310-pytest-8.1.1.pyc +0 -0
  39. backend/tests/__pycache__/test_main.cpython-310.pyc +0 -0
  40. backend/tests/silence.wav +0 -0
  41. backend/tests/speaking.wav +0 -0
  42. backend/tests/test_client.py +59 -0
  43. backend/tests/test_main.py +90 -0
  44. backend/utils/__pycache__/text_rank.cpython-310.pyc +0 -0
  45. backend/utils/text_rank.py +60 -0
backend/.DS_Store ADDED
Binary file (6.15 kB)
 
backend/.env ADDED
@@ -0,0 +1,2 @@
+ MONGODB_URI=mongodb+srv://benjolo:26qtppddzz2jx9@it-cluster1.4cwyb2f.mongodb.net/?retryWrites=true&w=majority&appName=IT-Cluster1
+ OPENAI_API_KEY=sk-proj-vc4w7s6gkfwFG8xLBunZT3BlbkFJ8h9zOoyS0OY756vMgBcc
backend/.gitignore ADDED
@@ -0,0 +1,2 @@
+ myenv
+ .pytest_cache
backend/Client.py ADDED
@@ -0,0 +1,81 @@
+ from typing import Iterator, Tuple
+ import wave
+ import os
+
+ import torchaudio
+ from vad import EnergyVAD
+ TARGET_SAMPLING_RATE = 16000
+
+ def create_frames(data: bytes, frame_duration: int) -> Tuple[Iterator[bytes], int]:
+     frame_size = int(TARGET_SAMPLING_RATE * (frame_duration / 1000))
+     return (data[i:i + frame_size] for i in range(0, len(data), frame_size)), frame_size
+
+ def detect_activity(energies: list):
+     if sum(energies) < len(energies) / 12:
+         return False
+     count = 0
+     for energy in energies:
+         if energy == 1:
+             count += 1
+             if count == 12:
+                 return True
+         else:
+             count = 0
+     return False
+
+ class Client:
+     def __init__(self, sid, client_id, username, call_id=None, original_sr=None):
+         self.sid = sid
+         self.client_id = client_id
+         self.username = username
+         self.call_id = call_id
+         self.buffer = bytearray()
+         self.output_path = self.sid + "_output_audio.wav"
+         self.target_language = None
+         self.original_sr = original_sr
+         self.vad = EnergyVAD(
+             sample_rate=TARGET_SAMPLING_RATE,
+             frame_length=25,
+             frame_shift=20,
+             energy_threshold=0.05,
+             pre_emphasis=0.95,
+         )  # PM - Default values given in the docs for this class
+
+     def add_bytes(self, new_bytes):
+         self.buffer += new_bytes
+
+     def resample_and_clear(self):
+         print(f"📥 [ClientAudioBuffer] Writing {len(self.buffer)} bytes to {self.output_path}")
+         with wave.open(self.sid + "_OG.wav", "wb") as wf:
+             wf.setnchannels(1)
+             wf.setsampwidth(2)
+             wf.setframerate(self.original_sr)
+             wf.setnframes(0)
+             wf.setcomptype("NONE", "not compressed")
+             wf.writeframes(self.buffer)
+         waveform, sample_rate = torchaudio.load(self.sid + "_OG.wav")
+         resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLING_RATE, dtype=waveform.dtype)
+         resampled_waveform = resampler(waveform)
+         self.buffer = bytearray()
+         return resampled_waveform
+
+     def vad_analyse(self, resampled_waveform):
+         torchaudio.save(self.output_path, resampled_waveform, TARGET_SAMPLING_RATE)
+         vad_array = self.vad(resampled_waveform)
+         print(f"VAD OUTPUT: {vad_array}")
+         return detect_activity(vad_array)
+
+     def write_to_file(self, resampled_waveform):
+         torchaudio.save(self.output_path, resampled_waveform, TARGET_SAMPLING_RATE)
+
+     def get_length(self):
+         return len(self.buffer)
+
+     def __del__(self):
+         if len(self.buffer) > 0:
+             print(f"🚨 [ClientAudioBuffer] Buffer not empty for {self.sid} ({len(self.buffer)} bytes)!")
+         if os.path.exists(self.output_path):
+             os.remove(self.output_path)
+         if os.path.exists(self.sid + "_OG.wav"):
+             os.remove(self.sid + "_OG.wav")
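As a sanity check on the buffering and VAD gating above, here is a minimal, hypothetical smoke test; the sid, user values, and the silent PCM payload are invented for illustration, and it assumes the `vad` and `torchaudio` packages from requirements.txt are installed and that the script runs from the backend directory.

```python
# Hypothetical smoke test for Client.py (illustrative values only).
from Client import Client, detect_activity

# detect_activity fires only on a run of 12 consecutive voiced frames,
# and bails out early when fewer than 1/12 of all frames are voiced.
assert detect_activity([1] * 12 + [0] * 4) is True
assert detect_activity([1, 0] * 8) is False    # never 12 voiced frames in a row

client = Client(sid="abc123", client_id="user-1", username="demo", original_sr=44100)
client.add_bytes(b"\x00\x00" * 44100)          # one second of silent 16-bit PCM
print(client.get_length())                     # 88200 bytes buffered
resampled = client.resample_and_clear()        # writes abc123_OG.wav, resamples to 16 kHz
print(client.vad_analyse(resampled))           # False: silence carries no energy
```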
backend/__pycache__/Client.cpython-310.pyc ADDED
Binary file (3.41 kB)
 
backend/__pycache__/main.cpython-310.pyc ADDED
Binary file (7.1 kB)
 
backend/logging.yaml ADDED
@@ -0,0 +1,22 @@
+ version: 1
+ disable_existing_loggers: false
+
+ formatters:
+   standard:
+     format: "%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s"
+
+ handlers:
+   console:
+     class: logging.StreamHandler
+     formatter: standard
+     stream: ext://sys.stdout
+
+ loggers:
+   uvicorn.error:
+     propagate: true
+
+ root:
+   level: INFO
+   handlers: [console]
+   propagate: no
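logging.yaml follows the stdlib dictConfig schema, so it can be applied at startup with something like the sketch below; PyYAML is assumed to be available (it is not pinned in requirements.txt). Uvicorn should also accept the same file directly via `uvicorn main:app --log-config logging.yaml`.

```python
# Minimal sketch of applying logging.yaml via the stdlib (assumes PyYAML).
import logging
import logging.config
import yaml

with open("logging.yaml") as fh:
    logging.config.dictConfig(yaml.safe_load(fh))

logging.getLogger(__name__).info("logging configured from logging.yaml")
```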
backend/main.py ADDED
@@ -0,0 +1,343 @@
+ from operator import itemgetter
+ import os
+ from datetime import datetime
+ import uvicorn
+ from typing import Any, Optional, Tuple, Dict, TypedDict
+ from urllib import parse
+ from uuid import uuid4
+ import logging
+ from fastapi.logger import logger as fastapi_logger
+ import sys
+
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi import APIRouter, Body, Request, status
+ from pymongo import MongoClient
+ from dotenv import dotenv_values
+ from routes import router as api_router
+ from contextlib import asynccontextmanager
+ import requests
+
+ from typing import List
+ from datetime import date
+ from mongodb.operations.calls import *
+ from mongodb.operations.users import *
+ from mongodb.models.calls import UserCall, UpdateCall
+ # from mongodb.endpoints.calls import *
+
+ from transformers import AutoProcessor, SeamlessM4Tv2Model
+
+ # from seamless_communication.inference import Translator
+ from Client import Client
+ import numpy as np
+ import torch
+ import socketio
+
+ # Configure logger
+ gunicorn_error_logger = logging.getLogger("gunicorn.error")
+ gunicorn_logger = logging.getLogger("gunicorn")
+ uvicorn_access_logger = logging.getLogger("uvicorn.access")
+
+ gunicorn_error_logger.propagate = True
+ gunicorn_logger.propagate = True
+ uvicorn_access_logger.propagate = True
+
+ uvicorn_access_logger.handlers = gunicorn_error_logger.handlers
+ fastapi_logger.handlers = gunicorn_error_logger.handlers
+
+ # sio is the main socket.io entrypoint
+ sio = socketio.AsyncServer(
+     async_mode="asgi",
+     cors_allowed_origins="*",
+     logger=gunicorn_logger,
+     engineio_logger=gunicorn_logger,
+ )
+ # sio.logger.setLevel(logging.DEBUG)
+ socketio_app = socketio.ASGIApp(sio)
+ # app.mount("/", socketio_app)
+
+ config = dotenv_values(".env")
+
+ # Read connection string from environment vars
+ # uri = os.environ['MONGODB_URI']
+
+ # Read connection string from .env file
+ uri = config['MONGODB_URI']
+
+
+ # MongoDB Connection Lifespan Events
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # startup logic
+     app.mongodb_client = MongoClient(uri)
+     app.database = app.mongodb_client['IT-Cluster1']  # connect to InterpreTalk primary db
+     try:
+         app.mongodb_client.admin.command('ping')
+         print("MongoDB Connection Established...")
+     except Exception as e:
+         print(e)
+
+     yield
+
+     # shutdown logic
+     print("Closing MongoDB Connection...")
+     app.mongodb_client.close()
+
+ app = FastAPI(lifespan=lifespan, logger=gunicorn_logger)
+
+ # New CORS functionality
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # configured node app port
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ app.include_router(api_router)  # include routers for user, calls and transcripts operations
+
+ DEBUG = True
+
+ ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME = "remove_server_lock"
+
+ TARGET_SAMPLING_RATE = 16000
+ MAX_BYTES_BUFFER = 960_000
+
+ print("")
+ print("")
+ print("=" * 18 + " Interpretalk is starting... " + "=" * 18)
+
+ ###############################################
+ # Configure socketio server
+ ###############################################
+
+ # TODO PM - change this to the actual path
+ # seamless remnant code
+ CLIENT_BUILD_PATH = "../streaming-react-app/dist/"
+ static_files = {
+     "/": CLIENT_BUILD_PATH,
+     "/assets/seamless-db6a2555.svg": {
+         "filename": CLIENT_BUILD_PATH + "assets/seamless-db6a2555.svg",
+         "content_type": "image/svg+xml",
+     },
+ }
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+
+ # PM - hardcoding temporarily as my GPU doesn't have enough VRAM
+ model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to(device)
+
+
+ bytes_data = bytearray()
+ model_name = "seamlessM4T_v2_large"
+ vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"
+
+ clients = {}
+ rooms = {}
+
+
+ def get_collection_users():
+     return app.database["user_records"]
+
+ def get_collection_calls():
+     return app.database["call_records"]
+
+
+ @app.get("/home/", response_description="Welcome User")
+ def test():
+     return {"message": "Welcome to InterpreTalk!"}
+
+
+ async def send_translated_text(client_id, original_text, translated_text, room_id):
+     print('SEND_TRANSLATED_TEXT IS WORKING IN FASTAPI BACKEND...')
+     print(rooms)  # Debugging
+     print(clients)  # Debugging
+
+     data = {
+         "author_id": str(client_id),
+         "original_text": str(original_text),
+         "translated_text": str(translated_text),
+         "timestamp": str(datetime.now())
+     }
+     gunicorn_logger.info("SENDING TRANSLATED TEXT TO CLIENT")
+     await sio.emit("translated_text", data, room=room_id)
+     gunicorn_logger.info("SUCCESSFULLY SENT TRANSLATED TEXT TO FRONTEND")
+
+
+ @sio.on("connect")
+ async def connect(sid, environ):
+     print(f"📥 [event: connected] sid={sid}")
+     query_params = dict(parse.parse_qsl(environ["QUERY_STRING"]))
+
+     client_id = query_params.get("client_id")
+     gunicorn_logger.info(f"📥 [event: connected] sid={sid}, client_id={client_id}")
+
+     # get username for the Client object from the DB
+     username = find_name_from_id(get_collection_users(), client_id)
+
+     # sid = socket id; client_id = client-specific ID, always the same for the same user
+     clients[sid] = Client(sid, client_id, username)
+     print(clients[sid].username)
+     gunicorn_logger.warning(f"Client connected: {sid}")
+     gunicorn_logger.warning(clients)
+
+
+ @sio.on("disconnect")
+ async def disconnect(sid):
+     gunicorn_logger.debug(f"📤 [event: disconnected] sid={sid}")
+
+     call_id = clients[sid].call_id
+     user_id = clients[sid].client_id
+     target_language = clients[sid].target_language
+
+     clients.pop(sid, None)
+
+     # Perform key term extraction and summarisation
+     try:
+         # Get combined caption field for call record based on call_id
+         key_terms = term_extraction(get_collection_calls(), call_id, user_id, target_language)
+
+         # Perform summarisation based on target language
+         summary_result = summarise(get_collection_calls(), call_id, user_id, target_language)
+
+     except:
+         gunicorn_logger.error(f"📤 [event: term_extraction/summarisation request error] sid={sid}, call={call_id}")
+
+
+ @sio.on("target_language")
+ async def target_language(sid, target_lang):
+     gunicorn_logger.info(f"📥 [event: target_language] sid={sid}, target_lang={target_lang}")
+     clients[sid].target_language = target_lang
+
+
+ @sio.on("call_user")
+ async def call_user(sid, call_id):
+     clients[sid].call_id = call_id
+     gunicorn_logger.info(f"CALL {sid}: entering room {call_id}")
+     rooms[call_id] = rooms.get(call_id, [])
+     if sid not in rooms[call_id] and len(rooms[call_id]) < 2:
+         rooms[call_id].append(sid)
+         sio.enter_room(sid, call_id)
+     else:
+         gunicorn_logger.info(f"CALL {sid}: room {call_id} is full")
+         # await sio.emit("room_full", room=call_id, to=sid)
+
+     # BO - Get call id from dictionary created during socketio connection
+     client_id = clients[sid].client_id
+
+     gunicorn_logger.warning(f"NOW TRYING TO CREATE DB RECORD FOR Caller with ID: {client_id} for call: {call_id}")
+     # BO -> Create call record with caller and call_id fields (None for callee, duration, terms...)
+     request_data = {
+         "call_id": str(call_id),
+         "caller_id": str(client_id),
+         "creation_date": str(datetime.now())
+     }
+
+     response = create_calls(get_collection_calls(), request_data)
+     print(response)  # BO - print created db call record
+
+
+ @sio.on("audio_config")
+ async def audio_config(sid, sample_rate):
+     clients[sid].original_sr = sample_rate
+
+
+ @sio.on("answer_call")
+ async def answer_call(sid, call_id):
+
+     clients[sid].call_id = call_id
+     gunicorn_logger.info(f"ANSWER {sid}: entering room {call_id}")
+     rooms[call_id] = rooms.get(call_id, [])
+     if sid not in rooms[call_id] and len(rooms[call_id]) < 2:
+         rooms[call_id].append(sid)
+         sio.enter_room(sid, call_id)
+     else:
+         gunicorn_logger.info(f"ANSWER {sid}: room {call_id} is full")
+         # await sio.emit("room_full", room=call_id, to=sid)
+
+     # BO - Get call id from dictionary created during socketio connection
+     client_id = clients[sid].client_id
+
+     # BO -> Update call record with callee field based on call_id
+     gunicorn_logger.warning(f"NOW UPDATING MongoDB RECORD FOR Caller with ID: {client_id} for call: {call_id}")
+     request_data = {
+         "callee_id": client_id
+     }
+
+     response = update_calls(get_collection_calls(), call_id, request_data)
+     print(response)  # BO - print updated db call record
+
+
+ @sio.on("incoming_audio")
+ async def incoming_audio(sid, data, call_id):
+     try:
+         clients[sid].add_bytes(data)
+
+         if clients[sid].get_length() >= MAX_BYTES_BUFFER:
+             gunicorn_logger.info('Buffer full, now outputting...')
+             output_path = clients[sid].output_path
+             resampled_audio = clients[sid].resample_and_clear()
+             vad_result = clients[sid].vad_analyse(resampled_audio)
+             # source lang is the speaker's target language 😃
+             src_lang = clients[sid].target_language
+
+             if vad_result:
+                 gunicorn_logger.info('Speech detected, now processing audio.....')
+                 tgt_sid = next(id for id in rooms[call_id] if id != sid)
+                 tgt_lang = clients[tgt_sid].target_language
+                 # following example from https://github.com/facebookresearch/seamless_communication/blob/main/docs/m4t/README.md#transformers-usage
+                 output_tokens = processor(audios=resampled_audio, src_lang=src_lang, return_tensors="pt", sampling_rate=TARGET_SAMPLING_RATE).to(device)
+                 model_output = model.generate(**output_tokens, tgt_lang=src_lang, generate_speech=False)[0].tolist()[0]
+                 asr_text = processor.decode(model_output, skip_special_tokens=True)
+                 print(f"ASR TEXT = {asr_text}")
+                 # ASR TEXT => ORIGINAL TEXT
+
+                 if src_lang != tgt_lang:
+                     t2t_tokens = processor(text=asr_text, src_lang=src_lang, tgt_lang=tgt_lang, return_tensors="pt").to(device)
+                     translated_data = model.generate(**t2t_tokens, tgt_lang=tgt_lang, generate_speech=False)[0].tolist()[0]
+                     translated_text = processor.decode(translated_data, skip_special_tokens=True)
+                     print(f"TRANSLATED TEXT = {translated_text}")
+                 else:
+                     # PM - both users have the same language selected, no need to translate
+                     translated_text = asr_text
+
+                 # PM - text output is a list with 1 string
+                 await send_translated_text(clients[sid].client_id, asr_text, translated_text, call_id)
+
+                 # BO -> send translated_text to MongoDB as caption record update based on call_id
+                 await send_captions(clients[sid].client_id, clients[sid].username, asr_text, translated_text, call_id)
+
+     except Exception as e:
+         gunicorn_logger.error(f"Error in incoming_audio: {e}", exc_info=True)
+
+
+ async def send_captions(client_id, username, original_text, translated_text, call_id):
+     # BO -> Update the call record's captions field based on call_id
+     print(f"Now updating caption field in call record for caller with ID: {client_id} for call: {call_id}")
+
+     data = {
+         "author_id": str(client_id),
+         "author_username": str(username),
+         "original_text": str(original_text),
+         "translated_text": str(translated_text),
+         "timestamp": str(datetime.now())
+     }
+
+     response = update_captions(get_collection_calls(), get_collection_users(), call_id, data)
+     return response
+
+
+ app.mount("/", socketio_app)
+
+
+ if __name__ == '__main__':
+     uvicorn.run("main:app", host='0.0.0.0', port=7860, log_level="info")
+
+
+ # Running in Docker container
+ if __name__ != "__main__":
+     fastapi_logger.setLevel(gunicorn_logger.level)
+ else:
+     fastapi_logger.setLevel(logging.DEBUG)
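To make the socket event flow concrete, here is a hedged sketch of a Python client driving the server above. The host/port, IDs, and the silent PCM chunk are invented; it assumes a user record with client_id "user-1" already exists, since the connect handler looks the username up in MongoDB, and note that the server only runs ASR/translation once roughly 960 kB (MAX_BYTES_BUFFER) of audio has accumulated.

```python
# Hypothetical client for the connect -> audio_config -> call_user ->
# incoming_audio flow (python-socketio AsyncClient; values are illustrative).
import asyncio
import socketio

async def main():
    sio = socketio.AsyncClient()

    @sio.on("translated_text")
    async def on_translated_text(data):
        print(data["original_text"], "->", data["translated_text"])

    # client_id is read server-side from the connection query string
    await sio.connect("http://localhost:7860?client_id=user-1")
    await sio.emit("target_language", "eng")
    await sio.emit("audio_config", 44100)      # original sample rate of this client
    await sio.emit("call_user", "call-42")     # creates the call record and joins the room

    pcm_chunk = b"\x00\x00" * 8000             # 0.5 s of silent 16-bit PCM
    # a tuple emits multiple positional args: incoming_audio(sid, data, call_id)
    await sio.emit("incoming_audio", (pcm_chunk, "call-42"))

    await asyncio.sleep(2)
    await sio.disconnect()

asyncio.run(main())
```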
backend/models/Seamless/vad_s2st_sc_24khz_main.yaml ADDED
@@ -0,0 +1,25 @@
+ agent_class: seamless_communication.streaming.agents.seamless_s2st.SeamlessS2STDualVocoderVADAgent
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+ unity_model_name: seamless_streaming_unity
+ sentencepiece_model: spm_256k_nllb100.model
+
+ task: s2st
+ tgt_lang: "eng"
+ min_unit_chunk_size: 50
+ decision_threshold: 0.7
+ no_early_stop: True
+ block_ngrams: True
+ vocoder_name: vocoder_v2
+ expr_vocoder_name: vocoder_pretssel
+ gated_model_dir: .
+ expr_vocoder_gain: 3.0
+ upstream_idx: 1
+ wav2vec_yaml: wav2vec.yaml
+ min_starting_wait_w2vbert: 192
+
+ config_yaml: cfg_fbank_u2t.yaml
+ upstream_idx: 1
+ detokenize_only: True
+ device: cuda:0
+ max_len_a: 0
+ max_len_b: 1000
backend/models/SeamlessStreaming/vad_s2st_sc_main.yaml ADDED
@@ -0,0 +1,21 @@
+ agent_class: seamless_communication.streaming.agents.seamless_streaming_s2st.SeamlessStreamingS2STJointVADAgent
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+ unity_model_name: seamless_streaming_unity
+ sentencepiece_model: spm_256k_nllb100.model
+
+ task: s2st
+ tgt_lang: "eng"
+ min_unit_chunk_size: 50
+ decision_threshold: 0.7
+ no_early_stop: True
+ block_ngrams: True
+ vocoder_name: vocoder_v2
+ wav2vec_yaml: wav2vec.yaml
+ min_starting_wait_w2vbert: 192
+
+ config_yaml: cfg_fbank_u2t.yaml
+ upstream_idx: 1
+ detokenize_only: True
+ device: cuda:0
+ max_len_a: 0
+ max_len_b: 1000
backend/mongodb/endpoints/__pycache__/calls.cpython-310.pyc ADDED
Binary file (4.71 kB)
 
backend/mongodb/endpoints/__pycache__/users.cpython-310.pyc ADDED
Binary file (2.43 kB)
 
backend/mongodb/endpoints/calls.py ADDED
@@ -0,0 +1,96 @@
+ from fastapi import APIRouter, Body, Request, status, HTTPException
+ from typing import List
+ from datetime import date
+
+ import sys
+
+ from ..operations import calls as calls
+ from ..models.calls import UserCaptions, UserCall, UpdateCall
+ from ..endpoints.users import get_collection_users
+
+ router = APIRouter(prefix="/call",
+                    tags=["Calls"])
+
+ def get_collection_calls(request: Request):
+     try:
+         return request.app.database["call_records"]
+         # return request.app.database["call_test"]
+     except:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Unable to find call records database.")
+
+ @router.post("/create-call", response_description="Create a new user call record", status_code=status.HTTP_201_CREATED, response_model=UserCall)
+ async def create_calls(request: Request, user_calls: UserCall = Body(...)):
+     collection = get_collection_calls(request)
+     return calls.create_calls(collection, user_calls)
+
+ @router.get("/list-call", response_description="List all existing call records", response_model=List[UserCall])
+ async def list_calls(request: Request, limit: int):
+     collection = get_collection_calls(request)
+     return calls.list_calls(collection, limit)
+
+ @router.get("/find-call/{call_id}", response_description="Find a call record based on Call ID", response_model=UserCall)
+ async def find_call(request: Request, call_id: str):
+     collection = get_collection_calls(request)
+     return calls.find_call(collection, call_id)
+
+ @router.get("/find-user-calls/{user_id}", response_description="Find user's calls based on User ID", response_model=List[UserCall])
+ async def find_user_calls(request: Request, user_id: str):
+     collection = get_collection_calls(request)
+     return calls.find_user_calls(collection, user_id)
+
+ @router.get("/get-captions/{user_id}", response_description="Get combined caption text for a call from a user's perspective")
+ async def get_caption_text(request: Request, call_id: str, user_id: str):
+     collection = get_collection_calls(request)
+     return calls.get_caption_text(collection, call_id, user_id)
+
+ '''Key terms list can have variable length -> using POST request over GET'''
+ @router.post("/find-term/", response_description="Find calls based on key term list", response_model=List[UserCall])
+ async def list_transcripts_by_key_terms(request: Request, key_terms: List[str]):
+     collection = get_collection_calls(request)
+     return calls.list_transcripts_by_key_terms(collection, key_terms)
+
+ @router.get("/find-date/{start_date}/{end_date}", response_description="Find calls based on date ranges", response_model=List[UserCall])
+ async def list_transcripts_by_dates(request: Request, start_date: str, end_date: str):
+     collection = get_collection_calls(request)
+     return calls.list_transcripts_by_dates(collection, start_date, end_date)
+
+ @router.get("/find-duration/{min_len}/{max_len}", response_description="Find calls based on call duration in milliseconds", response_model=List[UserCall])
+ async def list_transcripts_by_duration(request: Request, min_len: int, max_len: int):
+     collection = get_collection_calls(request)
+     return calls.list_transcripts_by_duration(collection, min_len, max_len)
+
+ @router.put("/update-call/{call_id}", response_description="Update an existing call", response_model=UpdateCall)
+ async def update_calls(request: Request, call_id: str, user_calls: UpdateCall = Body(...)):
+     collection = get_collection_calls(request)
+     return calls.update_calls(collection, call_id, user_calls)
+
+ @router.put("/update-captions/{call_id}", response_description="Append a caption entry to an existing call", response_model=UpdateCall)
+ async def update_captions(request: Request, call_id: str, user_calls: UserCaptions = Body(...)):
+     call_collection = get_collection_calls(request)
+     user_collection = get_collection_users(request)
+     return calls.update_captions(call_collection, user_collection, call_id, user_calls)
+
+ @router.delete("/delete-call/{call_id}", response_description="Delete a call by its id")
+ async def delete_call(request: Request, call_id: str):
+     collection = get_collection_calls(request)
+     return calls.delete_calls(collection, call_id)
+
+ @router.get("/full-text-search/{query}", response_description="Perform full text search on caption fields", response_model=List[UserCall])
+ async def full_text_search(request: Request, query: str):
+     collection = get_collection_calls(request)
+     return calls.full_text_search(collection, query)
+
+ @router.get("/fuzzy-search/{query}", response_description="Perform fuzzy text search on caption fields", response_model=List[UserCall])
+ async def fuzzy_search(request: Request, query: str):
+     collection = get_collection_calls(request)
+     return calls.fuzzy_search(collection, query)
+
+ @router.get("/summarise/{call_id}/{user_id}/{target_language}", response_description="Perform gpt-3.5 summarisation on call_id")
+ async def summarise(request: Request, call_id: str, user_id: str, target_language: str):
+     collection = get_collection_calls(request)
+     return calls.summarise(collection, call_id, user_id, target_language)
+
+ @router.get("/term-extraction/{call_id}/{user_id}/{target_language}", response_description="Perform key term extraction on call record")
+ async def term_extraction(request: Request, call_id: str, user_id: str, target_language: str):
+     collection = get_collection_calls(request)
+     return calls.term_extraction(collection, call_id, user_id, target_language)
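A hedged sketch of exercising a few of these routes over HTTP; the base URL and IDs are assumptions, and the created record is fabricated test data.

```python
# Hypothetical requests against the /call routes (host and IDs are made up).
import requests

BASE = "http://localhost:7860/call"

# Create a call record, then fetch it back by its call_id.
created = requests.post(f"{BASE}/create-call",
                        json={"call_id": "call-42", "caller_id": "user-1"}).json()

found = requests.get(f"{BASE}/find-call/call-42").json()
print(found["caller_id"])                     # "user-1"

# Date-range lookup takes plain YYYY-MM-DD path segments; the operations layer
# widens them to YYYY-MM-DDT00:00:00 before querying.
in_march = requests.get(f"{BASE}/find-date/2024-03-01/2024-03-31").json()
print(len(in_march), "calls in March")
```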
backend/mongodb/endpoints/users.py ADDED
@@ -0,0 +1,53 @@
+ from fastapi import APIRouter, Body, Request, status, HTTPException
+ from typing import List
+ import sys
+ from ..models.users import User, UpdateUser
+ from ..operations import users as users
+
+ router = APIRouter(prefix="/user",
+                    tags=["User"])
+
+ def get_collection_users(request: Request):
+     db = request.app.database["user_records"]
+     return db
+
+ @router.post("/", response_description="Create a new user", status_code=status.HTTP_201_CREATED, response_model=User)
+ async def create_user(request: Request, user: User = Body(...)):
+     collection = get_collection_users(request)
+     return users.create_user(collection, user)
+
+ @router.get("/", response_description="List users", response_model=List[User])
+ async def list_users(request: Request):
+     collection = get_collection_users(request)
+     return users.list_users(collection, 100)
+
+ @router.put("/{user_id}", response_description="Update a User", response_model=UpdateUser)
+ async def update_user(request: Request, user_id: str, user: UpdateUser = Body(...)):
+     collection = get_collection_users(request)
+     return users.update_user(collection, user_id, user)
+
+ @router.get("/{user_id}", response_description="Get a single user by id", response_model=User)
+ async def find_user(request: Request, user_id: str):
+     collection = get_collection_users(request)
+     return users.find_user(collection, user_id)
+
+ @router.get("/find-name-id/{user_id}", response_description="Get a username from user id")
+ async def find_name_from_id(request: Request, user_id: str):
+     collection = get_collection_users(request)
+     return users.find_name_from_id(collection, user_id)
+
+ @router.get("/name/{name}", response_description="Get a single user by name", response_model=User)
+ async def find_user_name(request: Request, name: str):
+     collection = get_collection_users(request)
+     return users.find_user_name(collection, name)
+
+ @router.get("/email/{email}", response_description="Get a single user by email", response_model=User)
+ async def find_user_email(request: Request, email: str):
+     collection = get_collection_users(request)
+     return users.find_user_email(collection, email)
+
+ @router.delete("/{user_id}", response_description="Delete a user")
+ async def delete_user(request: Request, user_id: str):
+     collection = get_collection_users(request)
+     return users.delete_user(collection, user_id)
backend/mongodb/models/__pycache__/calls.cpython-310.pyc ADDED
Binary file (3.09 kB)
 
backend/mongodb/models/__pycache__/users.cpython-310.pyc ADDED
Binary file (1.73 kB)
 
backend/mongodb/models/calls.py ADDED
@@ -0,0 +1,75 @@
+ import uuid
+ from typing import List, Dict, Optional
+ from datetime import datetime
+ from pydantic import BaseModel, Field, PrivateAttr
+ import sys
+
+
+ '''Class for storing captions generated by SeamlessM4T'''
+ class UserCaptions(BaseModel):
+     _id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4)  # private attr not included in http calls
+     author_id: Optional[str] = None
+     author_username: Optional[str] = None
+     original_text: str
+     translated_text: str
+     timestamp: datetime = Field(default_factory=datetime.now)
+
+     class Config:
+         populate_by_name = True
+         json_schema_extra = {
+             "example": {
+                 "author_id": "gLZrfTwXyLUPB3eT7xT2HZnZiZT2",
+                 "author_username": "shamzino",
+                 "original_text": "eng: This is original_text english text",
+                 "translated_text": "spa: este es el texto traducido al español",
+                 "timestamp": "2024-03-28T16:15:50.956055",
+             }
+         }
+
+
+ '''Class for storing past call records from users'''
+ class UserCall(BaseModel):
+     _id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4)
+     call_id: Optional[str] = None
+     caller_id: Optional[str] = None
+     callee_id: Optional[str] = None
+     creation_date: datetime = Field(default_factory=datetime.now, alias="date")
+     duration: Optional[int] = None  # milliseconds
+     captions: Optional[List[UserCaptions]] = None
+     key_terms: Optional[dict] = None
+     summaries: Optional[dict] = None
+
+     class Config:
+         populate_by_name = True
+         json_schema_extra = {
+             "example": {
+                 "call_id": "65eef930e9abd3b1e3506906",
+                 "caller_id": "65ede65b6d246e52aaba9d4f",
+                 "callee_id": "65edda944340ac84c1f00758",
+                 "duration": 360,
+                 "captions": [{"author_id": "gLZrfTwXyLUPB3eT7xT2HZnZiZT2", "author_username": "shamzino", "original_text": "eng: This is original_text english text", "translated_text": "spa: este es el texto traducido al español", "timestamp": "2024-03-28T16:15:50.956055"},
+                              {"author_id": "g7pR1qCibzQf5mDP9dGtcoWeEc92", "author_username": "benjino", "original_text": "eng: This is source english text", "translated_text": "spa: este es el texto fuente al español", "timestamp": "2024-03-28T16:16:20.34625"}],
+                 "key_terms": {"gLZrfTwXyLUPB3eT7xT2HZnZiZT2": ["original_text", "source", "english", "text"], "g7pR1qCibzQf5mDP9dGtcoWeEc92": ["translated_text", "destination", "spanish", "text"]},
+                 "summaries": {"gLZrfTwXyLUPB3eT7xT2HZnZiZT2": "This is a short test on language translation", "65edda944340ac84c1f00758": "Esta es una breve prueba sobre traducción de idiomas."}
+             }
+         }
+
+
+ '''Class for updating User Call record'''
+ class UpdateCall(BaseModel):
+     call_id: Optional[str] = None
+     caller_id: Optional[str] = None
+     callee_id: Optional[str] = None
+     duration: Optional[int] = None
+     captions: Optional[List[UserCaptions]] = None
+     key_terms: Optional[List[str]] = None
+
+     class Config:
+         populate_by_name = True
+         json_schema_extra = {
+             "example": {
+                 "duration": "500"
+             }
+         }
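For reference, a small sketch of how these models are built and serialised before hitting MongoDB; the field values are invented.

```python
# Illustrative construction of the caption/call models (fabricated values).
from fastapi.encoders import jsonable_encoder
from mongodb.models.calls import UserCaptions, UserCall

caption = UserCaptions(
    author_id="user-1",
    original_text="eng: hello there",
    translated_text="spa: hola",
)   # timestamp defaults to datetime.now()

call = UserCall(call_id="call-42", caller_id="user-1", captions=[caption])

# jsonable_encoder is what the operations layer applies before insert_one()
print(jsonable_encoder(call))
```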
backend/mongodb/models/users.py ADDED
@@ -0,0 +1,44 @@
+ import uuid
+ from typing import List, Optional
+ from pydantic import BaseModel, Field, SecretStr, PrivateAttr
+ from pydantic.networks import EmailStr
+
+
+ '''Class for user model used to relate users to past calls'''
+ class User(BaseModel):
+     _id: uuid.UUID = PrivateAttr(default_factory=uuid.uuid4)  # private attr not included in http calls
+     user_id: str
+     name: str
+     email: EmailStr = Field(unique=True, index=True)
+     # password: SecretStr
+     call_ids: Optional[List[str]] = None
+
+     class Config:
+         populate_by_name = True
+         json_schema_extra = {
+             "example": {
+                 "user_id": "65ede65b6d246e52aaba9d4f",
+                 "name": "benjolo",
+                 "email": "benjolounchained@gmail.com",
+                 "call_ids": ["65e205ced1be3a22854ff300", "65df8c3eba9c7c2ed1b20e85"]
+             }
+         }
+
+ '''Class for updating user records'''
+ class UpdateUser(BaseModel):
+     user_id: Optional[str] = None
+     name: Optional[str] = None
+     email: Optional[EmailStr] = None
+     ''' To decode use -> SecretStr("abc").get_secret_value()'''
+     # password: Optional[SecretStr]
+     call_ids: Optional[List[str]] = None
+
+     class Config:
+         populate_by_name = True
+         json_schema_extra = {
+             "example": {
+                 "email": "benjolounchained21@gmail.com",
+                 "call_ids": ["65e205ced1be3a22854ff300", "65df8c3eba9c7c2ed1b20e85", "65eef930e9abd3b1e3506906"]
+             }
+         }
backend/mongodb/operations/__pycache__/calls.cpython-310.pyc ADDED
Binary file (6.41 kB)
 
backend/mongodb/operations/__pycache__/users.cpython-310.pyc ADDED
Binary file (2.93 kB)
 
backend/mongodb/operations/calls.py ADDED
@@ -0,0 +1,274 @@
+ from fastapi import Body, Request, HTTPException, status
+ from fastapi.encoders import jsonable_encoder
+ import sys
+ from ..models.calls import UpdateCall, UserCall, UserCaptions
+ from ..operations.users import *
+ from utils.text_rank import extract_terms
+ from openai import OpenAI
+
+ from time import sleep
+ import os
+ from dotenv import dotenv_values
+
+ # Used within calls to create call record in main.py
+ def create_calls(collection, user: UserCall = Body(...)):
+     calls = jsonable_encoder(user)
+     new_calls = collection.insert_one(calls)
+     created_calls = collection.find_one({"_id": new_calls.inserted_id})
+
+     return created_calls
+
+ def list_calls(collection, limit: int):
+     try:
+         calls = collection.find(limit=limit)
+         return list(calls)
+     except:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No existing call records yet.")
+
+
+ '''Finding calls based on call id'''
+ def find_call(collection, call_id: str):
+     user_calls = collection.find_one({"call_id": call_id})
+     if user_calls is not None:
+         return user_calls
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with ID: '{call_id}' not found.")
+
+
+ '''Finding calls based on user id'''
+ def find_user_calls(collection, user_id: str):
+     user_calls = list(collection.find({"$or": [{"caller_id": user_id}, {"callee_id": user_id}]}))  # match on caller or callee ID
+     if len(user_calls):
+         return user_calls
+     else:
+         return []  # return empty list if no existing calls for TranscriptView frontend component
+
+
+ '''Finding calls based on key terms list'''
+ def list_transcripts_by_key_terms(collection, key_terms_list: list[str] = Body(...)):
+     key_terms_list = jsonable_encoder(key_terms_list)
+
+     call_records = list(collection.find({"key_terms": {"$in": key_terms_list}}, {'_id': 0}))  # exclude returning ObjectID in find()
+
+     # Check if any call records were returned
+     if len(call_records):
+         return call_records
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with key terms: '{key_terms_list}' not found!")
+
+
+ '''Finding calls based on date ranges'''
+ def list_transcripts_by_dates(collection, start_date: str, end_date: str):
+     # Convert strings to date string in YYYY-MM-ddT00:00:00 format
+     start_date = f'{start_date}T00:00:00'
+     end_date = f'{end_date}T00:00:00'
+
+     call_records = list(collection.find({"date": {"$gte": start_date, "$lte": end_date}}))
+
+     if len(call_records):
+         return call_records
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with creation date between: '{start_date} - {end_date}' not found!")
+
+
+ '''Finding calls based on call lengths'''
+ def list_transcripts_by_duration(collection, min_len: int, max_len: int):
+     call_records = list(collection.find({"duration": {"$gte": min_len, "$lte": max_len}}))
+
+     if len(call_records):
+         return call_records
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with duration between: '{min_len} - {max_len}' milliseconds not found!")
+
+
+ def update_calls(collection, call_id: str, calls: UpdateCall = Body(...)):
+     # calls = {k: v for k, v in calls.model_dump().items() if v is not None}  # loop over the dict
+     calls = {k: v for k, v in calls.items() if v is not None}  # loop over the dict
+     print(calls)
+
+     if len(calls) >= 1:
+         update_result = collection.update_one({"call_id": call_id}, {"$set": calls})
+
+         if update_result.modified_count == 0:
+             raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not updated!")
+
+     if (existing_item := collection.find_one({"call_id": call_id})) is not None:
+         return existing_item
+
+     raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not found!")
+
+
+ def update_captions(call_collection, user_collection, call_id: str, captions: UserCaptions = Body(...)):
+     # captions = {k: v for k, v in captions.model_dump().items() if v is not None}
+     captions = {k: v for k, v in captions.items() if v is not None}
+
+     # index user_id from caption object
+     userID = captions["author_id"]
+
+     # use user id to get user name
+     username = find_name_from_id(user_collection, userID)
+
+     # add user name to captions json/object
+     captions["author_username"] = username
+
+     if len(captions) >= 1:
+         update_result = call_collection.update_one({"call_id": call_id},
+                                                    {"$push": {"captions": captions}})
+
+         if update_result.modified_count == 0:
+             raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Captions not updated!")
+
+     if (existing_item := call_collection.find_one({"call_id": call_id})) is not None:
+         return existing_item
+
+     raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Captions not found!")
+
+
+ def delete_calls(collection, call_id: str):
+     deleted_calls = collection.delete_one({"call_id": call_id})
+
+     if deleted_calls.deleted_count == 1:
+         return "Call deleted successfully!"
+
+     raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not found!")
+
+
+ # def get_caption_text(collection, call_id):
+ #     call_record = find_call(collection, call_id)
+
+ #     try:  # Check if call has any captions first
+ #         caption_records = call_record['captions']
+ #     except KeyError:
+ #         return None
+
+ #     # iterate through caption embedded document and store original text
+ #     combined_text = [caption['original_text'] for caption in caption_records]
+
+ #     return " ".join(combined_text)
+
+ def get_caption_text(collection, call_id, user_id):
+     call_record = find_call(collection, call_id)
+
+     try:  # Check if call has any captions first
+         caption_records = call_record['captions']
+     except KeyError:
+         return None
+
+     # iterate through the embedded caption documents: keep the user's own words
+     # as original_text and the other speaker's words as translated_text
+     combined_text = []
+
+     for caption_segment in caption_records:
+         if caption_segment['author_id'] == user_id:
+             combined_text.append(caption_segment['original_text'])
+         else:
+             combined_text.append(caption_segment['translated_text'])
+
+     return " ".join(combined_text)
+
+
+ # standard exact-match based full text search
+ def full_text_search(collection, query):
+     # drop any existing indexes and create a new one
+     collection.drop_indexes()
+     collection.create_index([('captions.original_text', 'text'), ('captions.translated_text', 'text')],
+                             name='captions')
+
+     # print(collection.index_information())
+
+     results = list(collection.find({"$text": {"$search": query}}))
+     return results
+
+ # approximate string matching (the $search stage requires a MongoDB Atlas Search index)
+ def fuzzy_search(collection, query):
+     # drop any existing indexes and create a new one
+     collection.drop_indexes()
+     collection.create_index([('captions.original_text', 'text'), ('captions.translated_text', 'text')],
+                             name='captions')
+
+     pipeline = [
+         {
+             "$search": {
+                 "text": {
+                     "query": query,
+                     "path": {"wildcard": "*"},
+                     "fuzzy": {}
+                 }
+             }
+         }
+     ]
+
+     collection_results = list(collection.aggregate(pipeline))
+
+     return collection_results
+
+
+ def summarise(collection, call_id, user_id, target_language):
+     # client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+     config = dotenv_values(".env")
+     client = OpenAI(api_key=config["OPENAI_API_KEY"])
+
+     # get caption text using call_id
+     caption_text = get_caption_text(collection, call_id, user_id)
+
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {
+                 "role": "user",
+                 "content": f"The following is an extract from a call transcript. Rewrite this as a structured, clear summary in {target_language}. \
+                     \n\nCall Transcript: \"\"\"\n{caption_text}\n\"\"\"\n"
+             }
+         ],
+         model="gpt-3.5-turbo",
+     )
+
+     # gpt-3.5-turbo has a 4096 token limit -> request will fail if exceeded
+     try:
+         result = chat_completion.choices[0].message.content.split(":")[1].strip()  # parse summary
+     except:
+         return None
+
+     # BO - add result to mongodb -> should be done asynchronously
+     # summary_payload = {"summaries": {user_id: result}}
+
+     update_result = collection.update_one({"call_id": call_id}, {"$set": {f"summaries.{user_id}": result}})
+
+     if update_result.modified_count == 0:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not updated!")
+
+     # if (existing_item := collection.find_one({"call_id": call_id})) is not None:
+     #     print(existing_item)
+
+     return result
+
+
+ def term_extraction(collection, call_id, user_id, target_language):
+     combined_text = get_caption_text(collection, call_id, user_id)
+
+     if len(combined_text) > 50:  # > min_caption_length -> poor term extraction on short transcripts
+         # Extract key terms from the concatenated caption field
+         key_terms = extract_terms(combined_text, target_language, len(combined_text))
+
+         update_result = collection.update_one({"call_id": call_id}, {"$set": {f"key_terms.{user_id}": key_terms}})
+
+         if update_result.modified_count == 0:
+             raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not updated!")
+
+         return key_terms
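One detail worth spelling out: summarise() and term_extraction() write with a dotted path ("summaries.<user_id>"), which sets a single key inside the embedded document instead of replacing the whole summaries field. A minimal sketch against a local MongoDB (names are illustrative):

```python
# Sketch of the dotted-path $set used above (local MongoDB, fabricated names).
from pymongo import MongoClient

calls = MongoClient()["IT-Cluster1"]["call_records"]
calls.update_one(
    {"call_id": "call-42"},
    {"$set": {"summaries.user-1": "short recap for user-1"}},
)
# Only summaries["user-1"] changes; any other user's summary is left intact.
```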
backend/mongodb/operations/users.py ADDED
@@ -0,0 +1,77 @@
+ from fastapi import Body, Request, HTTPException, status
+ from fastapi.encoders import jsonable_encoder
+ import sys
+ from ..models.users import User, UpdateUser
+ from bson import ObjectId
+ import re
+
+
+ def create_user(collection, user: User = Body(...)):
+     user = jsonable_encoder(user)
+     new_user = collection.insert_one(user)
+     created_user = collection.find_one({"_id": new_user.inserted_id})
+     print("NEW ID IS:.........", new_user.inserted_id)
+     return created_user
+
+
+ def list_users(collection, limit: int):
+     try:
+         users = list(collection.find(limit=limit))
+         return users
+     except:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No users found!")
+
+
+ def find_user(collection, user_id: str):
+     if (user := collection.find_one({"user_id": user_id})):
+         return user
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id {user_id} not found!")
+
+ def find_name_from_id(collection, user_id: str):
+     # find_one user record based on user id and project for user name
+     if (user_name := collection.find_one({"user_id": user_id}, {"name": 1, "_id": 0})):
+         return user_name['name']  # index name field from single-field record returned
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id {user_id} not found!")
+
+ def find_user_name(collection, name: str):
+     # search for name case-insensitively
+     if (user := collection.find_one({"name": re.compile('^' + re.escape(name) + '$', re.IGNORECASE)})):
+         return user
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with name {name} not found!")
+
+
+ def find_user_email(collection, email: str):
+     if (user := collection.find_one({"email": re.compile('^' + re.escape(email) + '$', re.IGNORECASE)})):
+         return user
+     else:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with Email Address {email} not found!")
+
+
+ ''' Update user record based on user object/json'''
+ def update_user(collection, user_id: str, user: UpdateUser):
+     try:
+         user = {k: v for k, v in user.model_dump().items() if v is not None}
+         if len(user) >= 1:
+             update_result = collection.update_one({"user_id": user_id}, {"$set": user})
+
+             if update_result.modified_count == 0:
+                 raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id: '{user_id}' not found or not updated!")
+
+         if (existing_users := collection.find_one({"user_id": user_id})) is not None:
+             return existing_users
+     except:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id: '{user_id}' not found or not updated!")
+
+
+ def delete_user(collection, user_id: str):
+     try:
+         deleted_user = collection.delete_one({"user_id": user_id})
+
+         if deleted_user.deleted_count == 1:
+             return f"User with user_id {user_id} deleted successfully"
+     except:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"User with user_id {user_id} not found!")
backend/pcmToWav.py ADDED
@@ -0,0 +1,34 @@
+ import wave
+ import os
+
+
+ basePath = os.path.expanduser("~/Desktop/")
+
+
+ def convert_pcm_to_wav():
+     # PCM file parameters (should match the parameters used to create the PCM file)
+     pcm_file = basePath + 'output.pcm'
+     wav_file = 'pcmconverted.wav'
+     sample_rate = 16000  # Example: 16000 Hz
+     channels = 1  # Example: 2 for stereo
+     sample_width = 2  # Example: 2 bytes (16 bits), change if your PCM format is different
+
+     # Read the PCM file and write to a WAV file
+     with open(pcm_file, 'rb') as pcmfile:
+         pcm_data = pcmfile.read()
+
+     with wave.open(wav_file, 'wb') as wavfile:
+         wavfile.setnchannels(channels)
+         wavfile.setsampwidth(sample_width)
+         wavfile.setframerate(sample_rate)
+         wavfile.writeframes(pcm_data)
+
+     print(f"Converted {pcm_file} to {wav_file}")
+
+ convert_pcm_to_wav()
+
+ # def generateCaptions(filepath):
+
+ # ! This might be redundant due to seamless-streaming
backend/preprocess_wav.py ADDED
@@ -0,0 +1,65 @@
+ import soundfile
+ import io
+ from typing import Any, Tuple, Union, Optional
+ import numpy as np
+ import torch
+
+ def preprocess_wav(data: Any, incoming_sample_rate) -> Tuple[np.ndarray, int]:
+     segment, sample_rate = soundfile.read(
+         io.BytesIO(data),
+         dtype="float32",
+         always_2d=True,
+         frames=-1,
+         start=0,
+         format="RAW",
+         subtype="PCM_16",
+         samplerate=incoming_sample_rate,
+         channels=1,
+     )
+     return segment, sample_rate
+
+ def convert_waveform(
+     waveform: Union[np.ndarray, torch.Tensor],
+     sample_rate: int,
+     normalize_volume: bool = False,
+     to_mono: bool = False,
+     to_sample_rate: Optional[int] = None,
+ ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
+     """convert a waveform:
+     - to a target sample rate
+     - from multi-channel to mono channel
+     - volume normalization
+
+     Args:
+         waveform (numpy.ndarray or torch.Tensor): 2D original waveform
+             (channels x length)
+         sample_rate (int): original sample rate
+         normalize_volume (bool): perform volume normalization
+         to_mono (bool): convert to mono channel if having multiple channels
+         to_sample_rate (Optional[int]): target sample rate
+     Returns:
+         waveform (numpy.ndarray): converted 2D waveform (channels x length)
+         sample_rate (float): target sample rate
+     """
+     try:
+         import torchaudio.sox_effects as ta_sox
+     except ImportError:
+         raise ImportError("Please install torchaudio: pip install torchaudio")
+
+     effects = []
+     if normalize_volume:
+         effects.append(["gain", "-n"])
+     if to_sample_rate is not None and to_sample_rate != sample_rate:
+         effects.append(["rate", f"{to_sample_rate}"])
+     if to_mono and waveform.shape[0] > 1:
+         effects.append(["channels", "1"])
+     if len(effects) > 0:
+         is_np_input = isinstance(waveform, np.ndarray)
+         _waveform = torch.from_numpy(waveform) if is_np_input else waveform
+         converted, converted_sample_rate = ta_sox.apply_effects_tensor(
+             _waveform, sample_rate, effects
+         )
+         if is_np_input:
+             converted = converted.numpy()
+         return converted, converted_sample_rate
+     return waveform, sample_rate
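A hedged usage sketch of the two helpers; the byte payload is fabricated silence, and convert_waveform relies on torchaudio's sox-effects backend, which is unavailable on some platforms and newer torchaudio builds.

```python
# Illustrative round-trip: decode raw PCM_16, then resample with sox effects.
import numpy as np
from preprocess_wav import preprocess_wav, convert_waveform

raw = b"\x00\x00" * 44100                      # 1 s of silent PCM_16 at 44.1 kHz
segment, sr = preprocess_wav(raw, incoming_sample_rate=44100)
print(segment.shape, sr)                       # (44100, 1) 44100

# convert_waveform expects channels-first (channels x length), hence the transpose
wav, sr = convert_waveform(segment.T, sr, to_mono=True, to_sample_rate=16000)
print(wav.shape, sr)                           # (1, 16000) 16000
```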
backend/requirements.txt ADDED
@@ -0,0 +1,28 @@
+ colorlog==6.8.2
+ contextlib2==21.6.0
+ fastapi==0.110.1
+ g2p_en==2.1.0
+ matplotlib==3.7.0
+ numpy==1.24.2
+ openai==1.20.0
+ protobuf==5.26.1
+ pydantic==2.7.0
+ pydub==0.25.1
+ pymongo==4.6.2
+ PySoundFile==0.9.0.post1
+ python-dotenv==1.0.1
+ python-socketio==5.9.0
+ pymongo==4.6.2
+ Requests==2.31.0
+ sentencepiece==0.1.99
+ simuleval==1.1.4
+ soundfile==0.12.1
+ spacy==3.7.4
+ pytextrank==3.3.0
+ torch==2.1.2
+ torchaudio==2.1.2
+ #transformers==4.20.1
+ uvicorn==0.29.0
+ vad==1.0.2
+ hf_transfer==0.1.4
+ huggingface_hub==0.19.4
backend/routes/__init__.py ADDED
@@ -0,0 +1 @@
+ from .routing import router
backend/routes/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (235 Bytes)
 
backend/routes/__pycache__/routing.cpython-310.pyc ADDED
Binary file (375 Bytes)
 
backend/routes/routing.py ADDED
@@ -0,0 +1,9 @@
+ from fastapi import APIRouter
+ import sys
+ # sys.path.append('/Users/benolojo/DCU/CA4/ca400_FinalYearProject/2024-ca400-olojob2-majdap2/src/backend/src/')
+ from mongodb.endpoints import users, calls
+
+ router = APIRouter()
+ router.include_router(calls.router)
+ router.include_router(users.router)
+ # router.include_router(transcripts.router)
backend/seamless/__init__.py ADDED
File without changes
backend/seamless/room.py ADDED
@@ -0,0 +1,64 @@
+ # import json
+ import uuid
+
+
+ class Room:
+     def __init__(self, room_id) -> None:
+         self.room_id = room_id
+         # members is a dict from client_id to Member
+         self.members = {}
+
+         # listeners and speakers are lists of client_id's
+         self.listeners = []
+         self.speakers = []
+
+     def __str__(self) -> str:
+         return f"Room {self.room_id} ({len(self.members)} member{'s' if len(self.members) != 1 else ''})"
+
+     def to_json(self):
+         varsResult = vars(self)
+         # Remember: result is just a shallow copy, so result.members === self.members
+         # Because of that, we need to jsonify self.members without writing over result.members,
+         # which we do here via dictionary unpacking (the ** operator)
+         result = {
+             **varsResult,
+             "members": {key: value.to_json() for (key, value) in self.members.items()},
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+         return result
+
+     def get_active_connections(self):
+         return len(
+             [m for m in self.members.values() if m.connection_status == "connected"]
+         )
+
+     def get_active_transcoders(self):
+         return len([m for m in self.members.values() if m.transcoder is not None])
+
+     def get_room_status_dict(self):
+         return {
+             "activeConnections": self.get_active_connections(),
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+
+ class Member:
+     def __init__(self, client_id, session_id, name) -> None:
+         self.client_id = client_id
+         self.session_id = session_id
+         self.name = name
+         self.connection_status = "connected"
+         self.transcoder = None
+         self.requested_output_type = None
+         self.transcoder_dynamic_config = None
+
+     def __str__(self) -> str:
+         return f"{self.name} (id: {self.client_id[:4]}...) ({self.connection_status})"
+
+     def to_json(self):
+         self_vars = vars(self)
+         return {
+             **self_vars,
+             "transcoder": self.transcoder is not None,
+         }
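A short usage sketch of these containers (the IDs and names are made up; nothing below is prescribed by the diff beyond the two classes):

    room = Room("room-1")
    alice = Member(client_id="abcd1234", session_id="sess-1", name="Alice")
    room.members[alice.client_id] = alice
    room.speakers.append(alice.client_id)

    print(room)                          # Room room-1 (1 member)
    print(room.get_room_status_dict())   # {'activeConnections': 1, 'activeTranscoders': 0}
    print(room.to_json()["members"]["abcd1234"]["transcoder"])  # False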
backend/seamless/simuleval_agent_directory.py ADDED
@@ -0,0 +1,171 @@
+ # Creates a directory in which to look up available agents
+
+ import os
+ from typing import List, Optional
+ from seamless.simuleval_transcoder import SimulevalTranscoder
+ import json
+ import logging
+
+ logger = logging.getLogger("gunicorn")
+
+ # fmt: off
+ M4T_P0_LANGS = [
+     "eng",
+     "arb", "ben", "cat", "ces", "cmn", "cym", "dan",
+     "deu", "est", "fin", "fra", "hin", "ind", "ita",
+     "jpn", "kor", "mlt", "nld", "pes", "pol", "por",
+     "ron", "rus", "slk", "spa", "swe", "swh", "tel",
+     "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie",
+ ]
+ # fmt: on
+
+
+ class NoAvailableAgentException(Exception):
+     pass
+
+
+ class AgentWithInfo:
+     def __init__(
+         self,
+         agent,
+         name: str,
+         modalities: List[str],
+         target_langs: List[str],
+         # Supported dynamic params are defined in StreamingTypes.ts
+         dynamic_params: Optional[List[str]] = None,
+         description="",
+         has_expressive: Optional[bool] = None,
+     ):
+         self.agent = agent
+         self.has_expressive = has_expressive
+         self.name = name
+         self.description = description
+         self.modalities = modalities
+         self.target_langs = target_langs
+         # Resolve None to a fresh list to avoid a shared mutable default
+         self.dynamic_params = dynamic_params if dynamic_params is not None else []
+
+     def get_capabilities_for_json(self):
+         return {
+             "name": self.name,
+             "description": self.description,
+             "modalities": self.modalities,
+             "targetLangs": self.target_langs,
+             "dynamicParams": self.dynamic_params,
+         }
+
+     @classmethod
+     def load_from_json(cls, config: str):
+         """
+         Takes in a JSON array of models to load, e.g.
+         [{"name": "s2s_m4t_emma-unity2_multidomain_v0.1", "description": "M4T model that supports simultaneous S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]},
+          {"name": "s2s_m4t_expr-emma_v0.1", "description": "ES-EN expressive model that supports S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]}]
+         """
+         configs = json.loads(config)
+         agents = []
+         for model_config in configs:
+             agent = SimulevalTranscoder.build_agent(model_config["name"])
+             agents.append(
+                 AgentWithInfo(
+                     agent=agent,
+                     name=model_config["name"],
+                     modalities=model_config["modalities"],
+                     target_langs=model_config["targetLangs"],
+                 )
+             )
+         return agents
+
+
+ class SimulevalAgentDirectory:
+     # Available models. These are the directories where the models can be found, and also serve as an ID for the model.
+     seamless_streaming_agent = "SeamlessStreaming"
+     seamless_agent = "Seamless"
+
+     def __init__(self):
+         self.agents = []
+         self.did_build_and_add_agents = False
+
+     def add_agent(self, agent: AgentWithInfo):
+         self.agents.append(agent)
+
+     def build_agent_if_available(self, model_id, config_name=None):
+         agent = None
+         try:
+             if config_name is not None:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                     config_name=config_name,
+                 )
+             else:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                 )
+         except Exception as e:
+             from fairseq2.assets.error import AssetError
+             logger.warning("Failed to build agent %s: %s" % (model_id, e))
+             if isinstance(e, AssetError):
+                 logger.warning(
+                     "Please download gated assets and set `gated_model_dir` in the config"
+                 )
+             raise e
+
+         return agent
+
+     def build_and_add_agents(self, models_override=None):
+         if self.did_build_and_add_agents:
+             return
+
+         if models_override is not None:
+             agent_infos = AgentWithInfo.load_from_json(models_override)
+             for agent_info in agent_infos:
+                 self.add_agent(agent_info)
+         else:
+             s2s_agent = None
+             if os.environ.get("USE_EXPRESSIVE_MODEL", "0") == "1":
+                 logger.info("Building expressive model...")
+                 s2s_agent = self.build_agent_if_available(
+                     SimulevalAgentDirectory.seamless_agent,
+                     config_name="vad_s2st_sc_24khz_main.yaml",
+                 )
+                 has_expressive = True
+             else:
+                 logger.info("Building non-expressive model...")
+                 s2s_agent = self.build_agent_if_available(
+                     SimulevalAgentDirectory.seamless_streaming_agent,
+                     config_name="vad_s2st_sc_main.yaml",
+                 )
+                 has_expressive = False
+
+             if s2s_agent:
+                 self.add_agent(
+                     AgentWithInfo(
+                         agent=s2s_agent,
+                         name=SimulevalAgentDirectory.seamless_streaming_agent,
+                         modalities=["s2t", "s2s"],
+                         target_langs=M4T_P0_LANGS,
+                         dynamic_params=["expressive"],
+                         description="multilingual expressive model that supports S2S and S2T",
+                         has_expressive=has_expressive,
+                     )
+                 )
+
+         if len(self.agents) == 0:
+             logger.error(
+                 "No agents were loaded. This likely means you are missing the actual model files specified in simuleval_agent_directory."
+             )
+
+         self.did_build_and_add_agents = True
+
+     def get_agent(self, name):
+         for agent in self.agents:
+             if agent.name == name:
+                 return agent
+         return None
+
+     def get_agent_or_throw(self, name):
+         agent = self.get_agent(name)
+         if agent is None:
+             raise NoAvailableAgentException("No agent found with name=%s" % (name,))
+         return agent
+
+     def get_agents_capabilities_list_for_json(self):
+         return [agent.get_capabilities_for_json() for agent in self.agents]
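A sketch of how the directory is presumably driven at server startup; it assumes the model directories under `backend/models/` are populated (see the YAML configs added earlier in this commit):

    directory = SimulevalAgentDirectory()
    directory.build_and_add_agents()  # picks Seamless vs. SeamlessStreaming via USE_EXPRESSIVE_MODEL

    agent_info = directory.get_agent_or_throw(SimulevalAgentDirectory.seamless_streaming_agent)
    print(directory.get_agents_capabilities_list_for_json())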
backend/seamless/simuleval_transcoder.py ADDED
@@ -0,0 +1,428 @@
+ from simuleval.utils.agent import build_system_from_dir
+ from typing import Any, List, Optional, Tuple, Union
+ import numpy as np
+ import soundfile
+ import io
+ import asyncio
+ from simuleval.agents.pipeline import TreeAgentPipeline
+ from simuleval.agents.states import AgentStates
+ from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
+ import threading
+ import math
+ import logging
+ import sys
+ from pathlib import Path
+ import time
+ from g2p_en import G2p
+ import torch
+ import traceback
+ import random
+ import colorlog
+
+ from .speech_and_text_output import SpeechAndTextOutput
+
+ MODEL_SAMPLE_RATE = 16_000
+
+ logger = logging.getLogger(__name__)
+ # logger.propagate = False
+ handler = colorlog.StreamHandler(stream=sys.stdout)
+ formatter = colorlog.ColoredFormatter(
+     "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
+     reset=True,
+     log_colors={
+         "DEBUG": "cyan",
+         "INFO": "green",
+         "WARNING": "yellow",
+         "ERROR": "red",
+         "CRITICAL": "red,bg_white",
+     },
+ )
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+ logger.setLevel(logging.WARNING)
+
+
+ class OutputSegments:
+     def __init__(self, segments: Union[List[Segment], Segment]):
+         if isinstance(segments, Segment):
+             segments = [segments]
+         self.segments: List[Segment] = [s for s in segments]
+
+     @property
+     def is_empty(self):
+         return all(segment.is_empty for segment in self.segments)
+
+     @property
+     def finished(self):
+         return all(segment.finished for segment in self.segments)
+
+     def compute_length(self, g2p):
+         lengths = []
+         for segment in self.segments:
+             if segment.data_type == "text":
+                 lengths.append(len([x for x in g2p(segment.content) if x != " "]))
+             elif segment.data_type == "speech":
+                 lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
+             elif isinstance(segment, EmptySegment):
+                 continue
+             else:
+                 logger.warning(
+                     f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
+                 )
+         # Guard against an all-empty buffer: max() on an empty list raises ValueError
+         return max(lengths) if lengths else 0
+
+     @classmethod
+     def join_output_buffer(
+         cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
+     ):
+         num_segments = len(buffer[0])
+         for i in range(num_segments):
+             segment_list = [
+                 buffer[j][i]
+                 for j in range(len(buffer))
+                 if buffer[j][i].data_type is not None
+             ]
+             if len(segment_list) == 0:
+                 continue
+             if len(set(segment.data_type for segment in segment_list)) != 1:
+                 logger.warning(
+                     f"Data type mismatch at {i}: {set(segment.data_type for segment in segment_list)}"
+                 )
+                 continue
+             data_type = segment_list[0].data_type
+             if data_type == "text":
+                 if output.text is not None:
+                     logger.warning("Multiple text outputs, overwriting!")
+                 output.text = " ".join([segment.content for segment in segment_list])
+             elif data_type == "speech":
+                 if output.speech_samples is not None:
+                     logger.warning("Multiple speech outputs, overwriting!")
+                 speech_out = []
+                 for segment in segment_list:
+                     speech_out += segment.content
+                 output.speech_samples = speech_out
+                 output.speech_sample_rate = segment.sample_rate
+             elif isinstance(segment_list[0], EmptySegment):
+                 continue
+             else:
+                 logger.warning(
+                     f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text'"
+                 )
+
+         return output
+
+     def __repr__(self) -> str:
+         repr_str = str(self.segments)
+         return f"{self.__class__.__name__}(\n\t{repr_str}\n)"
+
+
+ class SimulevalTranscoder:
+     def __init__(self, agent, sample_rate, debug, buffer_limit):
+         self.agent = agent.agent
+         self.has_expressive = agent.has_expressive
+         self.input_queue = asyncio.Queue()
+         self.output_queue = asyncio.Queue()
+         self.states = self.agent.build_states()
+         if debug:
+             self.get_states_root().debug = True
+         self.incoming_sample_rate = sample_rate
+         self.close = False
+         self.g2p = G2p()
+
+         # buffer all outgoing translations within this amount of time
+         self.output_buffer_idle_ms = 5000
+         self.output_buffer_size_limit = (
+             buffer_limit  # phonemes for text, seconds for speech
+         )
+         self.output_buffer_cur_size = 0
+         self.output_buffer: List[List[Segment]] = []
+         self.speech_output_sample_rate = None
+
+         self.last_output_ts = time.time() * 1000
+         self.timeout_ms = (
+             30000  # close the transcoder thread after this amount of silence
+         )
+         self.first_input_ts = None
+         self.first_output_ts = None
+         self.debug = debug
+         self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
+         if self.debug:
+             debug_folder = Path(__file__).resolve().parent.parent / "debug"
+             self.test_incoming_wav = soundfile.SoundFile(
+                 debug_folder / f"{self.debug_ts}_test_incoming.wav",
+                 mode="w+",
+                 format="WAV",
+                 subtype="PCM_16",
+                 samplerate=self.incoming_sample_rate,
+                 channels=1,
+             )
+             self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
+                 debug_folder / f"{self.debug_ts}_test_input_segments.wav",
+                 mode="w+",
+                 format="WAV",
+                 samplerate=MODEL_SAMPLE_RATE,
+                 channels=1,
+             )
+
+     def get_states_root(self) -> AgentStates:
+         if isinstance(self.agent, TreeAgentPipeline):
+             # self.states is a dict
+             return self.states[self.agent.source_module]
+         else:
+             # self.states is a list
+             return self.states[0]
+
+     def reset_states(self):
+         if isinstance(self.agent, TreeAgentPipeline):
+             states_iter = self.states.values()
+         else:
+             states_iter = self.states
+         for state in states_iter:
+             state.reset()
+
+     def debug_log(self, *args):
+         if self.debug:
+             logger.info(*args)
+
+     @classmethod
+     def build_agent(cls, model_path, config_name=None):
+         logger.info(f"Building simuleval agent: {model_path}, {config_name}")
+         agent = build_system_from_dir(
+             Path(__file__).resolve().parent.parent / f"models/{model_path}",
+             config_name=config_name,
+         )
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         agent.to(device, fp16=True)
+         logger.info(
+             f"Successfully built simuleval agent {model_path} on device {device}"
+         )
+
+         return agent
+
+     def process_incoming_bytes(self, incoming_bytes, dynamic_config):
+         # TODO: We probably want to do some validation on dynamic_config to ensure it has what we need
+         segment, sr = self._preprocess_wav(incoming_bytes)
+         segment = SpeechSegment(
+             content=segment,
+             sample_rate=sr,
+             tgt_lang=dynamic_config.get("targetLanguage"),
+             config=dynamic_config,
+         )
+         if dynamic_config.get("expressive") is True and self.has_expressive is False:
+             logger.warning(
+                 "Passing 'expressive' but the agent does not support expressive output!"
+             )
+         # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
+         self.input_queue.put_nowait(segment)
+
+     def get_input_segment(self):
+         if self.input_queue.empty():
+             return None
+         chunk = self.input_queue.get_nowait()
+         self.input_queue.task_done()
+         return chunk
+
+     def convert_waveform(
+         self,
+         waveform: Union[np.ndarray, torch.Tensor],
+         sample_rate: int,
+         normalize_volume: bool = False,
+         to_mono: bool = False,
+         to_sample_rate: Optional[int] = None,
+     ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
+         """convert a waveform:
+         - to a target sample rate
+         - from multi-channel to mono channel
+         - volume normalization
+
+         Args:
+             waveform (numpy.ndarray or torch.Tensor): 2D original waveform
+                 (channels x length)
+             sample_rate (int): original sample rate
+             normalize_volume (bool): perform volume normalization
+             to_mono (bool): convert to mono channel if having multiple channels
+             to_sample_rate (Optional[int]): target sample rate
+         Returns:
+             waveform (numpy.ndarray): converted 2D waveform (channels x length)
+             sample_rate (float): target sample rate
+         """
+         try:
+             import torchaudio.sox_effects as ta_sox
+         except ImportError:
+             raise ImportError("Please install torchaudio: pip install torchaudio")
+
+         effects = []
+         if normalize_volume:
+             effects.append(["gain", "-n"])
+         if to_sample_rate is not None and to_sample_rate != sample_rate:
+             effects.append(["rate", f"{to_sample_rate}"])
+         if to_mono and waveform.shape[0] > 1:
+             effects.append(["channels", "1"])
+         if len(effects) > 0:
+             is_np_input = isinstance(waveform, np.ndarray)
+             _waveform = torch.from_numpy(waveform) if is_np_input else waveform
+             converted, converted_sample_rate = ta_sox.apply_effects_tensor(
+                 _waveform, sample_rate, effects
+             )
+             if is_np_input:
+                 converted = converted.numpy()
+             return converted, converted_sample_rate
+         return waveform, sample_rate
+
+     def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
+         segment, sample_rate = soundfile.read(
+             io.BytesIO(data),
+             dtype="float32",
+             always_2d=True,
+             frames=-1,
+             start=0,
+             format="RAW",
+             subtype="PCM_16",
+             samplerate=self.incoming_sample_rate,
+             channels=1,
+         )
+         if self.debug:
+             self.test_incoming_wav.seek(0, soundfile.SEEK_END)
+             self.test_incoming_wav.write(segment)
+
+         segment = segment.T
+         segment, new_sample_rate = self.convert_waveform(
+             segment,
+             sample_rate,
+             normalize_volume=False,
+             to_mono=True,
+             to_sample_rate=MODEL_SAMPLE_RATE,
+         )
+
+         assert MODEL_SAMPLE_RATE == new_sample_rate
+         segment = segment.squeeze(axis=0)
+         return segment, new_sample_rate
+
+     def process_pipeline_impl(self, input_segment):
+         try:
+             with torch.no_grad():
+                 output_segment = OutputSegments(
+                     self.agent.pushpop(input_segment, self.states)
+                 )
+             if (
+                 self.get_states_root().first_input_ts is not None
+                 and self.first_input_ts is None
+             ):
+                 # TODO: this is hacky
+                 self.first_input_ts = self.get_states_root().first_input_ts
+
+             if not output_segment.is_empty:
+                 self.output_queue.put_nowait(output_segment)
+
+             if output_segment.finished:
+                 self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
+
+                 self.reset_states()
+
+                 if self.debug:
+                     # when we rebuild states, this value is reset to whatever
+                     # is in the system dir config, which defaults debug=False.
+                     self.get_states_root().debug = True
+         except Exception as e:
+             logger.error(f"Got exception while processing pipeline: {e}")
+             traceback.print_exc()
+         return input_segment
+
+     def process_pipeline_loop(self):
+         if self.close:
+             return  # closes the thread
+
+         self.debug_log("processing_pipeline")
+         while not self.close:
+             input_segment = self.get_input_segment()
+             if input_segment is None:
+                 if self.get_states_root().is_fresh_state:  # TODO: this is hacky
+                     time.sleep(0.3)
+                 else:
+                     time.sleep(0.03)
+                 continue
+             self.process_pipeline_impl(input_segment)
+         self.debug_log("finished processing_pipeline")
+
+     def process_pipeline_once(self):
+         if self.close:
+             return
+
+         self.debug_log("processing pipeline once")
+         input_segment = self.get_input_segment()
+         if input_segment is None:
+             return
+         self.process_pipeline_impl(input_segment)
+         self.debug_log("finished processing_pipeline_once")
+
+     def get_output_segment(self):
+         if self.output_queue.empty():
+             return None
+
+         output_chunk = self.output_queue.get_nowait()
+         self.output_queue.task_done()
+         return output_chunk
+
+     def start(self):
+         self.debug_log("starting transcoder in a thread")
+         threading.Thread(target=self.process_pipeline_loop).start()
+
+     def first_translation_time(self):
+         return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
+
+     def get_buffered_output(self) -> SpeechAndTextOutput:
+         now = time.time() * 1000
+         self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
+         while not self.output_queue.empty():
+             tmp_out = self.get_output_segment()
+             if tmp_out and tmp_out.compute_length(self.g2p) > 0:
+                 if len(self.output_buffer) == 0:
+                     self.last_output_ts = now
+                 self._populate_output_buffer(tmp_out)
+                 self._increment_output_buffer_size(tmp_out)
+
+                 if tmp_out.finished:
+                     self.debug_log("tmp_out.finished")
+                     res = self._gather_output_buffer_data(final=True)
+                     self.debug_log(f"gathered output data: {res}")
+                     self.output_buffer = []
+                     self.output_buffer_cur_size = 0
+                     self.last_output_ts = now
+                     self.first_output_ts = now
+                     return res
+             else:
+                 self.debug_log("tmp_out.compute_length is not > 0")
+
+         if len(self.output_buffer) > 0 and (
+             now - self.last_output_ts >= self.output_buffer_idle_ms
+             or self.output_buffer_cur_size >= self.output_buffer_size_limit
+         ):
+             self.debug_log(
+                 "[get_buffered_output] output_buffer is not empty. getting res to return."
+             )
+             self.last_output_ts = now
+             res = self._gather_output_buffer_data(final=False)
+             self.debug_log(f"gathered output data: {res}")
+             self.output_buffer = []
+             self.output_buffer_cur_size = 0
+             self.first_output_ts = now
+             return res
+         else:
+             self.debug_log("[get_buffered_output] output_buffer is empty...")
+             return None
+
+     def _gather_output_buffer_data(self, final):
+         output = SpeechAndTextOutput()
+         output.final = final
+         output = OutputSegments.join_output_buffer(self.output_buffer, output)
+         return output
+
+     def _increment_output_buffer_size(self, segment: OutputSegments):
+         self.output_buffer_cur_size += segment.compute_length(self.g2p)
+
+     def _populate_output_buffer(self, segment: OutputSegments):
+         self.output_buffer.append(segment.segments)
+
+     def _compute_phoneme_count(self, string: str) -> int:
+         return len([x for x in self.g2p(string) if x != " "])
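Pieced together from the methods above, the intended call pattern looks roughly like this; the 48 kHz input rate, the buffer limit, and the `pcm_bytes` placeholder are illustrative assumptions, not values from this file:

    agent_info = directory.get_agent_or_throw("SeamlessStreaming")  # AgentWithInfo from the directory
    transcoder = SimulevalTranscoder(agent_info, sample_rate=48000, debug=False, buffer_limit=1)
    transcoder.start()  # runs process_pipeline_loop in a background thread

    # feed PCM-16 bytes as they arrive from the client
    transcoder.process_incoming_bytes(pcm_bytes, {"targetLanguage": "spa"})

    # poll for buffered translations, e.g. on a timer
    out = transcoder.get_buffered_output()
    if out is not None:
        print(out.text, out.speech_sample_rate)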
backend/seamless/speech_and_text_output.py ADDED
@@ -0,0 +1,15 @@
+ # Provides a container to return both speech and text output from our model at the same time
+
+
+ class SpeechAndTextOutput:
+     def __init__(
+         self,
+         text: str = None,
+         speech_samples: list = None,
+         speech_sample_rate: float = None,
+         final: bool = False,
+     ):
+         self.text = text
+         self.speech_samples = speech_samples
+         self.speech_sample_rate = speech_sample_rate
+         self.final = final
backend/seamless/transcoder_helpers.py ADDED
@@ -0,0 +1,43 @@
+ import logging
+
+ logger = logging.getLogger("gunicorn")
+
+
+ def get_transcoder_output_events(transcoder) -> list:
+     speech_and_text_output = transcoder.get_buffered_output()
+     if speech_and_text_output is None:
+         logger.debug("No output from transcoder.get_buffered_output()")
+         return []
+
+     logger.debug(f"We DID get output from the transcoder! {speech_and_text_output}")
+
+     lat = None
+
+     events = []
+
+     if speech_and_text_output.speech_samples:
+         events.append(
+             {
+                 "event": "translation_speech",
+                 "payload": speech_and_text_output.speech_samples,
+                 "sample_rate": speech_and_text_output.speech_sample_rate,
+             }
+         )
+
+     if speech_and_text_output.text:
+         events.append(
+             {
+                 "event": "translation_text",
+                 "payload": speech_and_text_output.text,
+             }
+         )
+
+     for e in events:
+         e["eos"] = speech_and_text_output.final
+
+     # if not latency_sent:
+     #     lat = transcoder.first_translation_time()
+     #     latency_sent = True
+     #     to_send["latency"] = lat
+
+     return events
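These event dicts are presumably relayed to clients over Socket.IO in `main.py`; a hedged sketch of such a consumer (the `sio` server, `member`, and `room_id` names are assumptions, not taken from this file):

    # inside an async polling task (illustrative only)
    for event in get_transcoder_output_events(member.transcoder):
        await sio.emit(event["event"], event, to=room_id)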
backend/seamless_utils.py ADDED
@@ -0,0 +1,210 @@
+
+ # base seamless imports
+ # ---------------------------------
+ import io
+ import json
+ import matplotlib as mpl
+ import matplotlib.pyplot as plt
+ import mmap
+ import numpy as np
+ import soundfile
+ import torchaudio
+ import torch
+ from pydub import AudioSegment
+ # ---------------------------------
+ # seamless-streaming specific imports
+ # ---------------------------------
+ import math
+ from simuleval.data.segments import SpeechSegment, EmptySegment
+ from seamless_communication.streaming.agents.seamless_streaming_s2st import (
+     SeamlessStreamingS2STVADAgent,
+ )
+
+ from simuleval.utils.arguments import cli_argument_list
+ from simuleval import options
+
+
+ from typing import Union, List
+ from simuleval.data.segments import Segment, TextSegment
+ from simuleval.agents.pipeline import TreeAgentPipeline
+ from simuleval.agents.states import AgentStates
+ # ---------------------------------
+ # seamless setup
+ # source: https://colab.research.google.com/github/kauterry/seamless_communication/blob/main/Seamless_Tutorial.ipynb?
+ SAMPLE_RATE = 16000
+
+ # PM - This class is used to simulate the audio frontend in the seamless streaming pipeline;
+ # we need to replace this with the actual audio frontend
+ # TODO: replacement class that takes in PCM-16 bytes and returns SpeechSegment
+ class AudioFrontEnd:
+     def __init__(self, wav_file, segment_size) -> None:
+         self.samples, self.sample_rate = soundfile.read(wav_file)
+         print(self.sample_rate, "sample rate")
+         assert self.sample_rate == SAMPLE_RATE
+         # print(len(self.samples), self.samples[:100])
+         self.samples = self.samples  # .tolist()
+         self.segment_size = segment_size
+         self.step = 0
+
+     def send_segment(self):
+         """
+         This is the front-end logic in simuleval instance.py
+         """
+
+         num_samples = math.ceil(self.segment_size / 1000 * self.sample_rate)
+
+         if self.step < len(self.samples):
+             if self.step + num_samples >= len(self.samples):
+                 samples = self.samples[self.step:]
+                 is_finished = True
+             else:
+                 samples = self.samples[self.step : self.step + num_samples]
+                 is_finished = False
+             # Advance the read cursor only; trimming self.samples here as well
+             # would skip audio, since self.step already tracks what was consumed.
+             self.step = min(self.step + num_samples, len(self.samples))
+             segment = SpeechSegment(
+                 content=samples,
+                 sample_rate=self.sample_rate,
+                 finished=is_finished,
+             )
+         else:
+             # Finish reading this audio
+             segment = EmptySegment(
+                 finished=True,
+             )
+             self.step = 0
+             self.samples = []
+         return segment
+
+         # samples = self.samples[:num_samples]
+         # self.samples = self.samples[num_samples:]
+         # segment = SpeechSegment(
+         #     content=samples,
+         #     sample_rate=self.sample_rate,
+         #     finished=False,
+         # )
+
+     def add_segments(self, wav):
+         new_samples, _ = soundfile.read(wav)
+         self.samples = np.concatenate((self.samples, new_samples))
+
+
+ class OutputSegments:
+     def __init__(self, segments: Union[List[Segment], Segment]):
+         if isinstance(segments, Segment):
+             segments = [segments]
+         self.segments: List[Segment] = [s for s in segments]
+
+     @property
+     def is_empty(self):
+         return all(segment.is_empty for segment in self.segments)
+
+     @property
+     def finished(self):
+         return all(segment.finished for segment in self.segments)
+
+
+ def get_audiosegment(samples, sr):
+     b = io.BytesIO()
+     soundfile.write(b, samples, samplerate=sr, format="wav")
+     b.seek(0)
+     return AudioSegment.from_file(b)
+
+
+ def reset_states(system, states):
+     if isinstance(system, TreeAgentPipeline):
+         states_iter = states.values()
+     else:
+         states_iter = states
+     for state in states_iter:
+         state.reset()
+
+
+ def get_states_root(system, states) -> AgentStates:
+     if isinstance(system, TreeAgentPipeline):
+         # states is a dict
+         return states[system.source_module]
+     else:
+         # states is a list
+         return states[0]
+
+
+ def build_streaming_system(model_configs, agent_class):
+     parser = options.general_parser()
+     parser.add_argument("-f", "--f", help="a dummy argument to fool ipython", default="1")
+
+     agent_class.add_args(parser)
+     args, _ = parser.parse_known_args(cli_argument_list(model_configs))
+     system = agent_class.from_args(args)
+     return system
+
+
+ def run_streaming_inference(system, audio_frontend, system_states, tgt_lang):
+     # NOTE: Here for visualization, we calculate delays offset from audio
+     # *BEFORE* VAD segmentation.
+     # In contrast for SimulEval evaluation, we assume audios are pre-segmented,
+     # and Average Lagging, End Offset metrics are based on those pre-segmented audios.
+     # Thus, delays here are *NOT* comparable to SimulEval per-segment delays
+     delays = {"s2st": [], "s2tt": []}
+     prediction_lists = {"s2st": [], "s2tt": []}
+     speech_durations = []
+     curr_delay = 0
+     target_sample_rate = None
+
+     while True:
+         input_segment = audio_frontend.send_segment()
+         input_segment.tgt_lang = tgt_lang
+         curr_delay += len(input_segment.content) / SAMPLE_RATE * 1000
+         if input_segment.finished:
+             # a hack, we expect a real stream to end with silence
+             get_states_root(system, system_states).source_finished = True
+         # Translation happens here
+         if isinstance(input_segment, EmptySegment):
+             return None, None, None, None
+         output_segments = OutputSegments(system.pushpop(input_segment, system_states))
+         if not output_segments.is_empty:
+             for segment in output_segments.segments:
+                 # NOTE: another difference from SimulEval evaluation -
+                 # delays are accumulated per-token
+                 if isinstance(segment, SpeechSegment):
+                     pred_duration = 1000 * len(segment.content) / segment.sample_rate
+                     speech_durations.append(pred_duration)
+                     delays["s2st"].append(curr_delay)
+                     prediction_lists["s2st"].append(segment.content)
+                     target_sample_rate = segment.sample_rate
+                 elif isinstance(segment, TextSegment):
+                     delays["s2tt"].append(curr_delay)
+                     prediction_lists["s2tt"].append(segment.content)
+                     print(curr_delay, segment.content)
+         if output_segments.finished:
+             reset_states(system, system_states)
+         if input_segment.finished:
+             # an assumption of SimulEval agents -
+             # once source_finished=True, generate until output translation is finished
+             break
+     return delays, prediction_lists, speech_durations, target_sample_rate
+
+
+ def get_s2st_delayed_targets(delays, target_sample_rate, prediction_lists, speech_durations):
+     # calculate intervals + durations for s2st
+     intervals = []
+
+     start = prev_end = prediction_offset = delays["s2st"][0]
+     target_samples = [0.0] * int(target_sample_rate * prediction_offset / 1000)
+
+     for i, delay in enumerate(delays["s2st"]):
+         start = max(prev_end, delay)
+
+         if start > prev_end:
+             # Wait source speech, add discontinuity with silence
+             target_samples += [0.0] * int(
+                 target_sample_rate * (start - prev_end) / 1000
+             )
+
+         target_samples += prediction_lists["s2st"][i]
+         duration = speech_durations[i]
+         prev_end = start + duration
+         intervals.append([start, duration])
+     return target_samples, intervals
+
Binary file (6.82 kB). View file
 
backend/tests/__pycache__/test_main.cpython-310-pytest-8.1.1.pyc ADDED
Binary file (3.38 kB). View file
 
backend/tests/__pycache__/test_main.cpython-310.pyc ADDED
Binary file (2.2 kB). View file
 
backend/tests/silence.wav ADDED
Binary file (302 kB). View file
 
backend/tests/speaking.wav ADDED
Binary file (255 kB). View file
 
backend/tests/test_client.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import wave
+ import pytest
+ import torchaudio
+ import sys
+
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ parent_dir = os.path.dirname(current_dir)
+ sys.path.append(parent_dir)
+ from Client import Client
+
+
+ @pytest.fixture
+ def mock_client():
+     client = Client("test_sid", "test_client_id", original_sr=44100)
+     return client
+
+ def test_client_init(mock_client):
+     assert mock_client.sid == "test_sid"
+     assert mock_client.client_id == "test_client_id"
+     assert mock_client.call_id is None
+     assert mock_client.buffer == bytearray()
+     assert mock_client.output_path == "test_sid_output_audio.wav"
+     assert mock_client.target_language is None
+     assert mock_client.original_sr == 44100
+     assert mock_client.vad.sample_rate == 16000
+     assert mock_client.vad.frame_length == 25
+     assert mock_client.vad.frame_shift == 20
+     assert mock_client.vad.energy_threshold == 0.05
+     assert mock_client.vad.pre_emphasis == 0.95
+
+ def test_client_add_bytes(mock_client):
+     mock_client.add_bytes(b"test")
+     assert mock_client.buffer == b"test"
+
+ def test_client_resample_and_clear(mock_client):
+     location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+     speaking_bytes = wave.open(location + "/speaking.wav", "rb").readframes(-1)
+     mock_client.add_bytes(speaking_bytes)
+     resampled_waveform = mock_client.resample_and_clear()
+     torchaudio.save(location + "/testoutput.wav", resampled_waveform, 16000)
+     with wave.open(location + "/testoutput.wav", "rb") as wf:
+         sample_rate = wf.getframerate()
+     assert mock_client.buffer == bytearray()
+     assert sample_rate == 16000
+
+ def test_client_vad(mock_client):
+     location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+     speaking_bytes = wave.open(location + "/speaking.wav", "rb").readframes(-1)
+     mock_client.add_bytes(speaking_bytes)
+     resampled_waveform = mock_client.resample_and_clear()
+     assert mock_client.buffer == bytearray()
+     assert mock_client.vad_analyse(resampled_waveform) is True
+     silent_bytes = wave.open(location + "/silence.wav", "rb").readframes(-1)
+     mock_client.add_bytes(silent_bytes)
+     resampled_waveform = mock_client.resample_and_clear()
+     assert mock_client.buffer == bytearray()
+     assert mock_client.vad_analyse(resampled_waveform) is False
backend/tests/test_main.py ADDED
@@ -0,0 +1,90 @@
+ from fastapi import FastAPI
+ import pytest
+ from unittest.mock import AsyncMock, MagicMock, ANY
+ import socketio
+
+ import os
+ import sys
+
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ parent_dir = os.path.dirname(current_dir)
+ sys.path.append(parent_dir)
+
+ from Client import Client
+ from main import sio, connect, disconnect, target_language, call_user, answer_call, incoming_audio, clients, rooms
+ from unittest.mock import patch
+
+ sio = socketio.AsyncServer(
+     async_mode="asgi",
+     cors_allowed_origins="*",
+     # engineio_logger=logger,
+ )
+ # sio.logger.setLevel(logging.DEBUG)
+ socketio_app = socketio.ASGIApp(sio)
+
+ app = FastAPI()
+ app.mount("/", socketio_app)
+
+ @pytest.fixture(autouse=True)
+ def setup_clients_and_rooms():
+     global clients, rooms
+     clients.clear()
+     rooms.clear()
+     yield
+
+ @pytest.fixture
+ def mock_client():
+     client = Client("test_sid", "test_client_id", original_sr=44100)
+     return client
+
+
+ @pytest.mark.asyncio
+ async def test_connect(mock_client):
+     sid = mock_client.sid
+     environ = {'QUERY_STRING': 'client_id=test_client_id'}
+     await connect(sid, environ)
+     assert sid in clients
+
+ @pytest.mark.asyncio
+ async def test_disconnect(mock_client):
+     sid = mock_client.sid
+     clients[sid] = mock_client
+     await disconnect(sid)
+     assert sid not in clients
+
+ @pytest.mark.asyncio
+ async def test_target_language(mock_client):
+     sid = mock_client.sid
+     clients[sid] = mock_client
+     target_lang = "fr"
+     await target_language(sid, target_lang)
+     assert clients[sid].target_language == "fr"
+
+ # PM - issues with socketio enter_room in these tests
+ # @pytest.mark.asyncio
+ # async def test_call_user(mock_client):
+ #     sid = mock_client.sid
+ #     clients[sid] = mock_client
+ #     call_id = "1234"
+ #     await call_user(sid, call_id)
+ #     assert call_id in rooms
+ #     assert sid in rooms[call_id]
+
+ # @pytest.mark.asyncio
+ # async def test_answer_call(mock_client):
+ #     sid = mock_client.sid
+ #     clients[sid] = mock_client
+ #     call_id = "1234"
+ #     await answer_call(sid, call_id)
+ #     assert call_id in rooms
+ #     assert sid in rooms[call_id]
+
+ @pytest.mark.asyncio
+ async def test_incoming_audio(mock_client):
+     sid = mock_client.sid
+     clients[sid] = mock_client
+     data = b"\x01"
+     call_id = "1234"
+     await incoming_audio(sid, data, call_id)
+     assert clients[sid].get_length() != 0
+
backend/utils/__pycache__/text_rank.cpython-310.pyc ADDED
Binary file (2.07 kB). View file
 
backend/utils/text_rank.py ADDED
@@ -0,0 +1,60 @@
+ import spacy
+ import pytextrank
+ from spacy.tokens import Span
+
+ # Define scrubber for reducing ranked phrases to their lemma, collapsing
+ # plural/singular variants of the same term
+ @spacy.registry.misc("plural_scrubber")
+ def plural_scrubber():
+     def scrubber_func(span: Span) -> str:
+         return span.lemma_
+     return scrubber_func
+
+
+ def model_selector(target_language: str):
+
+     # Load subset of non-English models
+     language_model = {
+         "spa": "es_core_news_sm",
+         "fra": "fr_core_news_sm",
+         "pol": "pl_core_news_sm",
+         "deu": "de_core_news_sm",
+         "ita": "it_core_news_sm",
+         "por": "pt_core_news_sm",
+         "nld": "nl_core_news_sm",
+         "fin": "fi_core_news_sm",
+         "ron": "ro_core_news_sm",
+         "rus": "ru_core_news_sm"
+     }
+
+     try:
+         nlp = spacy.load(language_model[target_language])
+
+     except KeyError:
+         # Fall back to a spaCy English model
+         nlp = spacy.load("en_core_web_lg")
+
+     # Add TextRank component to pipeline with stopwords
+     nlp.add_pipe("textrank", config={
+         "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
+         "scrubber": {"@misc": "plural_scrubber"}})
+
+     return nlp
+
+
+ def extract_terms(text, target_language, length):
+     nlp = model_selector(target_language)
+
+     # Perform fact extraction on overall summary and segment summaries
+     doc = nlp(text)
+
+     if length < 100:
+         # Get single most used key term
+         phrases = {phrase.text for phrase in doc._.phrases[:1]}
+     elif length <= 300:
+         # Create unique set from top 2 ranked phrases
+         phrases = {phrase.text for phrase in doc._.phrases[:2]}
+     else:
+         # Create unique set from top 3 ranked phrases
+         phrases = {phrase.text for phrase in doc._.phrases[:3]}
+
+     return list(phrases)
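An illustrative call, with made-up output (actual phrases depend on the transcript and the installed spaCy models):

    transcript = "..."  # a monolingual call transcript, ~250 characters
    terms = extract_terms(transcript, target_language="eng", length=len(transcript))
    print(terms)  # e.g. ['transcript', 'call summary'] - top 2 phrases since 100 <= length <= 300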