## Dirty one-file implementation for experimental (and fun) purposes only
import os

import gradio as gr
import requests
import torch
from dotenv import load_dotenv
from gradio_client import Client  # used by the commented-out SeamlessM4T client path
from pydub import AudioSegment
from pyannote.audio import Pipeline
print("starting")
load_dotenv()
HF_API = os.getenv("HF_API")  # Hugging Face access token
SEAMLESS_API_URL = os.getenv("SEAMLESS_API_URL")  # URL of a SeamlessM4T API endpoint
GPU_AVAILABLE = os.getenv("GPU_AVAILABLE")  # torch device string, e.g. "cuda:0" or "cpu"
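# A minimal example .env for the variables above (values are placeholders):
#   HF_API=hf_xxxxxxxxxxxxxxxxxxxxxxxx
#   SEAMLESS_API_URL=https://your-seamless-space.hf.space
#   GPU_AVAILABLE=cuda:0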
DEFAULT_TARGET_LANGUAGE = "French"
MISTRAL_SUMMARY_URL = (
    "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
)
LLAMA_SUMMARY_URL = (
    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
)
print("env setup ok")
DESCRIPTION = """
# Transcribe and create a summary of a conversation.
"""
DUPLICATE = """
To duplicate this repo, you must request access to three repositories and accept all their user conditions:
1- https://huggingface.co/pyannote/voice-activity-detection
2- https://hf.co/pyannote/segmentation
3- https://hf.co/pyannote/speaker-diarization
"""
# initialize the speaker-diarization pipeline
diarizer = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1", use_auth_token=HF_API
)
# send the pipeline to GPU (when available)
if GPU_AVAILABLE:
    diarizer.to(torch.device(GPU_AVAILABLE))
print("diarizer setup ok")
# predict is a generator that incrementally yields recognized text with speaker labels
def predict(target_language, input_audio):
    print("->predict started")
    print(target_language, type(input_audio), input_audio)
    print("-->diarization")
    diarized = diarizer(input_audio, min_speakers=2, max_speakers=5)
    print("-->automatic speech recognition")
    # split the audio according to the diarization
    song = AudioSegment.from_wav(input_audio)
    # client = Client(SEAMLESS_API_URL, hf_token=HF_API, serialize=False)
    output_text = ""
    for turn, _, speaker in diarized.itertracks(yield_label=True):
        print(speaker, turn)
        try:
            filename = f"{turn.start}_segment.wav"
            # pydub slices in milliseconds; turn boundaries are in seconds
            clipped = song[turn.start * 1000 : turn.end * 1000]
            # resample to 16 kHz, the rate the ASR models expect
            clipped = clipped.set_frame_rate(16000)
            clipped.export(filename, format="wav")
            # result = client.predict(f"my.wav", target_language, api_name="/asr")
            result = automatic_speech_recognition(target_language, filename)
            current_text = f"speaker: {speaker} text: {result} "
            print(current_text)
            output_text = output_text + "\n" + current_text
            yield output_text
        except Exception as e:
            print(e)
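# A quick way to exercise the generator outside Gradio (a sketch; assumes
# sample.wav is a WAV conversation with at least two speakers):
#   for partial_transcript in predict("French", "sample.wav"):
#       print(partial_transcript)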
def automatic_speech_recognition(language, filename):
    # pick a language-specific ASR model on the HF Inference API
    match language:
        case "French":
            api_url = "https://api-inference.huggingface.co/models/bofenghuang/whisper-large-v3-french"
        case "English":
            api_url = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"
        case _:
            return f"Unknown language {language}"
    print(f"-> automatic_speech_recognition with {api_url}")
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(
        api_url, headers={"Authorization": f"Bearer {HF_API}"}, data=data
    )
    print(response.json())
    # ASR responses are of the form {"text": "..."}
    return response.json()["text"]
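# Standalone check of the ASR helper (a sketch; assumes sample.wav exists and
# HF_API is a valid Hugging Face token):
#   print(automatic_speech_recognition("English", "sample.wav"))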
def generate_summary_llama3(language, transcript):
    # Llama 3 instruct format: a single <|begin_of_text|>, then alternating headers
    queryTxt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful and truthful patient-doctor encounter summary writer.
Users send you transcripts of patient-doctor encounters and you create accurate and concise summaries.
The summary only contains information from the transcript.
Your summary is written in {language}.
The summary only includes relevant sections.
# Chief Complaint
# History of Present Illness (HPI)
# Relevant Past Medical History
# Physical Examination
# Assessment and Plan
# Follow-up
# Additional Notes
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{transcript}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""
    payload = {
        "inputs": queryTxt,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 1000,  # leave room for a long summary
        },
        # wait_for_model is an API option, not a generation parameter
        "options": {"use_cache": False, "wait_for_model": True},
    }
    response = requests.post(
        LLAMA_SUMMARY_URL, headers={"Authorization": f"Bearer {HF_API}"}, json=payload
    )
    print(response.json())
    # with return_full_text=False, generated_text holds only the completion
    return response.json()[0]["generated_text"]
def generate_summary_mistral(language, transcript):
    # Mistral instruct format: instructions and transcript wrapped in [INST] ... [/INST]
    sysPrompt = f"""[INST]
You are a helpful and truthful patient-doctor encounter summary writer.
Users send you transcripts of patient-doctor encounters and you create accurate and concise summaries.
The summary only contains information from the transcript.
Your summary is written in {language}.
The summary only includes relevant sections.
# Chief Complaint
# History of Present Illness (HPI)
# Relevant Past Medical History
# Physical Examination
# Assessment and Plan
# Follow-up
# Additional Notes
"""
    queryTxt = f"""
{transcript}
[/INST]
"""
    payload = {
        "inputs": sysPrompt + queryTxt,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 1000,  # leave room for a long summary
        },
        # wait_for_model is an API option, not a generation parameter
        "options": {"use_cache": False, "wait_for_model": True},
    }
    response = requests.post(
        MISTRAL_SUMMARY_URL, headers={"Authorization": f"Bearer {HF_API}"}, json=payload
    )
    print(response.json())
    # with return_full_text=False, generated_text holds only the completion
    return response.json()[0]["generated_text"]
def generate_summary(model, language, transcript):
    match model:
        case "Mistral-7B":
            print("-> summarize with mistral")
            return generate_summary_mistral(language, transcript)
        case "LLAMA3":
            print("-> summarize with llama3")
            return generate_summary_llama3(language, transcript)
        case _:
            return f"Unknown model {model}"
# show either the microphone or the file input, resetting the other
def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
    mic = audio_source == "microphone"
    return (
        gr.update(visible=mic, value=None),  # input_audio_mic
        gr.update(visible=not mic, value=None),  # input_audio_file
    )
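# update_audio_ui is not wired into the UI below; with separate mic/file
# components (hypothetical names input_audio_mic and input_audio_file, and a
# Radio named audio_source), it could be hooked up like this:
#   audio_source.change(
#       fn=update_audio_ui,
#       inputs=audio_source,
#       outputs=[input_audio_mic, input_audio_file],
#   )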
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        with gr.Row():
            target_language = gr.Dropdown(
                choices=["French", "English"],
                label="Output Language",
                value=DEFAULT_TARGET_LANGUAGE,
                interactive=True,
                info="Select your target language",
            )
        with gr.Row() as audio_box:
            input_audio = gr.Audio(type="filepath")
        submit = gr.Button("Transcribe")
        transcribe_output = gr.Textbox(
            label="Transcribed Text",
            value="",
            interactive=False,
            lines=10,
            scale=10,
            max_lines=100,
        )
        submit.click(
            fn=predict,
            inputs=[target_language, input_audio],
            outputs=[transcribe_output],
            api_name="predict",
        )
        with gr.Row():
            summary_model = gr.Dropdown(
                choices=["Mistral-7B", "LLAMA3"],
                label="Summary model",
                value="Mistral-7B",
                interactive=True,
                info="Select your summary model",
            )
        summarize = gr.Button("Summarize")
        summary_output = gr.Textbox(
            label="Summarized Text",
            value="",
            interactive=False,
            lines=10,
            scale=10,
            max_lines=100,
        )
        summarize.click(
            fn=generate_summary,
            inputs=[summary_model, target_language, transcribe_output],
            outputs=[summary_output],
            api_name="summarize",  # distinct endpoint name; "predict" is taken above
        )
    gr.Markdown(DUPLICATE)
demo.queue(max_size=50).launch()