import os

import pandas as pd
from pydub import AudioSegment
from pyannote.audio import Pipeline
import whisper
import gradio as gr

# Read the Hugging Face access token from the environment; a real token should
# never be hardcoded in source. Set HF_TOKEN before launching the app.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
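
# Load the Whisper speech-to-text model once at startup so it is not reloaded
# for every segment. "base" is an assumed checkpoint size, not from the
# original file; any size accepted by whisper.load_model() works here.
whisper_model = whisper.load_model("base")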


def diarization(filepath):
    # Diarize the uploaded audio (who speaks when), then transcribe each turn.
    # `filepath` is supplied by the gr.Audio input (type="filepath").
    result = pipeline(filepath)
    speakertime = []
    for turn, _, speaker in result.itertracks(yield_label=True):
        speakertime.append([turn.start, turn.end, speaker])
        print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
    df = pd.DataFrame(speakertime, columns=['start', 'end', 'speaker'])
    # Transcribe each speaker turn from the uploaded file.
    text = []
    for i in range(len(df)):
        text.append(generatetext(filepath, df.start[i], df.end[i]))
    df['text'] = text
    # Write the labelled dialogue to a file and return it for the output box.
    dialogue = "\n".join(f"{df.speaker[i]}: {df.text[i]}" for i in range(len(df)))
    with open('my_file.txt', 'w') as my_file:
        my_file.write(dialogue + '\n')
    return dialogue


def generatetext(filename, starttime, endtime):
    # pydub slices audio in milliseconds, so convert the pyannote timestamps.
    t1 = starttime * 1000
    t2 = endtime * 1000
    newAudio = AudioSegment.from_wav(filename)
    segment = newAudio[t1:t2]
    segment.export('audio.wav', format="wav")
    # Transcribe the extracted segment with Whisper.
    result = whisper_model.transcribe('audio.wav')
    return result.get("text")


block = gr.Blocks()
with block:
    with gr.Group():
        with gr.Box():
            with gr.Row():
                inp_audio = gr.Audio(
                    label="Input Audio",
                    type="filepath",
                )
            outputdialogs = gr.Textbox()
            btn = gr.Button("Generate Text")
    btn.click(diarization, inputs=[inp_audio], outputs=[outputdialogs], api_name="view_api")

block.launch(enable_queue=True, debug=True)
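
# Assumed local usage (not part of the original file): export the token, then
# run the script, e.g.  HF_TOKEN=<your token> python app.py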