import os

import nltk
import streamlit as st
import torch
import whisper
from langchain.text_splitter import RecursiveCharacterTextSplitter
from nltk import sent_tokenize
from pydub import AudioSegment
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)

nltk.download('punkt')

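# ---------------------------------------------------------------------------
# Processing helpers: transcription, chunking, summarization, and Q&A prep
# ---------------------------------------------------------------------------
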
def transcribe_audio(audiofile):
    """Transcribe an audio file with Whisper and cache the results in session state."""
    st.session_state['audio'] = audiofile
    print(f"audio_file_session_state: {st.session_state['audio']}")

    st.info("Getting size of file")
    # File size in megabytes, rounded to one decimal place.
    audio_size = round(os.path.getsize(st.session_state['audio']) / (1024 * 1024), 1)
    print(f"audio file size: {audio_size} MB")

    # Load the episode with pydub to report its duration.
    podcast = AudioSegment.from_mp3(st.session_state['audio'])
    st.session_state['audio_segment'] = podcast
    podcast_duration = podcast.duration_seconds
    print(f"Audio Duration: {podcast_duration}")

    st.info("Transcribing")
    whisper_model = whisper.load_model("small.en")
    transcription = whisper_model.transcribe(audiofile)
    st.session_state['transcription'] = transcription
    print(f"Transcription: {transcription['text']}")
    st.info("Done Transcription")

    # Persist the transcript so the "Summarize Podcast" button below can reuse it
    # without re-running Whisper.
    with open('transcription.txt', 'w') as file:
        file.write(transcription['text'])

    return transcription


def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
    """Split text into sentence-aligned chunks that fit the summarization model's input length."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentences = sent_tokenize(text)

    length = 0
    chunk = ""
    chunks = []
    count = -1

    for sentence in sentences:
        count += 1
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:
            # The sentence still fits in the current chunk.
            chunk += sentence + " "
            length = combined_length

            # Flush the final chunk once the last sentence has been added.
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())
        else:
            # The current chunk is full; store it and start a new one.
            chunks.append(chunk.strip())
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

            # Make sure a chunk started by the last sentence is not dropped.
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())

    return chunks


def summarize_podcast(audiotranscription):
    """Summarize the transcript chunk by chunk and return the combined summary text."""
    st.info("Summarizing...")
    summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

    st.info("Chunking text")
    text_chunks = chunk_and_preprocess_text(audiotranscription)

    # The pipeline returns one dict per chunk; join the pieces into a single summary string.
    summaries = summarizer(text_chunks)
    summarized_text = " ".join(chunk['summary_text'] for chunk in summaries)

    st.session_state['summary'] = summarized_text
    return summarized_text


def prepare_text_for_qa(audiotranscription):
    """Split the transcript into overlapping documents for retrieval-based Q&A (not wired up yet)."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    # split_documents expects Document objects, so build documents directly from the raw string.
    documents = text_splitter.create_documents([audiotranscription])
    return documents


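# ---------------------------------------------------------------------------
# Streamlit page layout
# ---------------------------------------------------------------------------
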
st.markdown("# Podcast Q&A")

st.markdown(
    """
    This app helps you digest information-dense podcast episodes by doing the following:

    - Transcribes speech to text using the open-source Whisper model
    - Summarizes the episode
    - Lets you ask questions and returns direct quotes from the episode

    - As a proof of concept, this codebase uses the June 14, 2023 episode of the Marketplace business news podcast from NPR.
    - The audio file is the only hardcoded piece of information used in this application.
    """
)

st.text("Marketplace Episode June 14 2023")
st.audio("marketplace-2023-06-14.mp3")

if st.button("Process Audio File"):
    podcast_text = transcribe_audio("marketplace-2023-06-14.mp3")

    with st.expander("See Transcription"):
        st.caption(podcast_text['text'])

    podcast_summary = summarize_podcast(podcast_text['text'])
    st.markdown("## Summary of Text")
    st.text(podcast_summary)

if st.button("Summarize Podcast"):
    # Reuse a previously saved transcript so Whisper does not have to run again.
    with open('transcription.txt', 'r') as file:
        podcast_text = file.read().rstrip()
    podcast_summary = summarize_podcast(podcast_text)
    st.markdown("## Summary of Text")
    st.text(podcast_summary)

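# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py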