import streamlit as st
import whisper
import os
import torch
import nltk
from transformers import pipeline, AutoTokenizer
from pydub import AudioSegment
from nltk import sent_tokenize
nltk.download('punkt')

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

def transcribe_audio(audiofile):
    """Transcribe an audio file with Whisper, caching intermediates in session state."""
    st.session_state['audio'] = audiofile
    print(f"audio_file_session_state: {st.session_state['audio']}")

    st.info("Getting size of file")
    # Size of the audio file in megabytes
    audio_size = round(os.path.getsize(st.session_state['audio']) / (1024 * 1024), 1)
    print(f"audio file size: {audio_size} MB")

    # Determine audio duration
    podcast = AudioSegment.from_mp3(st.session_state['audio'])
    st.session_state['audio_segment'] = podcast
    podcast_duration = podcast.duration_seconds
    print(f"audio duration: {podcast_duration} s")

    st.info("Transcribing")
    whisper_model = whisper.load_model("small.en")
    transcription = whisper_model.transcribe(audiofile)
    st.session_state['transcription'] = transcription
    print(f"transcription: {transcription['text']}")
    st.info("Transcription complete")

    return transcription

def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
    """Split text into sentence-aligned chunks that fit the summarizer's input window."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentences = sent_tokenize(text)

    chunks = []
    chunk = ""
    length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))

        if length + sentence_length <= tokenizer.max_len_single_sentence:
            # The sentence still fits: append it to the current chunk
            chunk += sentence + " "
            length += sentence_length
        else:
            # The current chunk is full: save it and start a new one with this sentence
            chunks.append(chunk)
            chunk = sentence + " "
            length = sentence_length

    # Save the final (possibly partial) chunk so the tail of the transcript
    # is never dropped, even when the last sentence starts a new chunk
    if chunk:
        chunks.append(chunk)

    return chunks
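
# Illustrative usage (hypothetical text, not from the app): two short sentences
# fit well under tokenizer.max_len_single_sentence, so they land in one chunk:
#   chunk_and_preprocess_text("Rates rose today. Markets shrugged.")
#   -> ["Rates rose today. Markets shrugged. "]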

def summarize_podcast(audiotranscription):
    """Summarize the transcript chunk by chunk and join the partial summaries."""
    st.info("Summarizing...")
    # Use the GPU when available; device=0 alone crashes on CPU-only hosts
    device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=device)

    st.info("Chunking text")
    text_chunks = chunk_and_preprocess_text(audiotranscription)

    # Length bounds can be tuned, e.g. summarizer(text_chunks, max_length=200, min_length=50)
    summarized_text = summarizer(text_chunks)
    st.session_state['summary'] = summarized_text
    print(f"Summary: {summarized_text}")

    # summarized_text is a list of dicts, each with a 'summary_text' key
    full_summary = ' '.join(item['summary_text'] for item in summarized_text)
    return full_summary

def prepare_text_for_qa(audiotranscription):
    """Split the raw transcript into overlapping Documents for retrieval-based QA."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    # create_documents takes a list of raw strings; split_documents would
    # require Document objects and fails on a plain transcript string
    documents = text_splitter.create_documents([audiotranscription])
    return documents
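
# A minimal sketch of how the currently unused langchain imports (OpenAIEmbeddings,
# Chroma, ChatOpenAI, RetrievalQA) could complete the QA path. This function is
# illustrative and never called by the app: the name answer_question and the chain
# settings are assumptions, and it requires OPENAI_API_KEY to be set.
def answer_question(audiotranscription, question):
    documents = prepare_text_for_qa(audiotranscription)
    # Embed the chunks and index them in an in-memory Chroma vector store
    vectordb = Chroma.from_documents(documents, OpenAIEmbeddings())
    # "stuff" packs all retrieved chunks directly into the prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(temperature=0),
        chain_type="stuff",
        retriever=vectordb.as_retriever(),
    )
    return qa_chain.run(question)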

st.markdown("# Podcast Summarizer")

st.markdown(
        """
        This app helps you digest information-dense podcast episodes by doing the following:
        - Speech-to-text transcription, using the open-source Whisper model (small.en)
        - Summarization of the episode, using philschmid/flan-t5-base-samsum, a model based on Google's FLAN-T5

        - As a proof of concept, this codebase uses the June 14 episode of the Marketplace business news podcast from American Public Media.
        - That file is THE ONLY HARDCODED piece of information used in this application.

        - *HOW TO TEST:* Click the "Process Audio File" button

        """
        )

st.text("Marketplace Episode June 14 2023")
st.audio("marketplace-2023-06-14.mp3") 
if st.button("Process Audio File"):
    podcast_text = transcribe_audio("marketplace-2023-06-14.mp3")

    # Show the raw transcription
    with st.expander("See Transcription"):
        st.caption(podcast_text['text'])

    # Summarize the transcript; the heading is passed as a plain string because
    # an indented heading inside a triple-quoted string renders as a Markdown
    # code block instead of a header
    podcast_summary = summarize_podcast(podcast_text['text'])
    st.markdown("## Summary of Text")
    st.text(podcast_summary)

# if st.button("Summarize Podcast"):
    # with open('transcription.txt', 'r') as file:
        # podcast_text = file.read().rstrip()
    # podcast_summary = summarize_podcast(podcast_text)
    # st.markdown(
        # """
           # ## Summary of Text
        # """
        # )
    # st.text(podcast_summary)