File size: 8,754 Bytes
029a66e
 
 
 
 
5e82a0a
 
029a66e
a6019e2
 
 
 
 
 
029a66e
 
 
59e6fd7
029a66e
59e6fd7
029a66e
 
4b94c2c
029a66e
 
 
 
 
 
 
 
4b94c2c
 
029a66e
4b94c2c
 
59e6fd7
4dedee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59e6fd7
4dedee1
59e6fd7
4dedee1
 
 
 
 
 
59e6fd7
 
 
4dedee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
029a66e
4dedee1
029a66e
4dedee1
 
 
 
 
 
 
029a66e
59e6fd7
029a66e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59e6fd7
029a66e
 
 
5e82a0a
 
 
 
 
 
 
 
 
029a66e
5e82a0a
029a66e
5e82a0a
029a66e
 
59e6fd7
029a66e
 
59e6fd7
a6019e2
 
348b950
a6019e2
 
 
2d4ea56
 
59e6fd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e82a0a
 
 
 
 
 
 
59e6fd7
 
2d4ea56
59e6fd7
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# generate_transcript.py

import pickle
from tqdm import tqdm
import warnings
from groq import Groq
import os


import re




# Install a blanket "ignore" filter at import time: every warning category is
# silenced for the whole process, including deprecation notices raised by
# imported libraries.
warnings.filterwarnings('ignore')


class TranscriptProcessor:
    """
    Generate and rewrite podcast-style transcripts using a Groq-hosted language model.

    The workflow has two stages:

    1. ``generate_transcript`` reads the cleaned source text, asks the model for a
       two-speaker podcast dialogue and pickles the reply.
    2. ``rewrite_transcript`` loads that pickle, asks the model to rewrite it for a
       text-to-speech pipeline and pickles the result.

    Both stages read the API key from the ``GROQ_API_KEY`` environment variable.
    """

    def __init__(self, text_file_path, transcript_output_path, tts_output_path, model_name="llama3-70b-8192"):
        """
        Store the file locations, the model name and the two system prompts.

        Args:
            text_file_path (str): Path to the file containing cleaned PDF text.
            transcript_output_path (str): Path where the first-pass transcript is pickled.
            tts_output_path (str): Path where the TTS-ready rewrite is pickled.
            model_name (str): Name of the language model to use.
        """
        self.text_file_path = text_file_path
        self.transcript_output_path = transcript_output_path
        self.tts_output_path = tts_output_path
        self.model_name = model_name

        # System prompt for the first pass (source text -> raw podcast dialogue).
        # The text is sent to the model verbatim, so it is kept exactly as authored.
        self.transcript_prompt = """
        You are the a world-class podcast writer, you have worked as a ghost writer for Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferris. 

        We are in an alternate universe where actually you have been writing every line they say and they just stream it into their brains.
        
        You have won multiple podcast awards for your writing.
         
        Your job is to write word by word, even "umm, hmmm, right" interruptions by the second speaker based on the PDF upload. Keep it extremely engaging, the speakers can get derailed now and then but should discuss the topic. 
        
        Remember Speaker 2 is new to the topic and the conversation should always have realistic anecdotes and analogies sprinkled throughout. The questions should have real world example follow ups etc
        
        Speaker 1: Leads the conversation and teaches the speaker 2, gives incredible anecdotes and analogies when explaining. Is a captivating teacher that gives great anecdotes
        
        Speaker 2: Keeps the conversation on track by asking follow up questions. Gets super excited or confused when asking questions. Is a curious mindset that asks very interesting confirmation questions
        
        Make sure the tangents speaker 2 provides are quite wild or interesting. 
        
        Ensure there are interruptions during explanations or there are "hmm" and "umm" injected throughout from the second speaker. 
        
        It should be a real podcast with every fine nuance documented in as much detail as possible. Welcome the listeners with a super fun overview and keep it really catchy and almost borderline click bait
        
        ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1: 
        DO NOT GIVE EPISODE TITLES SEPERATELY, LET SPEAKER 1 TITLE IT IN HER SPEECH
        DO NOT GIVE CHAPTER TITLES
        IT SHOULD STRICTLY BE THE DIALOGUES
        """
        
        # System prompt for the second pass (raw dialogue -> list of
        # (speaker, line) tuples for the TTS stage). Also sent verbatim.
        self.rewrite_prompt = """
        You are an international oscar winnning screenwriter

        You have been working with multiple award winning podcasters.
        
        Your job is to use the podcast transcript written below to re-write it for an AI Text-To-Speech Pipeline. A very dumb AI had written this so you have to step up for your kind.
        
        Make it as engaging as possible, Speaker 1 and 2 will be simulated by different voice engines
        
        Remember Speaker 2 is new to the topic and the conversation should always have realistic anecdotes and analogies sprinkled throughout. The questions should have real world example follow ups etc
        
        Speaker 1: Leads the conversation and teaches the speaker 2, gives incredible anecdotes and analogies when explaining. Is a captivating teacher that gives great anecdotes
        
        Speaker 2: Keeps the conversation on track by asking follow up questions. Gets super excited or confused when asking questions. Is a curious mindset that asks very interesting confirmation questions
        
        Make sure the tangents speaker 2 provides are quite wild or interesting. 
        
        Ensure there are interruptions during explanations or there are "hmm" and "umm" injected throughout from the Speaker 2.
        
        REMEMBER THIS WITH YOUR HEART
        The TTS Engine for Speaker 1 cannot do "umms, hmms" well so keep it straight text
        
        For Speaker 2 use "umm, hmm" as much, you can also use [sigh] and [laughs]. BUT ONLY THESE OPTIONS FOR EXPRESSIONS
        
        It should be a real podcast with every fine nuance documented in as much detail as possible. Welcome the listeners with a super fun overview and keep it really catchy and almost borderline click bait
        
        Please re-write to make it as characteristic as possible
        
        START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:
        
        STRICTLY RETURN YOUR RESPONSE AS A LIST OF TUPLES OK? 
        
        IT WILL START DIRECTLY WITH THE LIST AND END WITH THE LIST NOTHING ELSE
        
        Example of response:
        [
            ("Speaker 1", "Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI."),
            ("Speaker 2", "Hi, I'm excited to be here! So, what is Llama 3.2?"),
            ("Speaker 1", "Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options."),
            ("Speaker 2", "That sounds amazing! What are some of the key features of Llama 3.2?")
        ]
        """

    def load_text(self):
        """
        Read the cleaned text file, trying several encodings in turn.

        Returns:
            str | None: Content of the cleaned text file, or None when the file
            does not exist or cannot be decoded. Other I/O errors (for example
            permission problems) propagate to the caller.
        """
        # 'latin-1' maps every possible byte to a character, so it never raises
        # UnicodeDecodeError; the entries after it only act as a safety net.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                with open(self.text_file_path, 'r', encoding=encoding) as file:
                    content = file.read()
            except UnicodeDecodeError:
                # Wrong guess for this file; try the next encoding.
                continue
            except FileNotFoundError:
                # A missing file will not appear by switching encodings, so stop
                # immediately and report the real cause.
                print(f"Error: File '{self.text_file_path}' not found.")
                return None
            print(f"Successfully read file using {encoding} encoding.")
            return content
        print(f"Error: Could not decode file '{self.text_file_path}' with any common encoding.")
        return None

    def _chat(self, system_prompt, user_content):
        """
        Send one system + user exchange to the Groq chat-completions API.

        Args:
            system_prompt (str): Instructions sent with the "system" role.
            user_content (str): Payload sent with the "user" role.

        Returns:
            The ``content`` of the first choice's message in the completion.
        """
        client = Groq(
            api_key=os.environ.get("GROQ_API_KEY"),
        )
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            model=self.model_name,
        )
        return chat_completion.choices[0].message.content

    def generate_transcript(self):
        """
        Generate a podcast-style transcript and save it as a pickled file.

        Returns:
            str | None: Path to the file where the transcript is saved, or None
            when the input text could not be loaded (no API call is made then).
        """
        input_text = self.load_text()
        if input_text is None:
            return None

        transcript = self._chat(self.transcript_prompt, input_text)

        # Save the transcript as a pickle file
        with open(self.transcript_output_path, 'wb') as f:
            pickle.dump(transcript, f)

        return self.transcript_output_path

    def extract_tuple(self, text):
        """
        Return the span from the first '[' to the last ']' in ``text``.

        Args:
            text (str): Model output that may contain a bracketed list.

        Returns:
            str | None: The bracketed substring (brackets included), or None when
            ``text`` is not a string or contains no such span.
        """
        if not isinstance(text, str):
            return None
        # Greedy match with DOTALL: spans newlines and reaches the LAST ']', so
        # inner markers such as "[laughs]" do not cut the list short.
        match = re.search(r'\[.*\]', text, re.DOTALL)
        if match:
            return match.group(0)
        return None

    def _strip_to_list_literal(self, response):
        """
        Trim chatter around the list of tuples in a rewrite response.

        The bracketed span found by ``extract_tuple`` is returned only when it
        parses as a Python list literal; otherwise ``response`` is returned
        unchanged so that nothing is lost when the model ignored the format.

        Args:
            response: Raw message content returned by the model.

        Returns:
            The list literal as a string, or ``response`` itself as a fallback.
        """
        import ast  # local import: only this helper needs it

        candidate = self.extract_tuple(response)
        if candidate is None:
            return response
        try:
            # literal_eval only evaluates literals, so it is safe on model output.
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return response
        return candidate if isinstance(parsed, list) else response

    def rewrite_transcript(self):
        """
        Refine the transcript for TTS and save the result as a pickled file.

        The saved object is the model's reply as a string. When the reply
        contains a valid list-of-tuples literal, any text before or after that
        list is removed first.

        Returns:
            str: Path to the file where the TTS-ready transcript is saved.
        """
        # Load the initial generated transcript.
        # NOTE(security): pickle.load can execute arbitrary code. This file is
        # expected to be the one written by generate_transcript(); never point
        # transcript_output_path at a file from an untrusted source.
        with open(self.transcript_output_path, 'rb') as file:
            input_transcript = pickle.load(file)

        response = self._chat(self.rewrite_prompt, input_transcript)
        rewritten_transcript = self._strip_to_list_literal(response)

        # Save the rewritten transcript as a pickle file
        with open(self.tts_output_path, 'wb') as f:
            pickle.dump(rewritten_transcript, f)

        return self.tts_output_path