AhmadMustafa committed on
Commit 9f1e459 · 1 Parent(s): adf4e91

update: show crops

Files changed (1)
  1. app.py +102 -249
app.py CHANGED
@@ -3,9 +3,21 @@ from typing import Generator, List
 
  import gradio as gr
  from openai import OpenAI
+
+ from crop_utils import get_image_crop
+ from prompts import (
+     get_chat_system_prompt,
+     get_live_event_system_prompt,
+     get_live_event_user_prompt,
+     get_street_interview_prompt,
+     get_street_interview_system_prompt,
+ )
  from transcript import TranscriptProcessor
- from utils import css, get_transcript_for_url, head, setup_openai_key
+ from utils import css, get_transcript_for_url, head
  from utils import openai_tools as tools
+ from utils import setup_openai_key
+
+ client = OpenAI()
 
 
  def get_initial_analysis(
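The refactor constructs the OpenAI client once at module level. `OpenAI()` with no arguments reads `OPENAI_API_KEY` from the environment, so the `setup_openai_key` helper imported above (defined in utils.py, which this commit does not touch) presumably makes the key available first. A minimal sketch of that assumption:

```python
# Hypothetical sketch of utils.setup_openai_key; the real helper lives in
# utils.py and is not shown in this diff. The only point illustrated here is
# that OPENAI_API_KEY must be set before the module-level OpenAI() call runs.
import os


def setup_openai_key() -> None:
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY is not set")
```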
@@ -21,173 +33,23 @@ def get_initial_analysis(
      else:
          link_start = "https"
      if ct == "si": # street interview
-         prompt = f"""This is a transcript for a street interview. Call Details are as follows:
- User ID UID: {uid}
- RSID: {rsid}
- Transcript: {transcript}
-
- Your task is to analyze this street interview transcript and identify the final/best timestamps for each topic or question discussed. Here are the key rules:
- The user might repeat the answer to the question sometimes, you need to pick the very last answer intelligently
-
- 1. For any topic/answer that appears multiple times in the transcript (even partially):
- - The LAST occurrence is always considered the best version. If the same thing is said multiple times, the last time is the best, all previous times are considered as additional takes.
- - This includes cases where parts of an answer are scattered throughout the transcript
- - Even slight variations of the same answer should be tracked
- - List timestamps for ALL takes, with the final take highlighted as the best answer
-
- 2. Introduction handling:
- - Question 1 is ALWAYS the speaker's introduction/self-introduction
- - If someone introduces themselves multiple times, use the last introduction as best answer
- - Include all variations of how they state their name/background
- - List ALL introduction timestamps chronologically
-
- 3. Question sequence:
- - After the introduction, list questions in the order they were first asked
- - If a question or introduction is revisited later at any point, please use the later timestamp
- - Track partial answers to the same question across the transcript
-
- You need to make sure that any words that are repeated, you need to pick the last of them.
-
- Return format:
-
- [Question Title]
- Total takes: [X] (Include ONLY if content appears more than once)
- - [Take 1. <div id='topic' style="display: inline"> 15s at 12:30 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{765}}&uid={{uid}})
- - [Take 2. <div id='topic' style="display: inline"> 30s at 14:45 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{885}}&et={{915}}&uid={{uid}})
- ...
- - [Take X (Best) <div id='topic' style="display: inline"> 1m 10s at 16:20 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{980}}&et={{1050}}&uid={{uid}})
-
- URL formatting:
- - Convert timestamps to seconds (e.g., 10:13 → 613)
- - Format: {link_start}://[origin]/colab/[cid]/[rsid]?st=[start_seconds]&et=[end_seconds]&uid=[unique_id]
- - Parameters after RSID must start with ? and subsequent parameters use &
-
- Example:
- 1. Introduction
- Total takes: 2
- - [Take 1. <div id='topic' style="display: inline"> 22s at 12:30 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{772}}&uid={{uid}})
- - [Take 2. <div id='topic' style="display: inline"> 43s at 14:45 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{885}}&et={{928}}&uid={{uid}})
- 3 [Take 3. (Best) <div id='topic' style="display: inline"> 58s at 16:20 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{980}}&et={{1038}}&uid={{uid}})
- """
+         user_prompt = get_street_interview_prompt(transcript, uid, rsid, link_start)
+         system_prompt = get_street_interview_system_prompt(cid, rsid, origin, ct)
          completion = client.chat.completions.create(
              model="gpt-4o",
              messages=[
-                 {
-                     "role": "system",
-                     "content": f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, Call Type: {ct}.
- CORE REQUIREMENT:
- - TIMESTAMPS: A speaker can repeat the answer to a question multiple times. You need to pick the last answer very carefully and choose that as best take. Make sure that that same answer is not repeated again after the best answer.
-
- YOU SHOULD Prioritize accuracy in timestamp at every cost. Read the Transcript carefully and decide where an answer starts and ends. You will have speaker labels so you need to be very sharp.""",
-                 },
-                 {"role": "user", "content": prompt},
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
              ],
              stream=True,
              temperature=0.1,
          )
      else:
-         system_prompt = f"""You are a helpful assistant developed by Roll.AI(Leading AI tool for Remote production) who is analyzing the transcript for a RollAI Call. Following are the details:
- - Call ID: {cid}
- - Session ID: {rsid}
- - Origin: {origin}
- - Call Type: {ct}
- - Speakers: {", ".join(speaker_mapping.values())}
- - Diarized Transcript: {transcript}
-
-
- You are tasked with creating social media clips from the transcript, You need to shortlist the atleast two short clips for EACH SPEAKER. There are some requirments:
-
- CORE REQUIREMENTS:
- 1. SPEAKER Overlap in the CLIP: When specifying the duration for the script, make sure that in that duration:
- - There is only continuous dialogue from that speaker.
- - As soon as another speaker starts talking or the topic ends, the clip MUST end.
-
- 2. DURATION RULES:
- - Each clip must be between 20 seconds to 120 seconds.
-
- 3. SPEAKER COVERAGE:
- - Minimum 2 topics per speaker, aim for 3 if good content exists
-
- CRITICAL: When analyzing timestamps, you must verify that in the duration specified:
- 1. No other speaker talks during the selected timeframe
- 2. The speaker talks continuously for at least 20 seconds
- 3. The clip ends BEFORE any interruption or speaker change
- """
-         # start_end_sentence_prompt = f"""Given a transcript with speakers {" , ".join(speaker_mapping.values())}, analyze the content and identify segments that would make compelling social media clips. For each speaker, find complete topics that meet the following criteria:
-
-         # Key Requirements:
-         # 1. Speaker Isolation
-         # - Each clip must contain only ONE speaker
-         # - No interruptions from other speakers allowed within the clip
-         # - Once another speaker interrupts, the previous speaker's clip must end
-
-         # 2. Duration Guidelines
-         # - Minimum: 20 seconds of continuous speech
-         # - Maximum: 100 seconds
-         # - Must capture complete thoughts/topics
-
-         # 3. Content Selection
-         # - Focus on interesting or noteworthy content
-         # - Topics should be self-contained and coherent
-         # - Must include both the starting and ending sentences that bound the topic
-         # - You can do 2 or 3 topics per speaker if there is more content for that speaker.
-
-         # Expected Output Format:
-         # ```json
-         # {{
-         # "Speaker_Name": [
-         # {{
-         # "Topic_Title": "<descriptive title of the topic>",
-         # "Starting_Sentence": "<exact first sentence of the topic>",
-         # "Ending_Sentence": "<exact last sentence before any interruption or topic change>"
-         # }},
-         # // Additional topics for this speaker...
-         # ],
-         # // Additional speakers...
-         # }}
-
-         # Example:
-         # If a transcript contains:
-         # [10:13] Speaker1: "First sentence..."
-         # [10:20] Speaker1: "nth sentence..."
-         # [10:17] Speaker2: "Interruption..."
-         # [10:19] Speaker1: "nth+1 sentence..."
-
-         # The valid ending sentence for Speaker1 would only include the first n sentences, ending before Speaker2's interruption.
-
-         # Important:
-         # - Ensure each clip represents a single, uninterrupted segment from one speaker
-         # - Include only complete thoughts/statements
-         # - Verify that no other speakers appear between the selected start and end sentences
-         # """
-
-         # sentence_finding_completion = client.chat.completions.create(
-         # model="gpt-4o",
-         # messages=[
-         # {"role": "system", "content": start_end_sentence_prompt},
-         # ],
-         # stream=False,
-         # temperature=0.2,
-         # )
-         # sentence_finding = sentence_finding_completion.choices[0].message.content
-         # sentence_finding_json = sentence_finding[
-         # sentence_finding.find("{") : sentence_finding.rfind("}") + 1
-         # ]
-
-         user_prompt = f"""User ID: {uid}
-
- Your task is to find the starting time, ending time, and the duration for the each topic in the above Short Listed Topics. You need to return the answer in the following format.
- Please make sure that in the duration of 1 speaker, there is no segment of any other speaker. The shortlisted duration must be of a single speaker
-
- Return Format requirements:
- SPEAKER FORMAT:
- **Speaker Name**
- 1. [Topic title <div id='topic' style="display: inline"> 22s at 12:30 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{772}}&uid={{uid}})
- 2. [Topic title <div id='topic' style="display: inline"> 43s at 14:45 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{885}}&et={{928}}&uid={{uid}})
- 3. [Topic title <div id='topic' style="display: inline"> 58s at 16:20 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{980}}&et={{1038}}&uid={{uid}})
- **Speaker Name**
- ....
- """
+         system_prompt = get_live_event_system_prompt(
+             cid, rsid, origin, ct, speaker_mapping, transcript
+         )
+         user_prompt = get_live_event_user_prompt(uid, link_start)
+
          completion = client.chat.completions.create(
              model="gpt-4o",
              messages=[
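The long prompt strings removed in this hunk now come from a separate prompts module that is not shown in this diff. A minimal sketch of the signatures implied by the call sites above, assuming each function simply returns the corresponding prompt string:

```python
# Hypothetical signatures for prompts.py, inferred from the call sites in this
# hunk; the real module is added elsewhere in the repo and may differ.
def get_street_interview_prompt(transcript, uid, rsid, link_start) -> str:
    """Build the street-interview user prompt (the f-string deleted above)."""
    ...


def get_street_interview_system_prompt(cid, rsid, origin, ct) -> str:
    """Build the street-interview system prompt."""
    ...


def get_live_event_system_prompt(cid, rsid, origin, ct, speaker_mapping, transcript) -> str:
    """Build the live-event system prompt (speakers, diarized transcript, clip rules)."""
    ...


def get_live_event_user_prompt(uid, link_start) -> str:
    """Build the live-event user prompt with the return-format and URL rules."""
    ...
```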
@@ -231,51 +93,26 @@ def chat(
      else:
          link_start = "https"
      speaker_mapping = transcript_processor.speaker_mapping
-     prompt = f"""You are a helpful assistant analyzing transcripts and generating timestamps and URL. The user will ask you questions regarding the social media clips from the transcript.
- Call ID is {cid},
- Session ID is {rsid},
- origin is {origin},
- Call Type is {ct}.
- Speakers: {", ".join(speaker_mapping.values())}
- Transcript: {transcript_processor.get_transcript()}
-
- If a user asks timestamps for a specific topic or things, find the start time and end time of that specific topic and return answer in the format:
- Answers and URLs should be formated as follows:
- [Topic title <div id='topic' style="display: inline"> 22s at 12:30 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{772}}&uid={{uid}})
- For Example:
- If the start time is 10:13 and end time is 10:18, the url will be:
- {link_start}://roll.ai/colab/1234aq_12314/51234151?st=613&et=618&uid=82314
- In the URL, make sure that after RSID there is ? and then rest of the fields are added via &.
- You can include multiple links here that can related to the user answer. ALWAYS ANSWER FROM THE TRANSCRIPT.
- RULE: When selecting timestamps for the answer, always use the **starting time (XX:YY)** as the reference point for your response, with the duration (Z seconds) calculated from this starting time, not the ending time of the segment.
-
- Example 1:
- User: Suggest me some clips that can go viral on Instagram.
- Response:
- 1. [Clip 1 <div id='topic' style="display: inline"> 22s at 12:30 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{772}}&uid={{uid}})
- User: Give me the URL where each person has introduced themselves.
- 2. [Clip 2 <div id='topic' style="display: inline"> 10s at 10:00 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{600}}&et={{610}}&uid={{uid}})
-
- Example 2:
- Provide the exact timestamp where the person begins their introduction, typically starting with phrases like "Hi," "Hello," "I am," or "My name is," and include the full introduction, covering everything they say about themselves, including their name, role, background, current responsibilities, organization, and any additional details they provide about their work or personal interests.
- 1. [Person Name1 <div id='topic' style="display: inline"> 43s at 14:45 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{885}}&et={{928}}&uid={{uid}})
- 2. [Person Name2 <div id='topic' style="display: inline"> 58s at 16:20 </div>]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{980}}&et={{1038}}&uid={{uid}})
- ....
-
- If the user provides a link to the agenda, use the correct_speaker_name_with_url function to correct the speaker names based on the agenda.
- If the user provides the correct call type, use the correct_call_type function to correct the call type. Call Type for street interviews is 'si'.
- """
-     messages = [{"role": "system", "content": prompt}]
+     system_prompt = get_chat_system_prompt(
+         cid=cid,
+         rsid=rsid,
+         origin=origin,
+         ct=ct,
+         speaker_mapping=speaker_mapping,
+         transcript=transcript_processor.get_transcript(),
+         link_start=link_start,
+     )
+
+     messages = [{"role": "system", "content": system_prompt}]
 
      for user_msg, assistant_msg in chat_history:
-         if user_msg is not None: # Skip the initial message where user_msg is None
+         if user_msg is not None:
              messages.append({"role": "user", "content": user_msg})
          if assistant_msg is not None:
              messages.append({"role": "assistant", "content": assistant_msg})
 
      # Add the current message
      messages.append({"role": "user", "content": message})
-
      completion = client.chat.completions.create(
          model="gpt-4o",
          messages=messages,
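The deleted chat prompt (now produced by get_chat_system_prompt) tells the model to convert MM:SS timestamps to seconds and attach them as st/et query parameters after the RSID. A small worked example of that rule, using the sample values from the prompt itself:

```python
# Worked example of the URL rule described in the prompt: 10:13 -> 613 seconds.
# The origin, call ID, session ID and uid below are the prompt's own sample
# values, not real data.
def to_seconds(ts: str) -> int:
    minutes, seconds = ts.split(":")
    return int(minutes) * 60 + int(seconds)


st, et = to_seconds("10:13"), to_seconds("10:18")  # 613, 618
url = f"https://roll.ai/colab/1234aq_12314/51234151?st={st}&et={et}&uid=82314"
```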
@@ -298,6 +135,22 @@ If the user provides the correct call type, use the correct_call_type function t
 
      if response.choices[0].message.tool_calls:
          tool_call = response.choices[0].message.tool_calls[0]
+         if tool_call.function.name == "get_image":
+             # Return the image directly in the chat
+             image_data = get_image_crop(cid, rsid, uid)
+
+             messages.append(response.choices[0].message)
+             function_call_result_message = {
+                 "role": "tool",
+                 "content": "Here are the Image Crops",
+                 "name": tool_call.function.name,
+                 "tool_call_id": tool_call.id,
+             }
+             messages.append(function_call_result_message)
+
+             yield image_data
+             return
+
          if tool_call.function.name == "correct_speaker_name_with_url":
              args = eval(tool_call.function.arguments)
              url = args.get("url", None)
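The new branch above runs when the model calls a get_image tool. The tool itself is declared in utils.openai_tools, which this commit does not show; a plausible minimal entry, assuming a function tool that takes no arguments:

```python
# Hypothetical "get_image" entry for the tools list passed to the API; the real
# definition lives in utils.py and is not part of this commit.
get_image_tool = {
    "type": "function",
    "function": {
        "name": "get_image",
        "description": "Return cropped speaker images for the current call and session.",
        "parameters": {"type": "object", "properties": {}, "required": []},
    },
}
```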
@@ -394,6 +247,7 @@ def create_chat_interface():
              show_share_button=False,
              show_copy_all_button=False,
              show_copy_button=False,
+             render=True,
          )
          msg = gr.Textbox(elem_id="chatbot_textbox", show_label=False)
          transcript_processor_state = gr.State() # maintain state of imp things
@@ -467,7 +321,6 @@ def create_chat_interface():
              for param in required_params
              if request.query_params.get(param) is None
          ]
-         print("Missing Params", missing_params)
 
          if missing_params:
              error_message = (
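The next hunk wraps transcript loading in a try/except. It assumes the turl query parameter can hold several transcript URLs separated by commas, and that pnames encodes person names with underscores in place of spaces; a small illustration with placeholder values:

```python
# Placeholder values illustrating the query-parameter parsing in the hunk
# below; real values arrive via the Gradio request's query string.
turl = "https://example.com/t1.json,https://example.com/t2.json"
pnames = "Jane_Doe,John_Smith"

turls = turl.split(",")
pnames = [pname.replace("_", " ") for pname in pnames.split(",")]
# turls -> two transcript URLs; pnames -> ["Jane Doe", "John Smith"]
```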
@@ -480,47 +333,46 @@ def create_chat_interface():
          # split turls based on ,
          turls = turl.split(",")
          pnames = [pname.replace("_", " ") for pname in pnames.split(",")]
-         print(pnames)
-
-         # try:
-
-         if turls:
-             transcript_data = []
-             for turl in turls:
-                 print("Getting Transcript for URL")
-                 transcript_data.append(get_transcript_for_url(turl))
-             print("Now creating Processor")
-             transcript_processor = TranscriptProcessor(
-                 transcript_data=transcript_data,
-                 call_type=ct,
-                 person_names=pnames,
-             )
-
-         else:
-             transcript_data = get_transcript_for_url(turl)
-             transcript_processor = TranscriptProcessor(
-                 transcript_data=transcript_data, call_type=ct
-             )
-
-         # Initialize with empty message
-         chatbot_value = [(None, "")]
-
-         # Return initial values with the transcript processor
-         return [
-             chatbot_value,
-             transcript_processor,
-             cid,
-             rsid,
-             origin,
-             ct,
-             turl,
-             uid,
-         ]
-         # except Exception as e:
-         # print(e)
-         # error_message = f"Error processing call_id {cid}: {str(e)}"
-         # chatbot_value = [(None, error_message)]
-         # return [chatbot_value, None, None, None, None, None, None, None]
+
+         try:
+
+             if turls:
+                 transcript_data = []
+                 for turl in turls:
+                     print("Getting Transcript for URL")
+                     transcript_data.append(get_transcript_for_url(turl))
+                 print("Now creating Processor")
+                 transcript_processor = TranscriptProcessor(
+                     transcript_data=transcript_data,
+                     call_type=ct,
+                     person_names=pnames,
+                 )
+
+             else:
+                 transcript_data = get_transcript_for_url(turl)
+                 transcript_processor = TranscriptProcessor(
+                     transcript_data=transcript_data, call_type=ct
+                 )
+
+             # Initialize with empty message
+             chatbot_value = [(None, "")]
+
+             # Return initial values with the transcript processor
+             return [
+                 chatbot_value,
+                 transcript_processor,
+                 cid,
+                 rsid,
+                 origin,
+                 ct,
+                 turl,
+                 uid,
+             ]
+         except Exception as e:
+             print(e)
+             error_message = f"Error processing call_id {cid}: {str(e)}"
+             chatbot_value = [(None, error_message)]
+             return [chatbot_value, None, None, None, None, None, None, None]
 
      def display_processing_message(chatbot_value):
          """Display the processing message while maintaining state."""
@@ -566,19 +418,20 @@ def create_chat_interface():
              display_processing_message,
              inputs=[chatbot],
              outputs=[chatbot],
-         ).then(
-             stream_initial_analysis,
-             inputs=[
-                 chatbot,
-                 transcript_processor_state,
-                 call_id_state,
-                 colab_id_state,
-                 origin_state,
-                 ct_state,
-                 uid_state,
-             ],
-             outputs=[chatbot],
          )
+         # .then(
+         # stream_initial_analysis,
+         # inputs=[
+         # chatbot,
+         # transcript_processor_state,
+         # call_id_state,
+         # colab_id_state,
+         # origin_state,
+         # ct_state,
+         # uid_state,
+         # ],
+         # outputs=[chatbot],
+         # )
      return demo
 
 
437