Prudvireddy committed
Commit 9b6561b
1 Parent(s): f636163

Update tools.py

Files changed (1)
  1. tools.py +423 -420
tools.py CHANGED
@@ -1,421 +1,424 @@
from langchain.tools import tool, Tool
import re
import os
from langchain_groq import ChatGroq
import requests
import cv2
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
from langchain.pydantic_v1 import BaseModel, Field
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

# from diffusers import StableDiffusionXLPipeline, DPMSolverSinglestepScheduler
# import bitsandbytes as bnb
# import torch.nn as nn
# import torch
import pyttsx3
- import os
+ # from agents import get_agents_and_tasks
# from langchain_google_genai import ChatGoogleGenerativeAI

# from langchain.chat_models import ChatOpenAI
# # llm2 = ChatOpenAI(model='gpt-3.5-turbo')
# # llm3 = ChatOpenAI(model='gpt-3.5-turbo')
# llm1 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048)
# # llm2 = ChatGroq(model='mixtral-8x7b-32768', temperature=0.6, max_tokens=2048, api_key='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')
# llm2 = ChatGoogleGenerativeAI(model='gemini-pro', temperature=0.0)
# # llm2 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key='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')
# # llm3 = ChatGoogleGenerativeAI(model='gemini-pro')
# llm4 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key='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')
- llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=1024, api_key=os.environ.get('GROQ_API_KEY'))
+ # groq_api_key=os.environ.get('GROQ_API_KEY')
+ # llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=1024, api_key=groq_api_key)

# pipe = StableDiffusionXLPipeline.from_pretrained("sd-community/sdxl-flash", torch_dtype=torch.float16).to('cuda')
# pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")

# def quantize_model_to_4bit(model):
#     replacements = []

#     # Collect layers to be replaced
#     for name, module in model.named_modules():
#         if isinstance(module, nn.Linear):
#             replacements.append((name, module))

#     # Replace layers
#     for name, module in replacements:
#         # Split the name to navigate to the parent module
#         *path, last = name.split('.')
#         parent = model
#         for part in path:
#             parent = getattr(parent, part)

#         # Create and assign the quantized layer
#         quantized_layer = bnb.nn.Linear4bit(module.in_features, module.out_features, bias=module.bias is not None)
#         quantized_layer.weight.data = module.weight.data
#         if module.bias is not None:
#             quantized_layer.bias.data = module.bias.data
#         setattr(parent, last, quantized_layer)

#     return model

# pipe.unet = quantize_model_to_4bit(pipe.unet)
# pipe.enable_model_cpu_offload()

+ 
def generate_speech(text, speech_dir='./outputs/audio', lang='en', speed=170, voice='default', num=0):
    """
    Generates speech for given script.
    """
    engine = pyttsx3.init()

    # Set language and voice
    voices = engine.getProperty('voices')
    if voice == 'default':
        voice_id = voices[1].id
    else:
        # Try to find the voice with the given name
        voice_id = None
        for v in voices:
            if voice in v.name:
                voice_id = v.id
                break
        if not voice_id:
            raise ValueError(f"Voice '{voice}' not found.")

    engine.setProperty('voice', voice_id)
    engine.setProperty('rate', speed)
    os.remove(os.path.join(os.path.dirname(os.path.abspath(__file__)), speech_dir, f'speech_{num}.mp3')) if os.path.exists(os.path.join(speech_dir, f'speech_{num}.mp3')) else None
    engine.save_to_file(text, os.path.join(os.path.dirname(os.path.abspath(__file__)), speech_dir, f'speech_{num}.mp3'))
    engine.runAndWait()

# class VideoGeneration(BaseModel):
#     images_dir : str = Field(description='Path to images directory, such as "outputs/images"')
#     speeches_dir : str = Field(description='Path to speeches directory, such as "outputs/speeches"')

# @tool(args_schema=VideoGeneration)
# def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
#     """Creates video using images and audios with zoom-in effect"""
#     images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), images_dir)
#     speeches_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), speeches_dir)

#     images_paths = os.listdir(images_dir)
#     audio_paths = os.listdir(speeches_dir)
#     # print(images_paths, audio_paths)
#     clips = []

#     for i in range(min(len(images_paths), len(audio_paths))):
#         # Load the image
#         img_clip = ImageClip(os.path.join(images_dir, images_paths[i]))

#         # Load the audio file
#         audioclip = AudioFileClip(os.path.join(speeches_dir, audio_paths[i]))

#         # Set the duration of the video clip to the duration of the audio file
#         videoclip = img_clip.set_duration(audioclip.duration)

#         # Apply zoom-in effect to the video clip
#         zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)

#         # Add audio to the zoomed video clip
#         zoomed_clip = zoomed_clip.set_audio(audioclip)

#         clips.append(zoomed_clip)

#     # Concatenate all video clips
#     final_clip = concatenate_videoclips(clips)

#     # Write the result to a file
#     final_clip.write_videofile(os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4"), codec='libx264', fps=24)

#     return os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4")

# def apply_zoom_in_effect(clip, zoom_factor=1.2):
#     width, height = clip.size
#     duration = clip.duration

#     def zoom_in_effect(get_frame, t):
#         frame = get_frame(t)
#         zoom = 1 + (zoom_factor - 1) * (t / duration)
#         new_width, new_height = int(width * zoom), int(height * zoom)
#         resized_frame = cv2.resize(frame, (new_width, new_height))

#         # Calculate the position to crop the frame to the original size
#         x_start = (new_width - width) // 2
#         y_start = (new_height - height) // 2
#         cropped_frame = resized_frame[y_start:y_start + height, x_start:x_start + width]

#         return cropped_frame

#     return clip.fl(zoom_in_effect, apply_to=['mask'])

# Example usage
# image_paths = "outputs/images"
# audio_paths = "outputs/audio"

# video_path = create_video_from_images_and_audio(image_paths, audio_paths)
# print(f"Video created at: {video_path}")


# class ImageGeneration(BaseModel):
#     text : str = Field(description='description of sentence used for image generation')
#     num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')

# class SpeechGeneration(BaseModel):
#     text : str = Field(description='description of sentence used for image generation')
#     num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')

import os
import cv2
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips, VideoFileClip
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from groq import Groq

- client = Groq()
+ 

class VideoGeneration(BaseModel):
    images_dir: str = Field(description='Path to images directory, such as "outputs/images"')
    speeches_dir: str = Field(description='Path to speeches directory, such as "outputs/speeches"')

def split_text_into_chunks(text, chunk_size):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
                      outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.1,
                      font_path=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Montserrat-Bold.ttf')):

    chunks = split_text_into_chunks(text, 3)  # Adjust chunk size as needed

    cap = cv2.VideoCapture(input_video)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    chunk_duration_frames = duration * fps
    delay_frames = int(delay_between_chunks * fps)

    font = ImageFont.truetype(font_path, fontsize)

    current_frame = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        draw = ImageDraw.Draw(frame_pil)

        chunk_index = current_frame // (chunk_duration_frames + delay_frames)

        if current_frame % (chunk_duration_frames + delay_frames) < chunk_duration_frames and chunk_index < len(chunks):
            chunk = chunks[chunk_index]
            text_width, text_height = draw.textsize(chunk, font=font)
            text_x = (width - text_width) // 2
            text_y = height - 400  # Position text at the bottom

            if text_width > width:
                words = chunk.split()
                half = len(words) // 2
                line1 = ' '.join(words[:half])
                line2 = ' '.join(words[half:])

                text_size_line1 = draw.textsize(line1, font=font)
                text_size_line2 = draw.textsize(line2, font=font)
                text_x_line1 = (width - text_size_line1[0]) // 2
                text_x_line2 = (width - text_size_line2[0]) // 2
                text_y = height - 250 - text_size_line1[1]  # Adjust vertical position for two lines

                for dx in range(-outline_thickness, outline_thickness + 1):
                    for dy in range(-outline_thickness, outline_thickness + 1):
                        if dx != 0 or dy != 0:
                            draw.text((text_x_line1 + dx, text_y + dy), line1, font=font, fill=outline_color)
                            draw.text((text_x_line2 + dx, text_y + text_size_line1[1] + dy), line2, font=font, fill=outline_color)

                draw.text((text_x_line1, text_y), line1, font=font, fill=fontcolor)
                draw.text((text_x_line2, text_y + text_size_line1[1]), line2, font=font, fill=fontcolor)

            else:
                for dx in range(-outline_thickness, outline_thickness + 1):
                    for dy in range(-outline_thickness, outline_thickness + 1):
                        if dx != 0 or dy != 0:
                            draw.text((text_x + dx, text_y + dy), chunk, font=font, fill=outline_color)

                draw.text((text_x, text_y), chunk, font=font, fill=fontcolor)

        frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

        out.write(frame)
        current_frame += 1

    cap.release()
    out.release()
    cv2.destroyAllWindows()

def apply_zoom_in_effect(clip, zoom_factor=1.2):
    width, height = clip.size
    duration = clip.duration

    def zoom_in_effect(get_frame, t):
        frame = get_frame(t)
        zoom = 1 + (zoom_factor - 1) * (t / duration)
        new_width, new_height = int(width * zoom), int(height * zoom)
        resized_frame = cv2.resize(frame, (new_width, new_height))

        x_start = (new_width - width) // 2
        y_start = (new_height - height) // 2
        cropped_frame = resized_frame[y_start:y_start + height, x_start:x_start + width]

        return cropped_frame

    return clip.fl(zoom_in_effect, apply_to=['mask'])

@tool(args_schema=VideoGeneration)
def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
    """Creates video using images and audios.
    Args:
        images_dir: path to images folder, example 'outputs/images'
        speeches_dir: path to speeches folder, example 'outputs/speeches'"""
+     client = Groq()
    images_paths = sorted(os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), images_dir)))
    audio_paths = sorted(os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), speeches_dir)))
    clips = []
    temp_files = []

    for i in range(min(len(images_paths), len(audio_paths))):
        img_clip = ImageClip(os.path.join(os.path.dirname(os.path.abspath(__file__)), images_dir, images_paths[i]))
        audioclip = AudioFileClip(os.path.join(os.path.dirname(os.path.abspath(__file__)), speeches_dir, audio_paths[i]))
        videoclip = img_clip.set_duration(audioclip.duration)
        zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)

        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), speeches_dir, audio_paths[i]), "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(audio_paths[i], file.read()),
                model="whisper-large-v3",
                response_format="verbose_json",
            )
            caption = transcription.text

        temp_video_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"outputs/final_video/temp_zoomed_{i}.mp4")
        zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)
        temp_files.append(temp_video_path)

        final_video_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"outputs/final_video/temp_captioned_{i}.mp4")
        add_text_to_video(temp_video_path, final_video_path, caption, duration=1, fontsize=60)
        temp_files.append(final_video_path)

        final_clip = VideoFileClip(final_video_path)
        final_clip = final_clip.set_audio(audioclip)

        clips.append(final_clip)

    final_clip = concatenate_videoclips(clips)
    final_clip.write_videofile(os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4"), codec='libx264', fps=24)

    # Close all video files properly
    for clip in clips:
        clip.close()

    # Remove all temporary files
    for temp_file in temp_files:
        try:
            os.remove(temp_file)
        except Exception as e:
            print(f"Error removing file {temp_file}: {e}")

    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4")

# Example usage
# image_paths = "outputs/images"
# audio_paths = "outputs/speeches"

# video_path = create_video_from_images_and_audio(image_paths, audio_paths)
# print(f"Video created at: {video_path}")

class WikiInputs(BaseModel):
    """Inputs to the wikipedia tool."""
    query: str = Field(description="query to look up in Wikipedia, should be 3 or less words")

api_wrapper = WikipediaAPIWrapper(top_k_results=3)#, doc_content_chars_max=100)

wiki_tool = WikipediaQueryRun(
    name="wiki-tool",
    description="{query:'input here'}",
    args_schema=WikiInputs,
    api_wrapper=api_wrapper,
    return_direct=True,
)

wiki = Tool(
    name = 'wikipedia',
    func = wiki_tool.run,
    description= "{query:'input here'}"
)

# wiki_tool.run("latest news in India")

# @tool
def process_script(script):
    """Used to process the script into dictionary format"""
    dict = {}
    dict['text_for_image_generation'] = re.findall(r'<image>(.*?)</?image>', script)
    dict['text_for_speech_generation'] = re.findall(r'<narration>.*?</?narration>', script)
    return dict

@tool#(args_schema=ImageGeneration)
def image_generator(script):
    """Generates images for the given script.
    Saves it to images_dir and return path
    Args:
        script: a complete script containing narrations and image descriptions"""
    images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/images')
    # if num==1:
    for filename in os.listdir(images_dir):
        file_path = os.path.join(images_dir, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)

    dict = process_script(script)
    for i, text in enumerate(dict['text_for_image_generation']):
        # image = pipe(text, num_inference_steps=12, guidance_scale=2, width=720, height=1280, verbose=0).images[0]
        # image.save(os.path.join(images_dir, f'image{i}.jpg'))
        response = requests.post(
            f"https://api.stability.ai/v2beta/stable-image/generate/core",
            headers={
                "authorization": os.environ.get('STABILITY_AI_API_KEY'),
                "accept": "image/*"
            },
            files={"none": ''},
            data={
                "prompt": text,
                "output_format": "png",
                'aspect_ratio': "9:16",
            },
        )

        if response.status_code == 200:
            with open(os.path.join(images_dir, f'image_{i}.png'), 'wb') as file:
                file.write(response.content)
        else:
            raise Exception(str(response.json()))
    return f'images generated.'#f'image generated for "{text}" and saved to directory {images_dir} as image{num}.jpg'

@tool
def speech_generator(script):
    """Generates speech for given text
    Saves it to speech_dir and return path
    Args:
        script: a complete script containing narrations and image descriptions"""
    speech_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/speeches')

    # if num==1:
    for filename in os.listdir(speech_dir):
        file_path = os.path.join(speech_dir, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)

    dict = process_script(script)
    print(dict)
    for i, text in enumerate(dict['text_for_speech_generation']):
        generate_speech(text, speech_dir, num=i)
    return f'speechs generated.'#f'speech generated for "{text}" and saved to directory {speech_dir} as speech{num}.mp3'
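
A minimal sketch of driving these tools directly, assuming GROQ_API_KEY and STABILITY_AI_API_KEY are set, the outputs/images, outputs/speeches and outputs/final_video directories exist, and a LangChain version whose tools expose .invoke(); the script string is a made-up example in the tag format that process_script() parses:

# hypothetical two-scene script in the <image>/<narration> tag format
script = (
    "<image>A golden sunrise over snow-capped mountains, cinematic</image>"
    "<narration>The sun rises over the mountains.</narration>"
    "<image>A quiet alpine lake at dusk, photorealistic</image>"
    "<narration>By evening, the valley falls silent.</narration>"
)

image_generator.invoke({"script": script})    # writes outputs/images/image_0.png, image_1.png
speech_generator.invoke({"script": script})   # writes outputs/speeches/speech_0.mp3, speech_1.mp3
video_path = create_video_from_images_and_audio.invoke(
    {"images_dir": "outputs/images", "speeches_dir": "outputs/speeches"}
)
print(video_path)                             # absolute path to outputs/final_video/final_video.mp4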