Spaces:
Sleeping
Sleeping
rogerxavier
commited on
Create 3mergeDialogToVideo.py
Browse files- 3mergeDialogToVideo.py +272 -0
3mergeDialogToVideo.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# rogerxavier-ocr-with-fastapi.hf.space
|
2 |
+
import os
|
3 |
+
##这个模型目前只适合确定文本框顺序后再识别,因为如果后面的
|
4 |
+
##完整图片处理的反例 现在处理的图片是10\0.jpg
|
5 |
+
# [[[953, 743], [987, 743], [987, 867], [953, 867]], [[917, 745], [951, 745], [951, 867], [917, 867]], [[881, 741], [918, 742], [915, 898], [877, 897]], [[843, 743], [879, 743], [879, 809], [843, 809]], [[629, 1058], [669, 1058], [669, 1210], [629, 1210]], [[549, 1227], [583, 1227], [583, 1381], [549, 1381]], [[535, 115], [563, 115], [563, 145], [535, 145]], [[535, 147], [563, 147], [563, 213], [535, 213]], [[507, 443], [539, 443], [539, 579], [507, 579]], [[505, 115], [533, 115], [533, 197], [505, 197]], [[511, 1225], [547, 1225], [547, 1321], [511, 1321]], [[475, 117], [503, 117], [503, 265], [475, 265]], [[467, 421], [503, 421], [503, 575], [467, 575]], [[419, 235], [447, 235], [447, 337], [419, 337]], [[387, 236], [417, 237], [414, 339], [385, 338]], [[209, 796], [242, 797], [239, 921], [206, 920]], [[175, 173], [205, 173], [205, 225], [175, 225]], [[177, 231], [205, 231], [205, 285], [177, 285]], [[103, 1153], [129, 1153], [129, 1223], [103, 1223]], [[41, 100], [108, 101], [104, 549], [36, 548]]]
|
6 |
+
# ['就算是你', '没有圣剑', '也不可能有', '胜算', '就算如此', '我也不觉得', '做', ':做个', '·就不觉得', '老好人', '你可怕', '也要有个限度', '我很恐怖吗', '该说真是', '无药可救', '说的是呢', '这个', '但是', '为何?', '第二话让人怜爱']
|
7 |
+
|
8 |
+
import requests
|
9 |
+
|
10 |
+
import tempfile
|
11 |
+
import time
|
12 |
+
|
13 |
+
from moviepy.audio.AudioClip import AudioArrayClip
|
14 |
+
from moviepy.editor import *
|
15 |
+
import cv2
|
16 |
+
import azure.cognitiveservices.speech as speechsdk
|
17 |
+
import numpy as np
|
18 |
+
import io
|
19 |
+
import base64
|
20 |
+
import json
|
21 |
+
from io import BytesIO
|
22 |
+
import pandas as pd
|
23 |
+
from PIL import Image
|
24 |
+
|
25 |
+
|
26 |
+
#通过去水印完整漫画图片->获取相应的对话框图片->获取对话框文字->返回对话框文字
|
27 |
+
def get_image_copywrite(image_path: str, dialog_cut_path: str) -> "str | None":
    """Collect the OCR text of every dialog box that belongs to one manga page.

    The dialog-box crops for the page are looked up with
    ``get_associate_dialog`` and each crop is sent to
    ``get_sorted_dialog_text``.  The text fragments of one box are
    concatenated without a separator and each box is terminated by a
    newline, so one box becomes one line of the result.

    Args:
        image_path: Path (including extension) of the watermark-free page.
        dialog_cut_path: Directory that holds the cropped dialog boxes.

    Returns:
        The raw recognised text (an empty string when every box failed to
        be recognised), or ``None`` when the page has no dialog-box crops
        at all, in which case no OCR request is made.
    """
    associate_dialog_img = get_associate_dialog(
        image_path=image_path, dialog_cut_path=dialog_cut_path
    )
    if not associate_dialog_img:
        # Page without any dialog crop: nothing to request.
        return None

    box_lines = []
    for dialog_img_path in associate_dialog_img:
        # Text fragments of a single dialog box, already in reading order.
        cur_dialog_texts = get_sorted_dialog_text(dialog_img_path)
        if cur_dialog_texts is None:
            print(dialog_img_path+"识别是空-可能是有问题")
            continue
        box_lines.append(''.join(cur_dialog_texts) + '\n')

    # Join once instead of growing the string fragment by fragment.
    return ''.join(box_lines)
|
42 |
+
|
43 |
+
#通过传入无水印漫画图片对话框路径,得到关联的对话框图片list
|
44 |
+
def get_associate_dialog(image_path: str, dialog_cut_path: str) -> "list[str]":
    """Find the cropped dialog-box images that belong to one manga page.

    The page's file stem is interpreted as an integer and formatted as a
    three-digit, zero-padded prefix (``6.jpg`` -> ``006``).  Every ``.jpg``
    file below ``dialog_cut_path`` whose name starts with that prefix is
    considered a dialog box of the page.

    Args:
        image_path: Path (including extension) of the page image.
        dialog_cut_path: Directory that is walked recursively for crops.

    Returns:
        Paths of the matching crops, sorted by name within each directory
        so the result does not depend on filesystem enumeration order.
        The list is empty when nothing matches or when the page's file
        stem is not an integer.
    """
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    try:
        image_name_format = '{:03d}'.format(int(image_name))
    except ValueError:
        # A non-numeric page name cannot be mapped to a crop prefix.
        return []

    associated_dialogs = []
    for root, dirs, files in os.walk(dialog_cut_path):
        dirs.sort()  # in-place sort makes os.walk descend deterministically
        associated_dialogs.extend(
            os.path.join(root, file_name)
            for file_name in sorted(files)
            if file_name.startswith(image_name_format) and file_name.endswith('.jpg')
        )

    return associated_dialogs
|
55 |
+
|
56 |
+
|
57 |
+
#通过对话框图片路径,获取对话框文字list
|
58 |
+
def _reading_order(box_coordinates):
    """Return box indices ordered right-to-left, then top-to-bottom.

    Each box is a sequence of corner points; corner 0 and corner 2 are
    treated as opposite corners, so their average is the box centre.
    """
    centers = [
        ((box[0][0] + box[2][0]) / 2, (box[0][1] + box[2][1]) / 2)
        for box in box_coordinates
    ]
    # Primary key: larger x first (right-to-left); secondary: smaller y first.
    order = sorted(range(len(centers)), key=lambda i: (-centers[i][0], centers[i][1]))

    # Single adjacent-pair pass: two neighbours count as the same column when
    # their centres are closer in x than a third of the current box's width.
    # Inside a column the upper box has to come first.  The coordinates are
    # looked up through ``order`` on every step so they never go stale after
    # a swap.
    for pos in range(len(order) - 1):
        current = box_coordinates[order[pos]]
        following = box_coordinates[order[pos + 1]]
        x_gap = centers[order[pos]][0] - centers[order[pos + 1]][0]
        same_column = x_gap < (current[2][0] - current[0][0]) / 3
        if same_column and current[0][1] > following[2][1]:
            order[pos], order[pos + 1] = order[pos + 1], order[pos]
    return order


def get_sorted_dialog_text(image_path: str) -> "list[str] | None":
    """OCR one dialog-box image and return its text fragments in reading order.

    The image is uploaded to the ``/getCoordinates`` endpoint.  The JSON
    reply is read as a two-element array: element 0 is the list of text-box
    corner coordinates, element 1 is a dict whose ``'Text'`` entry maps the
    stringified box index to the recognised text.

    Args:
        image_path: Path (including extension) of the dialog-box image.

    Returns:
        The recognised fragments of this one box, ordered right-to-left and
        top-to-bottom, or ``None`` when the request or the parsing of its
        reply fails.

    Raises:
        OSError: If ``image_path`` cannot be opened for reading.
    """
    # The original author notes the endpoint only works when headers are sent.
    headers = {
        'authority': 'rogerxavier-fastapi-t5-magi.hf.space',
        'scheme': 'https',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'spaces-jwt=eyJhbGciOiJFZERTQSJ9.eyJyZWFkIjp0cnVlLCJwZXJtaXNzaW9ucyI6eyJyZXBvLmNvbnRlbnQucmVhZCI6dHJ1ZX0sIm9uQmVoYWxmT2YiOnsia2luZCI6InVzZXIiLCJfaWQiOiI2NDJhNTNiNTE2ZDRkODI5M2M5YjdiNzgiLCJ1c2VyIjoicm9nZXJ4YXZpZXIifSwiaWF0IjoxNzE2Njg3MzU3LCJzdWIiOiIvc3BhY2VzL3JvZ2VyeGF2aWVyL29jcl93aXRoX2Zhc3RhcGkiLCJleHAiOjE3MTY3NzM3NTcsImlzcyI6Imh0dHBzOi8vaHVnZ2luZ2ZhY2UuY28ifQ._sGdEgC-ijbIhLmB6iNSBQ_xHNzb4Ydb9mD0L3ByRmJSbB9ccfGbRgtNmkV1JLLldHp_VEKUSQt9Mwq_q4aGAQ',
        'Dnt': '1',
        'Priority': 'u=1, i',
        'Sec-Ch-Ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    }

    # The context manager guarantees the image handle is closed again.
    with open(image_path, 'rb') as image_bytes:
        files = {
            "image": image_bytes,
        }
        try:
            resp = requests.post("https://rogerxavier-ocr-with-fastapi.hf.space/getCoordinates", files=files, headers=headers)
            # Decode the body once: [0] is the coordinate list, [1] the
            # dict holding the recognised text per box index.
            payload = resp.json()
            boxCoordinates, boxInfo = payload[0], payload[1]

            sorted_indices = _reading_order(boxCoordinates)
            sorted_coordinates = [boxCoordinates[i] for i in sorted_indices]
            sorted_text = [boxInfo['Text'][str(i)] for i in sorted_indices]

            print(sorted_coordinates)
            print(sorted_text)
            return sorted_text
        except Exception as e:
            # Best effort: any network or parsing problem is reported and
            # signalled to the caller as None.
            print("图片请求出现问题")
            print(e)
            return None
|
111 |
+
|
112 |
+
|
113 |
+
#通过文字获取音频
|
114 |
+
def get_audio_data(text: str) -> "tuple[str, str]":
    """Synthesize ``text`` with Azure Speech and store the audio in a temp file.

    The subscription key and region can be supplied through the
    ``SPEECH_KEY`` and ``SPEECH_REGION`` environment variables; the values
    that were previously hard-coded are used as a fallback.

    Args:
        text: The caption to speak (voice ``zh-CN-YunxiNeural``).

    Returns:
        A pair ``(file_name, duration)``: the path of the temporary file
        holding ``result.audio_data`` and ``str(result.audio_duration)``.
        The file is created with ``delete=False``; the caller has to
        remove it.
    """
    # NOTE(review): this key is committed to the repository; it should be
    # rotated and supplied via the environment only.
    speech_key = os.environ.get("SPEECH_KEY", "5f438c41786144d6b5318317321a2da2")
    service_region = os.environ.get("SPEECH_REGION", "eastus")

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Note: the voice setting will not overwrite the voice element in input SSML.
    speech_config.speech_synthesis_voice_name = "zh-CN-YunxiNeural"  # Yunxi

    # No audio config is passed, so the SDK's default audio output is used.
    # NOTE(review): on a headless host this may be undesirable -- confirm
    # whether audio_config=None should be passed instead.
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    result = speech_synthesizer.speak_text_async(text).get()
    # Report the outcome; a cancelled synthesis is only logged, not raised.
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(text))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # Write into the platform's default temp directory instead of the
    # filesystem root.  Leaving the ``with`` block closes the file, so no
    # explicit close() is needed; delete=False keeps it for the caller.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(result.audio_data)

    return temp_file.name, str(result.audio_duration)
|
145 |
+
|
146 |
+
|
147 |
+
# 补零函数,将数字部分补齐为指定长度
|
148 |
+
def zero_pad(s, length):
    """Left-pad ``s`` with zeros to ``length`` characters using ``zfill``.

    Inputs that are already at least ``length`` characters long come back
    unchanged.
    """
    padded = s.zfill(length)
    return padded
|
150 |
+
|
151 |
+
|
152 |
+
def gpt_polish(text: str) -> "str | None":
    """Turn raw dialog text into a short third-person caption via a chat model.

    The API key can be supplied through the ``GPT_POLISH_API_KEY``
    environment variable; the value that was previously hard-coded is used
    as a fallback.

    Args:
        text: Text extracted from the manga page.  ``None`` or blank text
            is rejected without sending a request.

    Returns:
        The content of the first choice in the chat-completions reply, or
        ``None`` when there is nothing to polish or the request/parsing
        fails.
    """
    if text is None or not text.strip():
        # Nothing was recognised: do not spend a request on empty input.
        print("gpt polish skipped: no text to polish")
        return None

    # NOTE(review): this key is committed to the repository; it should be
    # rotated and supplied via the environment only.
    api_key = os.environ.get(
        "GPT_POLISH_API_KEY", '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'
    )

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json',
    }

    # Chat Completions request data
    data = {
        'model': 'gpt-3.5-turbo',  # Replace with your chosen model
        'messages': [
            {'role': 'system', 'content': "你是一个assistant,能够根据user发送的漫画中提取的文字,生成一个短视频中一帧的三人称文案(1-2句话)"},
            {'role': 'user', 'content': text}
        ]
    }
    try:
        response = requests.post('https://api.yingwu.lol/v1/chat/completions', headers=headers, data=json.dumps(data))
        # Decode the reply once and reuse the extracted caption.
        polished = response.json()['choices'][0]['message']['content']
        print("润色后文案是:"+polished)
        return polished
    except Exception as e:
        # Best effort: any network or reply-shape problem yields None.
        print("gpt润色文案失败:")
        print(e)
        return None
|
179 |
+
def _collect_sorted_images(directory):
    """Return relative paths of all .jpg/.png files below ``directory``.

    The paths are ordered by the digits of their file stem, zero-padded to
    three characters, so that e.g. ``2.jpg`` sorts before ``10.jpg``.
    """
    found = []
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if file_name.endswith((".jpg", ".png")):
                found.append(os.path.relpath(os.path.join(root, file_name)))
    found.sort(
        key=lambda x: zero_pad(''.join(filter(str.isdigit, os.path.splitext(os.path.basename(x))[0])), 3))
    return found


def main():
    """Build ``output_video.mp4`` from the manga pages and their dialog text.

    For every page that has at least one dialog-box crop: read the dialog
    text, polish it into a caption, synthesize the caption as audio and
    create a clip that shows the page with the caption for the duration
    reported for the audio.  All clips are concatenated into one video.
    Every temporary audio file is removed at the end, also on failure.
    """
    # Directory with the watermark-free pages (needed here because the
    # dialog text has to be associated with the original page).
    img_path = 'manga1'
    # Directory with the cropped dialog boxes.
    dialog_img_path = 'manga12'

    image_files = _collect_sorted_images(os.path.join(os.getcwd(), img_path))

    # Video parameters.
    width, height = 1125, 1600
    fps = 30
    font_path = '1.ttf'  # custom font so mixed Chinese/English text renders

    # Only pages for which a dialog-box crop exists are processed.
    image_files = [
        image_path for image_path in image_files
        if get_associate_dialog(image_path, dialog_img_path)
    ]

    video_clips = []
    audio_clips = []
    temp_audio_files = []
    try:
        for image_file in image_files:
            print("现在处理的图片是"+image_file)
            # The full page (not the crop) is what is shown in the video.
            img = cv2.imread(image_file)
            if img is None:
                # cv2.imread returns None when it cannot read the file
                # (the original note says only ASCII paths are supported).
                print("could not read image, skipped: " + image_file)
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Raw dialog text of the page, then polished into the caption.
            cur_copywrite = get_image_copywrite(image_file, dialog_img_path)
            cur_copywrite = gpt_polish(cur_copywrite)
            if cur_copywrite is None:
                continue  # unusable page: skip it

            # Temporary audio file and the caption's duration.
            filename, duration = get_audio_data(cur_copywrite)
            temp_audio_files.append(filename)

            clip = ImageClip(img).set_duration(duration).resize((width, height))

            txt_clip = TextClip(cur_copywrite, fontsize=40, color='white', bg_color='black',
                                font=font_path)
            txt_clip = txt_clip.set_pos(('center', 'bottom')).set_duration(duration)

            audio_clip = AudioFileClip(filename)
            audio_clips.append(audio_clip)
            clip = clip.set_audio(audio_clip)  # attach the narration
            clip = CompositeVideoClip([clip, txt_clip])
            video_clips.append(clip)

        if not video_clips:
            print("no clip could be generated, nothing to write")
            return

        video = concatenate_videoclips(video_clips)
        video.write_videofile('output_video.mp4', fps=fps)
    finally:
        # Release the audio readers first, then delete every temp file that
        # was created (not only the last one).
        for audio_clip in audio_clips:
            audio_clip.close()
        for temp_audio_file in temp_audio_files:
            print("删除临时mp3文件", temp_audio_file)
            try:
                os.remove(temp_audio_file)
            except OSError as e:
                print(e)


if __name__ == '__main__':
    main()
|
270 |
+
|
271 |
+
|
272 |
+
|