bert-vits2-maolei / re_matching.py
RUSH-miaomi's picture
Upload 22 files
94aaff9
raw
history blame contribute delete
No virus
2.08 kB
import re
def extract_language_and_text_updated(speaker, dialogue):
    """Split one speaker's dialogue into ``(LANGUAGE, text)`` segments.

    Args:
        speaker: Speaker tag including its enclosing delimiters, e.g.
            ``"[name]"``; the first and last characters are dropped.
        dialogue: Text made of ``<lang>text`` segments.

    Returns:
        A list holding one ``(language, text)`` tuple per segment, in order
        of appearance, with the language upper-cased and the text stripped
        of surrounding whitespace. The speaker name (a plain string) is
        appended as the final element, so the list is never empty.
    """
    # A "<lang>" tag followed by everything up to the next "<".
    segment_re = re.compile(r'<(\S+?)>([^<]+)', re.DOTALL)

    parsed = []
    for found in segment_re.finditer(dialogue):
        language, content = found.groups()
        parsed.append((language.upper(), content.strip()))

    # Callers receive the speaker name as the last item of the list.
    parsed.append(speaker[1:-1])
    return parsed
def validate_text(input_text):
    """Check that ``input_text`` uses the ``[speaker]<lang>text`` markup.

    Every ``[speaker]`` tag that is followed by at least one ``<lang>text``
    segment is examined; text outside such blocks is not inspected.

    Args:
        input_text: The raw multi-speaker, multi-language input string.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is ``False`` when no
        speaker block is found, or when a speaker block contains a segment
        whose text is empty after stripping whitespace.
    """
    # One "[speaker]" tag followed by one or more "<lang>text" segments.
    # The text part is greedy so the captured dialogue holds each segment in
    # full; a lazy quantifier here would keep only the first character of the
    # last segment, making the content check below meaningless.
    pattern_speaker = r'(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+)+)'
    # A "<lang>" tag followed by everything up to the next "<".
    pattern_language_text = r'<(\S+?)>([^<]+)'

    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    # Nothing in the input looks like "[speaker]<lang>text".
    if not matches:
        return False, "Error: No valid speaker format detected. Please check your input."

    # Validate the content of every speaker block on the parsed segments
    # themselves, so that a block without usable text is actually rejected.
    for _speaker, dialogue in matches:
        segments = re.findall(pattern_language_text, dialogue, re.DOTALL)
        if not segments or any(not text.strip() for _lang, text in segments):
            return False, "Error: Invalid format detected in dialogue content. Please check your input."

    return True, "Input is valid."
def text_matching(text: str) -> list:
    """Parse multi-speaker text into one segment list per speaker block.

    Args:
        text: Input made of ``[speaker]`` tags, each followed by its dialogue.

    Returns:
        One list per speaker block, each produced by
        ``extract_language_and_text_updated``. The full result is also
        printed to standard output before being returned.
    """
    # "[speaker]" followed by the shortest non-empty run of text that reaches
    # the next "[speaker]" tag or the end of the text.
    speaker_blocks = re.compile(r'(\[\S+?\])(.+?)(?=\[\S+?\]|$)', re.DOTALL)

    result = [
        extract_language_and_text_updated(tag, body)
        for tag, body in speaker_blocks.findall(text)
    ]
    print(result)
    return result
if __name__ == '__main__':
    # Sample with three speakers; the first one has no dialogue of its own.
    multi_speaker_sample = '''
[说话人1]
[说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
[说话人3]<zh>谢谢。<jp>どういたしまして。
'''
    # Sample used to exercise both the parser and the validator.
    mixed_language_sample = '''
[说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
[说话人2]<zh>你好吗?
'''
    # Parse both samples in turn; text_matching prints what it parsed.
    for sample in (multi_speaker_sample, mixed_language_sample):
        text_matching(sample)

    # Show the validator's verdict for the second sample.
    print(validate_text(mixed_language_sample))