Spaces:
Runtime error
Runtime error
import re | |
def extract_language_and_text_updated(speaker, dialogue):
    """Split one speaker's dialogue into ``(LANGUAGE, text)`` segments.

    Args:
        speaker: Speaker tag including its two delimiter characters; the
            callers in this file pass strings of the form ``[name]``.
        dialogue: Text following the speaker tag, made of ``<lang>text``
            segments.

    Returns:
        A list holding one ``(language_upper, stripped_text)`` tuple per
        ``<lang>text`` segment found, followed by the speaker name (the tag
        with its first and last characters removed) as the final element.
    """
    segments = []
    # Match a <language> tag and the text that follows it, up to the next '<'.
    for tag_match in re.finditer(r'<(\S+?)>([^<]+)', dialogue, re.DOTALL):
        language, content = tag_match.groups()
        # Normalise the language code and trim surrounding whitespace.
        segments.append((language.upper(), content.strip()))
    # Drop the first and last characters of the tag (the brackets).
    segments.append(speaker[1:-1])
    return segments
def validate_text(input_text):
    """Check that *input_text* contains at least one well-formed speaker block.

    A speaker block is a ``[speaker]`` tag followed by one or more
    ``<language>text`` segments.

    Args:
        input_text: Raw script text to validate.

    Returns:
        A ``(is_valid, message)`` tuple: ``(True, "Input is valid.")`` when
        at least one speaker block is found and each found block contains a
        language segment, otherwise ``(False, <error message>)``.
    """
    # A speaker tag followed by at least one <language>text segment.
    pattern_speaker = r'(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)'
    # Same tag syntax that extract_language_and_text_updated parses.
    pattern_language_text = r'<(\S+?)>([^<]+)'

    # re.DOTALL is kept from the original call; neither pattern uses '.'.
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    # Guard clause: nothing in the input looks like a speaker block.
    if not matches:
        return False, "Error: No valid speaker format detected. Please check your input."

    for _speaker, dialogue in matches:
        # Inspect the language segments directly. The extraction helper always
        # appends the speaker name to its result, so testing that result for
        # emptiness could never detect a dialogue without segments.
        if not re.findall(pattern_language_text, dialogue, re.DOTALL):
            return False, "Error: Invalid format detected in dialogue content. Please check your input."

    return True, "Input is valid."
def text_matching(text: str) -> list:
    """Parse a multi-speaker script into per-speaker segment lists.

    The text is split into blocks, each starting at a ``[speaker]`` tag and
    running up to the next tag or the end of the text. Every block is handed
    to ``extract_language_and_text_updated``.

    Args:
        text: Script text containing ``[speaker]`` tags.

    Returns:
        A list with one entry per speaker block, each entry being the list
        returned by ``extract_language_and_text_updated``. The list is also
        printed to stdout before being returned.
    """
    # Lazy body with a lookahead so each block stops right before the next tag;
    # DOTALL lets a block span several lines.
    blocks = re.findall(r'(\[\S+?\])(.+?)(?=\[\S+?\]|$)', text, re.DOTALL)
    result = [
        extract_language_and_text_updated(speaker_tag, body)
        for speaker_tag, body in blocks
    ]
    print(result)
    return result
if __name__ == '__main__':
    # Demo 1: three speakers, the first one without any dialogue.
    text = '''
[说话人1]
[说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
[说话人3]<zh>谢谢。<jp>どういたしまして。
'''
    text_matching(text)

    # Demo 2: exercise both the parser and the validator on the same input.
    test_text = '''
[说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
[说话人2]<zh>你好吗?
'''
    text_matching(test_text)
    res = validate_text(test_text)
    print(res)