|
import re |
|
|
|
|
|
def extract_language_and_text_updated(speaker, dialogue):
    """Break one speaker's dialogue into language-tagged text segments.

    Args:
        speaker: Speaker tag including one delimiter character on each
            side (e.g. "[name]"); the first and last characters are dropped.
        dialogue: Text made of "<lang>text" segments.

    Returns:
        A list of ``(LANG, text)`` tuples in order of appearance, where
        ``LANG`` is the upper-cased tag and ``text`` is stripped of
        surrounding whitespace, followed by the bare speaker name as the
        final list element.
    """
    segment_re = re.compile(r"<(\S+?)>([^<]+)", re.DOTALL)
    segments = [
        (found.group(1).upper(), found.group(2).strip())
        for found in segment_re.finditer(dialogue)
    ]
    # The speaker name (without its delimiters) rides along as the last item.
    segments.append(speaker[1:-1])
    return segments
|
|
|
|
|
def validate_text(input_text):
    """Check that *input_text* contains at least one well-formed speaker entry.

    An entry is a ``[speaker]`` tag followed by one or more ``<lang>text``
    segments. Only the entries found by the pattern are inspected; text
    that lies outside any matched entry is not checked.

    Args:
        input_text: The raw text to validate.

    Returns:
        A ``(is_valid, message)`` tuple: ``(True, "Input is valid.")`` on
        success, otherwise ``False`` with an error message.
    """
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    for speaker, dialogue in matches:
        parsed = extract_language_and_text_updated(speaker, dialogue)
        # The helper always appends the speaker name as the last element, so
        # the parsed list itself is never empty. Only the items before it are
        # language/text segments, and those are what must be present.
        segments = parsed[:-1]
        if not segments:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )

    return True, "Input is valid."
|
|
|
|
|
def text_matching(text: str) -> list:
    """Parse *text* into one entry per ``[speaker]`` block.

    Each block runs from a ``[speaker]`` tag up to the next such tag (or
    the end of the text) and is handed to
    ``extract_language_and_text_updated``. The collected list is printed
    and then returned.

    Args:
        text: Raw text containing ``[speaker]`` tags followed by dialogue.

    Returns:
        A list with one parsed entry per speaker block, in order of
        appearance.
    """
    block_re = re.compile(r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)", re.DOTALL)
    result = [
        extract_language_and_text_updated(block.group(1), block.group(2))
        for block in block_re.finditer(text)
    ]
    print(result)
    return result
|
|
|
|
|
def cut_para(text):
    """Split *text* on newlines into trimmed, non-empty paragraphs.

    Args:
        text: The text to split.

    Returns:
        A list of the lines of *text* with surrounding whitespace removed;
        lines that are empty after trimming are dropped.
    """
    trimmed_lines = (line.strip() for line in text.split("\n"))
    return [line for line in trimmed_lines if line]
|
|
|
|
|
def cut_sent(para):
    """Split a paragraph into sentences at sentence-ending punctuation.

    A line break is inserted, in this order:

    1. after a sentence terminator (ideographic full stop, exclamation
       mark, semicolon or question mark) unless a closing curly quote
       follows it;
    2. after a run of six ASCII dots unless a closing curly quote follows;
    3. after two ellipsis characters unless a closing curly quote follows;
    4. after a terminator plus closing curly quote, unless the next
       character is a comma or another terminator.

    Trailing whitespace is then removed and the text is split on the
    inserted line breaks.

    Args:
        para: The paragraph text to split.

    Returns:
        A list of sentence strings; an empty *para* yields ``[""]``.
    """
    # Raw strings keep the regex escapes away from Python's own
    # string-literal parser, which warns on unrecognised escapes in
    # non-raw literals.
    split_rules = (
        r"([。!;?\?])([^”’])",
        r"(\.{6})([^”’])",
        r"(\…{2})([^”’])",
        r"([。!?\?][”’])([^,。!?\?])",
    )
    for rule in split_rules:
        para = re.sub(rule, r"\1\n\2", para)
    return para.rstrip().split("\n")
|
|
|
|
|
if __name__ == "__main__":
    # Demo 1: several speakers, one of them without any dialogue.
    multi_speaker_sample = """
    [说话人1]
    [说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
    [说话人3]<zh>谢谢。<jp>どういたしまして。
    """

    # Demo 2: two speakers with language-tagged segments.
    mixed_language_sample = """
    [说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
    [说话人2]<zh>你好吗?
    """

    # text_matching prints its parsed result for each sample.
    for sample in (multi_speaker_sample, mixed_language_sample):
        text_matching(sample)

    # Show the (is_valid, message) verdict for the second sample.
    print(validate_text(mixed_language_sample))
|
|