# general-deep-learning/test/data/wiki_cleaner_test.py
# yetrun's picture
# ver1: implement a deep-learning training framework supporting two tasks, Wiki GPT and poetry generation
# a5fd608
"""
Wiki 清洗模块的单元测试。
"""
from pathlib import Path
from data.wiki.wiki_cleaner import (
filter_single_line,
filter_html_tags,
filter_empty_brackets,
filter_lang_tags,
clean,
)
from env.resolve import resolve_path
class TestFilterSingleLine:
    """Tests for ``filter_single_line``."""

    def test_single_line_returns_none(self):
        """A text made of a single line yields None."""
        outcome = filter_single_line("这是一个重定向")
        assert outcome is None

    def test_single_line_with_whitespace_returns_none(self):
        """A single line padded with whitespace still yields None."""
        padded = " 这是一个重定向 "
        assert filter_single_line(padded) is None

    def test_multiple_lines_returns_original(self):
        """A multi-line text comes back unchanged."""
        sample = "第一行\n第二行\n第三行"
        outcome = filter_single_line(sample)
        assert outcome == sample

    def test_multiple_lines_with_empty_lines(self):
        """Blank lines between and after content lines leave the text unchanged."""
        sample = "第一行\n\n第二行\n\n"
        assert filter_single_line(sample) == sample

    def test_empty_string_returns_none(self):
        """The empty string yields None."""
        outcome = filter_single_line("")
        assert outcome is None

    def test_only_whitespace_returns_none(self):
        """A text holding only spaces and newlines yields None."""
        blank = " \n \n "
        assert filter_single_line(blank) is None
class TestFilterEmptyBrackets:
    """Tests for ``filter_empty_brackets``.

    Each test feeds a string to the filter and compares the returned string
    with the expected text after empty bracket pairs have been dropped.
    """

    def test_remove_empty_parentheses_in_text(self):
        """Empty ASCII parentheses ``()`` inside a sentence are removed."""
        cleaned = filter_empty_brackets("这是()一段文本")
        assert cleaned == "这是一段文本"

    def test_remove_empty_chinese_brackets_in_text(self):
        """Empty fullwidth parentheses (U+FF08 / U+FF09) are removed."""
        # The input previously used ASCII "()", which made this test an exact
        # duplicate of test_remove_empty_parentheses_in_text. Escapes are used
        # so the fullwidth characters survive tools that fold them to ASCII.
        # NOTE(review): assumes filter_empty_brackets handles fullwidth
        # parentheses like the other bracket pairs - confirm against the
        # implementation.
        source = "这是\uff08\uff09一段文本"
        cleaned = filter_empty_brackets(source)
        assert cleaned == "这是一段文本"

    def test_remove_brackets_with_space_in_text(self):
        """Parentheses that enclose only a space are removed."""
        cleaned = filter_empty_brackets("这是( )一段( )文本")
        assert cleaned == "这是一段文本"

    def test_keep_brackets_with_content(self):
        """Parentheses that enclose text are left in place."""
        source = "这是一个(有内容的)括号"
        assert filter_empty_brackets(source) == source

    def test_remove_square_brackets_in_text(self):
        """Empty square brackets, with or without an inner space, are removed."""
        cleaned = filter_empty_brackets("这是[]一段[ ]文本")
        assert cleaned == "这是一段文本"

    def test_remove_chinese_square_brackets_in_text(self):
        """Empty lenticular brackets ``【】`` are removed."""
        cleaned = filter_empty_brackets("这是【】一段文本")
        assert cleaned == "这是一段文本"

    def test_remove_curly_brackets_in_text(self):
        """Empty curly braces, with or without an inner space, are removed."""
        cleaned = filter_empty_brackets("这是{}一段{ }文本")
        assert cleaned == "这是一段文本"

    def test_no_brackets_returns_original(self):
        """Text without any brackets comes back unchanged."""
        source = "这是一段普通文本\n没有任何括号"
        assert filter_empty_brackets(source) == source

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        assert filter_empty_brackets("") == ""

    def test_multiple_empty_brackets(self):
        """A string made only of empty bracket pairs collapses to ''."""
        cleaned = filter_empty_brackets("()()[]【】")
        assert cleaned == ""

    def test_mixed_empty_and_content_brackets(self):
        """Empty pairs are dropped while pairs with content are kept."""
        cleaned = filter_empty_brackets("这是()(有内容的)和[]的测试")
        assert cleaned == "这是(有内容的)和的测试"

    def test_multiple_lines_with_empty_brackets(self):
        """Empty parentheses are removed on every line of a multi-line text."""
        cleaned = filter_empty_brackets("这是()一段文本\n这是()一段文本")
        assert cleaned == "这是一段文本\n这是一段文本"
class TestFilterHtmlTags:
    """Tests for ``filter_html_tags``."""

    def test_remove_templatestyles_tag(self):
        """A self-closing templatestyles tag in front of body text is removed."""
        # NOTE(review): the original docstring described this input as
        # entity-encoded, yet the literal uses raw angle brackets - confirm
        # which form is intended.
        markup = '<templatestyles src="ShareCSS/infobox.css" />正文内容'
        assert filter_html_tags(markup) == "正文内容"

    def test_remove_multiple_tags(self):
        """Nested opening and closing tags are removed, leaving the inner text."""
        # NOTE(review): the original docstring also said "entity-encoded"
        # here - confirm against the intended input form.
        markup = "<div><p>段落</p></div>"
        assert filter_html_tags(markup) == "段落"

    def test_no_tags_returns_original(self):
        """Text without tags comes back unchanged."""
        plain = "这是一段普通文本"
        stripped = filter_html_tags(plain)
        assert stripped == plain

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        stripped = filter_html_tags("")
        assert stripped == ""

    def test_only_tags(self):
        """Input that is nothing but a tag becomes the empty string."""
        markup = '<templatestyles src="test.css" />'
        stripped = filter_html_tags(markup)
        assert stripped == ""

    def test_mixed_content(self):
        """Only the tags are removed; text around and between them stays."""
        markup = "开头<tag>中间</tag>结尾"
        assert filter_html_tags(markup) == "开头中间结尾"

    def test_multiple_lines_with_html_tags(self):
        """Tags are removed on each line while the newlines are kept."""
        markup = "第一行<tag>\n第二行<tag>\n第三行"
        assert filter_html_tags(markup) == "第一行\n第二行\n第三行"
class TestFilterLangTags:
    """Tests for ``filter_lang_tags``.

    The inputs contain language-conversion markers of the form ``-{...}-``;
    each test compares the filter's return value with the expected text once
    those markers are gone.
    """

    def test_remove_single_lang_tags(self):
        """A single conversion marker in front of body text is removed."""
        tagged = "-{H|zh-hans:重定向;zh-hant:重新导向;}-正文"
        assert filter_lang_tags(tagged) == "正文"

    def test_remove_multiple_lang_tags(self):
        """Two adjacent conversion markers are both removed."""
        tagged = "-{H|zh-hans:重定向;zh-hant:重新导向;}--{H|zh-cn:字符;zh-tw:字元;}-正文"
        assert filter_lang_tags(tagged) == "正文"

    def test_remove_complex_lang_tags(self):
        """Markers whose variants are separated by '; ' are removed as well."""
        tagged = (
            "-{H|zh-hans:文件; zh-hant:档案;}--{H|zh-hans:快捷方式; zh-hant:捷径;}-正文"
        )
        assert filter_lang_tags(tagged) == "正文"

    def test_no_lang_tags_returns_original(self):
        """Text without conversion markers comes back unchanged."""
        plain = "这是一段普通文本"
        assert filter_lang_tags(plain) == plain

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        assert filter_lang_tags("") == ""

    def test_only_lang_tags(self):
        """Input that is nothing but a marker becomes the empty string."""
        tagged = "-{H|zh-hans:重定向;zh-hant:重新导向;}-"
        assert filter_lang_tags(tagged) == ""

    def test_multiple_lines_with_lang_tags(self):
        """Markers are removed on each line while the newlines are kept."""
        tagged = "第一行-{H|zh-hans:测试1;}-\n第二行-{H|zh-hans:测试2;}-\n第三行"
        assert filter_lang_tags(tagged) == "第一行\n第二行\n第三行"

    def test_nested_lang_tags(self):
        """A marker that contains other markers is removed in full."""
        tagged = "-{T|zh:-{zh|}-;zh-hans:-{zh-hans|}-;zh-hant:-{zh-hant|}-;}-正文"
        assert filter_lang_tags(tagged) == "正文"

    def test_deeply_nested_lang_tags(self):
        """Markers nested three levels deep are removed in full."""
        tagged = "-{A|-{B|-{C|内容}-}-}-正文"
        assert filter_lang_tags(tagged) == "正文"
class TestCleanIntegration:
    """Integration tests for ``clean``.

    Multi-line samples are assembled from one literal per line joined by
    explicit newlines, so no line of a sample carries leading indentation.
    """

    def test_single_line_returns_none(self):
        """A one-line text yields None."""
        outcome = clean("重定向")
        assert outcome is None

    def test_empty_after_filtering_returns_none(self):
        """A line made only of empty brackets yields None."""
        outcome = clean("()()[]")
        assert outcome is None

    def test_multiple_filters_applied(self):
        """Tag, empty-bracket and conversion-marker residue is all removed."""
        sample = (
            "第一行\n"
            '<templatestyles src="test.css" />\n'
            "()\n"
            "-{H|zh-hans:测试;zh-hant:測試;}-\n"
            "第二行"
        )
        outcome = clean(sample)
        assert outcome is not None
        for unwanted in ("<", "()", "-{"):
            assert unwanted not in outcome
        for kept in ("第一行", "第二行"):
            assert kept in outcome

    def test_real_wiki_example(self):
        """A wiki-like article keeps its prose and loses the markup."""
        sample = (
            "词条标题\n"
            '<templatestyles src="ShareCSS/infobox.css" />\n'
            "这是正文内容。\n"
            "()\n"
            "-{H|zh-hans:重定向;zh-hant:重新导向;}-\n"
            "更多内容。"
        )
        outcome = clean(sample)
        assert outcome is not None
        for unwanted in ("<templatestyles", "()", "-{"):
            assert unwanted not in outcome
        for kept in ("这是正文内容", "更多内容"):
            assert kept in outcome

    def test_normal_text_unchanged(self):
        """Plain multi-line text comes back exactly as given."""
        sample = "第一行\n第二行\n第三行"
        assert clean(sample) == sample

    def test_only_whitespace_returns_none(self):
        """A text holding only spaces and newlines yields None."""
        outcome = clean(" \n \n ")
        assert outcome is None

    def test_multiple_lines_clean(self):
        """A longer sample mixing every kind of residue is cleaned as a whole."""
        sample = (
            "词条标题\n"
            '<templatestyles src="test.css" />\n'
            "这是()一段()文本\n"
            "-{H|zh-hans:测试;zh-hant:測試;}-\n"
            "第二行\n"
            "<div>标签</div>\n"
            "()空括号\n"
            "第三行"
        )
        outcome = clean(sample)
        assert outcome is not None
        # NOTE(review): the original listed the "()" check twice in a row; one
        # of them may have been meant for fullwidth parentheses - confirm.
        for unwanted in ("<", "()", "-{"):
            assert unwanted not in outcome
        for kept in ("这是一段文本", "第二行", "第三行"):
            assert kept in outcome
def test_clean_demo_text():
    """Clean the demo fixture file and print the cleaned text.

    Reads ``test/fixtures/clean/demo_text.txt`` (located through
    ``resolve_path``) as UTF-8, runs ``clean`` on its contents, prints the
    result between banner lines, and asserts that the result is not None.
    """
    fixture = resolve_path("test/fixtures/clean/demo_text.txt")
    raw_text = Path(fixture).read_text(encoding="utf-8")
    cleaned = clean(raw_text)

    banner = "=" * 50
    print("\n" + banner)
    print("清洗后的内容:")
    print(banner)
    print(cleaned)
    print(banner)

    assert cleaned is not None