import re
from pathlib import Path


# Compiled once at import time. Matches any single code point in the
# CJK Unified Ideographs block (U+4E00 through U+9FFF), which holds the
# commonly used Chinese characters.
_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')


def contains_chinese(text: str) -> bool:
    """Return True if *text* contains at least one Chinese character.

    Only code points U+4E00 through U+9FFF are checked; characters that
    live outside that range (for example in the CJK extension blocks) are
    not detected.

    Args:
        text: The string to inspect.

    Returns:
        True when any character of *text* falls inside the range,
        otherwise False (including for the empty string).
    """
    return _CHINESE_CHAR_RE.search(text) is not None


def process_lyrics(text, min_length=3):
    """Clean a '/'-separated lyrics string and return the kept lines.

    The input is processed in these steps:

    1. Split on '/'.
    2. Strip surrounding whitespace from each piece and drop empty pieces.
    3. Drop pieces that contain no Chinese character (they are treated as
       English lyrics).
    4. Drop pieces shorter than *min_length* characters.
    5. Drop duplicates, keeping the first occurrence so the original order
       is preserved.

    Args:
        text: Raw lyrics with lines separated by '/'.
        min_length: Minimum number of characters (after stripping) a line
            must have to be kept. Defaults to 3.

    Returns:
        A list of the cleaned, unique lyric lines in their original order.
    """
    processed = []
    seen = set()

    for segment in text.split('/'):
        line = segment.strip()

        # An empty line can never contain a Chinese character, so the
        # Chinese check also covers the "skip blank lines" step.
        if len(line) < min_length or not contains_chinese(line):
            continue

        # Set membership keeps de-duplication O(1) per line while the list
        # records first-seen order.
        if line in seen:
            continue

        seen.add(line)
        processed.append(line)

    return processed


def main():
    """Read the raw lyrics file, clean it, and write the result to disk.

    Reads ``data/lyrics.txt`` (UTF-8), runs the content through
    ``process_lyrics``, joins the kept lines with '/', writes them to
    ``data/processed_data.txt`` (UTF-8), and prints a completion message
    containing the output path.
    """
    # Build the paths with pathlib rather than backslash string literals:
    # '\l' and '\p' are invalid escape sequences, and a literal backslash
    # is only a path separator on Windows.
    data_dir = Path('data')
    input_filename = data_dir / 'lyrics.txt'
    output_filename = data_dir / 'processed_data.txt'

    content = input_filename.read_text(encoding='utf-8')

    processed = process_lyrics(content)

    # Re-join with the same '/' separator the input uses.
    output_content = '/'.join(processed)
    output_filename.write_text(output_content, encoding='utf-8')

    print(f'处理完成,结果保存在 {output_filename}')


# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()