File size: 5,775 Bytes
b6a70f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import json
import os
from datetime import datetime
def filter_by_duration(input_file, output_file, min_duration=10, max_duration=100):
    """Filter a JSON file, keeping entries whose ``total_duration`` is in range.

    Loads ``input_file`` (a JSON object mapping keys to entry dicts), keeps only
    entries whose ``total_duration`` is a number within
    ``[min_duration, max_duration]``, writes the survivors to ``output_file``,
    and writes a log of every removed entry to
    ``<output dir>/filter_logs/removed_entries_<timestamp>.log``.

    :param input_file: path of the input JSON file
    :param output_file: path of the filtered output JSON file
    :param min_duration: minimum duration in seconds (inclusive)
    :param max_duration: maximum duration in seconds (inclusive)
    :return: dict with ``kept``, ``removed`` counts and the ``log_file`` path
             (new, backward-compatible — previous versions returned None)
    """
    # Create the log directory; exist_ok avoids the check-then-create race.
    log_dir = os.path.join(os.path.dirname(output_file), "filter_logs")
    os.makedirs(log_dir, exist_ok=True)

    # Log file named after the current time so repeated runs don't collide.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"removed_entries_{timestamp}.log")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filtered_data = {}
    removed_entries = []
    for key, value in data.items():
        # Single lookup; 'N/A' sentinel is what gets logged for missing values.
        duration = value.get('total_duration', 'N/A')
        # isinstance guard: a non-numeric duration (e.g. a string) previously
        # raised TypeError in the range comparison; now it is classified as
        # 'missing_or_invalid' like a missing key, which the original reason
        # logic already anticipated.
        is_numeric = isinstance(duration, (int, float))
        if is_numeric and min_duration <= duration <= max_duration:
            filtered_data[key] = value
            continue
        if not is_numeric:
            reason = 'missing_or_invalid'
        elif duration < min_duration:
            reason = 'too_short'
        else:
            reason = 'too_long'
        removed_entries.append({
            'key': key,
            'duration': duration,
            'original_dialog_id': value.get('original_dialog_id', 'N/A'),
            'reason': reason,
        })

    # Write the filtered result.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=2, ensure_ascii=False)

    # Write the removal log.
    with open(log_file, 'w', encoding='utf-8') as f:
        f.write(f"Filtering log - {timestamp}\n")
        f.write(f"Input file: {input_file}\n")
        f.write(f"Output file: {output_file}\n")
        f.write(f"Duration range: {min_duration}s to {max_duration}s\n\n")
        f.write("Removed Entries:\n")
        f.write("=" * 50 + "\n")
        for entry in removed_entries:
            f.write(f"Key: {entry['key']}\n")
            f.write(f"Original Dialog ID: {entry['original_dialog_id']}\n")
            f.write(f"Duration: {entry['duration']}s\n")
            f.write(f"Reason: {entry['reason']}\n")
            f.write("-" * 50 + "\n")

    print(f"\n处理结果: {os.path.basename(input_file)}")
    print(f"原始条目数: {len(data)}")
    print(f"过滤后条目数: {len(filtered_data)}")
    print(f"已删除 {len(removed_entries)} 个不符合时长要求的条目")
    print(f"过滤后的数据已保存到: {output_file}")
    print(f"删除条目日志已保存到: {log_file}")
    return {
        'kept': len(filtered_data),
        'removed': len(removed_entries),
        'log_file': log_file,
    }
def process_directory(input_dir, output_dir, min_duration=10, max_duration=90):
    """Filter every ``.json`` file in ``input_dir`` into ``output_dir``.

    Calls :func:`filter_by_duration` per file and writes a summary log
    ``summary_removed_entries_<timestamp>.log`` into ``output_dir`` with
    per-file and total removal counts.

    :param input_dir: directory containing the input ``.json`` files
    :param output_dir: directory for filtered files and logs (created if needed)
    :param min_duration: minimum duration in seconds (inclusive)
    :param max_duration: maximum duration in seconds (inclusive)
    """
    # exist_ok avoids the check-then-create race of exists()+makedirs.
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_log = os.path.join(output_dir, f"summary_removed_entries_{timestamp}.log")

    total_removed = 0
    total_processed = 0
    with open(summary_log, 'w', encoding='utf-8') as summary_f:
        summary_f.write(f"Summary Filtering Log - {timestamp}\n")
        summary_f.write(f"Input directory: {input_dir}\n")
        summary_f.write(f"Output directory: {output_dir}\n")
        summary_f.write(f"Duration range: {min_duration}s to {max_duration}s\n\n")

        # sorted() makes the processing order (and summary log) deterministic.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith('.json'):
                continue
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            # Fix: the current file's name is interpolated here (the literal
            # "(unknown)" placeholder never showed which file was in progress).
            print(f"\n处理文件: {filename}")
            filter_by_duration(input_path, output_path, min_duration, max_duration)

            # Recover the per-file stats from the most recent per-file log.
            log_dir = os.path.join(output_dir, "filter_logs")
            latest_log = max(
                (f for f in os.listdir(log_dir) if f.startswith('removed_entries')),
                key=lambda f: os.path.getmtime(os.path.join(log_dir, f)))
            with open(os.path.join(log_dir, latest_log), 'r', encoding='utf-8') as log_f:
                log_content = log_f.read()
                # Count lines that START with the marker; a bare substring
                # count could overcount if a key embeds "Key: ".
                removed_count = sum(
                    1 for line in log_content.splitlines()
                    if line.startswith("Key: "))

            summary_f.write(f"\nFile: {filename}\n")
            summary_f.write(f"Removed entries: {removed_count}\n")
            summary_f.write("-" * 40 + "\n")
            total_removed += removed_count
            total_processed += 1

        summary_f.write(f"\nTotal files processed: {total_processed}\n")
        summary_f.write(f"Total entries removed: {total_removed}\n")
    print(f"\n处理完成!所有文件的总日志已保存到: {summary_log}")
if __name__ == "__main__":
    # Example: filter a single file
    # filter_by_duration("silence.json", "silence_filtered_output.json")

    # Example: filter every JSON file in a directory
    # (replace these paths with your own input/output locations)
    process_directory("./", "./filtered_output")