from collections import defaultdict
from pathlib import Path


# Picks out the tags that are shared by at least a given fraction of the
# caption files in a directory.
def count_tags(directory_path, encoding="utf-8"):
    """Count, per tag, how many ``*.txt`` caption files contain it.

    Each caption file is expected to hold a comma-separated list of tags.
    Whitespace around each tag is trimmed (spaces *inside* a tag such as
    ``long hair`` are kept), empty entries are ignored, and a tag that is
    repeated within one file is counted only once for that file, so a count
    never exceeds the number of files.

    Args:
        directory_path: Directory to scan (non-recursive) for ``*.txt`` files.
        encoding: Text encoding used to read the caption files.

    Returns:
        A ``(tag_count, total_files)`` tuple where ``tag_count`` is a
        ``defaultdict(int)`` mapping tag -> number of files containing it and
        ``total_files`` is the number of ``*.txt`` files that were read.
    """
    dir_path = Path(directory_path)
    tag_count = defaultdict(int)
    total_files = 0

    # Sorted so that the resulting tag order does not depend on the
    # filesystem's directory enumeration order.
    for file_path in sorted(dir_path.glob('*.txt')):
        total_files += 1
        # Explicit encoding: the platform default (e.g. cp932/cp1252 on
        # Windows) would mis-decode or reject UTF-8 caption files.
        with open(file_path, 'r', encoding=encoding) as f:
            raw_tags = f.read().split(',')

        stripped = (tag.strip() for tag in raw_tags)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_tags = dict.fromkeys(tag for tag in stripped if tag)
        for tag in unique_tags:
            tag_count[tag] += 1

    return tag_count, total_files


def find_common_tags(tag_count, total_files, threshold):
    """Return the tags whose file ratio is at least ``threshold``.

    Args:
        tag_count: Mapping of tag -> number of files containing the tag.
        total_files: Total number of caption files that were scanned.
        threshold: Minimum fraction (0.0-1.0) of files a tag must appear in.

    Returns:
        A list of tags, in ``tag_count`` iteration order, for which
        ``count / total_files >= threshold``. Empty when ``total_files`` is
        not positive, since no ratio can be computed.
    """
    if total_files <= 0:
        # Avoid ZeroDivisionError when no caption files were found.
        return []
    return [
        tag
        for tag, count in tag_count.items()
        if count / total_files >= threshold
    ]


def main():
    """Run the tag count on the configured directory and print the results."""
    # Set this to the directory that contains the caption files.
    directory_path = r"E:\Dataset\XXXXXXXX"
    # Keep only tags used in 80% or more of the caption files.
    threshold = 0.8

    tag_count, total_files = count_tags(directory_path)
    print(tag_count)
    print(total_files)

    common_tags = find_common_tags(tag_count, total_files, threshold)
    output = ", ".join(common_tags)
    print(f"Common tags (used in {threshold * 100}% or more of the files): {output}")


if __name__ == "__main__":
    main()