import os
import sys
from collections import Counter

import jsonlines
import matplotlib.pyplot as plt

| suffix_counter = Counter() |
|
|
| file_path = "./output/alignment.jsonl" |
| if not os.path.exists(file_path): |
| print(f"Error: {file_path} not found.") |
| else: |
| with jsonlines.open(file_path) as reader: |
| for obj in reader: |
| |
| parts = obj['file'].split(".") |
| suffix = parts[-1] if len(parts) > 1 else "no_suffix" |
| suffix_counter[suffix] += 1 |
|
|
| |
| sorted_suffixes = suffix_counter.most_common() |
|
|
| |
| total_files = sum(suffix_counter.values()) |
|
|
| |
| try: |
| from rich.console import Console |
| from rich.table import Table |
| from rich import box |
|
|
| console = Console() |
| table = Table(title="Language Distribution", box=box.ROUNDED) |
|
|
| table.add_column("Language (Suffix)", style="cyan", justify="left") |
| table.add_column("Count", style="magenta", justify="right") |
| table.add_column("Percentage", style="green", justify="right") |
|
|
| for suffix, count in sorted_suffixes: |
| percentage = (count / total_files) * 100 |
| table.add_row(suffix, str(count), f"{percentage:.1f}%") |
|
|
| console.print(table) |
| except ImportError: |
| |
| print(f"\n{'Language':<15} | {'Count':<10} | {'Percentage':<10}") |
| print("-" * 45) |
| for suffix, count in sorted_suffixes: |
| percentage = (count / total_files) * 100 |
| print(f"{suffix:<15} | {count:<10} | {percentage:.1f}%") |
|
|
| |
| if sorted_suffixes: |
| labels, values = zip(*sorted_suffixes) |
| |
| |
| plt.figure(figsize=(12, 6)) |
| bars = plt.bar(labels, values, color='skyblue') |
| plt.title('Language Distribution', fontsize=16) |
| plt.xlabel('Language', fontsize=12) |
| plt.ylabel('Count', fontsize=12) |
| plt.xticks(rotation=45) |
| |
| |
| for bar in bars: |
| height = bar.get_height() |
| plt.text(bar.get_x() + bar.get_width()/2., height, |
| f'{int(height)}', |
| ha='center', va='bottom') |
|
|
| plt.tight_layout() |
| plt.savefig("suffix.png") |
| print("\nBar chart saved to suffix.png") |
|
|
| |
| plt.figure(figsize=(10, len(sorted_suffixes) * 0.5 + 2)) |
| plt.axis('off') |
| |
| cell_text = [] |
| for suffix, count in sorted_suffixes: |
| percentage = (count / total_files) * 100 |
| cell_text.append([suffix, str(count), f"{percentage:.1f}%"]) |
| |
| col_labels = ["Language", "Count", "Percentage"] |
| col_colors = ["#CCCCFF", "#CCCCFF", "#CCCCFF"] |
| |
| table_plot = plt.table(cellText=cell_text, |
| colLabels=col_labels, |
| colColours=col_colors, |
| loc='center', |
| cellLoc='center') |
| |
| table_plot.auto_set_font_size(False) |
| table_plot.set_fontsize(12) |
| table_plot.scale(1.2, 1.5) |
| |
| plt.title('Language Distribution Table', fontsize=16, y=1.0) |
| |
| |
| |
| plt.savefig("suffix_table.png", bbox_inches='tight', dpi=300) |
| print("Table saved to suffix_table.png") |
| |