import os
import math
from collections import Counter


def calculate_topk_upper_bound(file_path, k=5):
    """
    Calculates the upper bound for top-k accuracy from a tokenized text file.

    The bound is the total probability mass of the k most frequent tokens,
    i.e., the best possible top-k accuracy for a context-free (unigram)
    predictor that always proposes the same k tokens.

    Args:
        file_path (str): Path to the input text file.
        k (int): Top-k accuracy value to compute.

    Returns:
        float: The upper bound for top-k accuracy, or None on error.
    """
    try:
        # Read the file and tokenize on whitespace
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        tokens = text.split()  # split() tokenizes on any whitespace

        # Count token frequencies
        token_counts = Counter(tokens)
        total_tokens = len(tokens)
        if total_tokens == 0:
            return 0.0

        # Convert frequencies to probabilities
        token_probabilities = {
            token: count / total_tokens for token, count in token_counts.items()
        }

        # Unigram entropy of the corpus, in bits
        entropy = -sum(p * math.log2(p) for p in token_probabilities.values())

        # Top-k accuracy upper bound: probability mass of the k most frequent tokens
        sorted_tokens = sorted(
            token_probabilities.items(), key=lambda x: x[1], reverse=True
        )
        top_k_prob = sum(prob for _, prob in sorted_tokens[:k])

        # Report entropy and the top-k accuracy upper bound
        print(f"Entropy: {entropy:.4f} bits")
        print(f"Top-{k} Accuracy Upper Bound: {top_k_prob:.4f}")

        return top_k_prob
    except Exception as e:
        print(f"Error: {e}")
        return None


# Example usage
file_path = os.path.expanduser(
    "~/torch_datasets/github-python/corpus/data/corpus_processed.txt"
)
top_k_accuracy = calculate_topk_upper_bound(file_path, k=5)
if top_k_accuracy is not None:
    print(f"Upper Bound for Top-5 Accuracy: {top_k_accuracy:.4f}")
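# A minimal sanity check, written as a sketch rather than part of the original
# script: it builds a small synthetic corpus in a temporary file (the helper
# name _sanity_check and the sample text are assumptions, not from the source)
# and confirms that the top-2 bound equals the combined frequency of the two
# most common tokens.
import tempfile


def _sanity_check():
    # 10 tokens: "a" x4, "b" x3, "c" x2, "d" x1 -> top-2 mass = (4 + 3) / 10 = 0.7
    sample = "a a a a b b b c c d"
    with tempfile.NamedTemporaryFile(
        "w", suffix=".txt", delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(sample)
        tmp_path = tmp.name
    try:
        result = calculate_topk_upper_bound(tmp_path, k=2)
        assert result is not None and abs(result - 0.7) < 1e-9
    finally:
        os.remove(tmp_path)


# Uncomment to run the check:
# _sanity_check()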