TunisianEncodersArena

Runtime error

App Files Files Community

hamzabouajila committed on Oct 6

Commit

8076d2b

1 Parent(s): 34aa785

remove unnecessary files, update tasks, fix bug on tasks mapping in get_leaderboard_df

Browse files

Files changed (7) hide show

app.py +1 -1
src/about.py +9 -7
src/display/formatting.py +2 -2
src/evaluator/tsac.py +0 -133
src/evaluators/madar_tun.py +0 -108
src/leaderboard/read_evals.py +1 -1
src/populate.py +2 -0

app.py CHANGED Viewed

@@ -84,7 +84,7 @@ def init_leaderboard(dataframe):
 ### Space initialisation
 try:
-    print(f"\n=== Starting space initialization ===")
     print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
     print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")

 ### Space initialisation
 try:
+    print("\n=== Starting space initialization ===")
     print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
     print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")

src/about.py CHANGED Viewed

@@ -9,13 +9,15 @@ class Task:
 class Tasks(Enum):
-    sentiment_accuracy = Task("fbougares/tsac", "accuracy", "Accuracy (TSAC) ⬆️")
-    sentiment_f1 = Task("fbougares/tsac", "macro_f1", "Macro-F1 (TSAC) ⬆️")
-    ner_f1 = Task("arbml/tunisian_ner", "entity_f1", "Entity F1 (NER) ⬆️")
-    coverage = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Corpus Coverage % ⬆️")
-    arabizi_robustness = Task("tunis-ai/arabizi_eval", "arabizi_f1", "Arabizi Robustness F1 ⬆️")
-    code_switch = Task("tunis-ai/codeswitch_eval", "accuracy", "Code-Switch Accuracy ⬆️")
-    typo_robustness = Task("tunis-ai/typo_eval", "f1_drop", "Typo Robustness Drop % ⬇️")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

 class Tasks(Enum):
+    sentiment_analysis = Task("tunis-ai/tsac", "Sentiment Analysis", "Accuracy (Sentiment Analysis) ⬆️")
+    normalization = Task("tunis-ai/MADAR-TUN", "Normalization", "Normalization F1 ⬆️")
+    transliteration = Task("tunis-ai/MADAR-TUN", "Transliteration", "Transliteration F1 ⬆️")
+    # sentiment_f1 = Task("fbougares/tsac", "macro_f1", "Macro-F1 (TSAC) ⬆️")
+    # ner_f1 = Task("arbml/tunisian_ner", "entity_f1", "Entity F1 (NER) ⬆️")
+    # coverage = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Corpus Coverage % ⬆️")
+    # arabizi_robustness = Task("tunis-ai/arabizi_eval", "arabizi_f1", "Arabizi Robustness F1 ⬆️")
+    # code_switch = Task("tunis-ai/codeswitch_eval", "accuracy", "Code-Switch Accuracy ⬆️")
+    # typo_robustness = Task("tunis-ai/typo_eval", "f1_drop", "Typo Robustness Drop % ⬇️")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

src/display/formatting.py CHANGED Viewed

@@ -20,8 +20,8 @@ def styled_message(message):
 def has_no_nan_values(df, columns):
-    print(df.columns)
-    print(columns)
     return df[columns].notna().all(axis=1)

 def has_no_nan_values(df, columns):
+    # print(df.columns)
+    # print(columns)
     return df[columns].notna().all(axis=1)

src/evaluator/tsac.py DELETED Viewed

@@ -1,133 +0,0 @@
-import torch
-from datasets import load_dataset
-import traceback
-import time
-def evaluate_tsac_sentiment(model, tokenizer, device):
-    """Evaluate model on TSAC sentiment analysis task"""
-    try:
-        print("\n=== Starting TSAC sentiment evaluation ===")
-        print(f"Current device: {device}")
-        # Load and preprocess dataset
-        print("\nLoading and preprocessing TSAC dataset...")
-        dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
-        dataset = dataset.select(range(10))  # Only evaluate on 200 samples
-        # print(f"Dataset size: {len(dataset)} examples")
-        def preprocess(examples):
-            return tokenizer(
-                examples['sentence'],
-                padding=True,
-                truncation=True,
-                max_length=512,
-                return_tensors=None
-            )
-        print(dataset.column_names)
-        dataset = dataset.map(preprocess, batched=True)
-        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
-        # Check first example
-        first_example = dataset[0]
-        print("\nFirst example details:")
-        print(f"Input IDs shape: {first_example['input_ids'].shape}")
-        print(f"Attention mask shape: {first_example['attention_mask'].shape}")
-        print(f"Target: {first_example['target']}")
-        model.eval()
-        print(f"\nModel class: {model.__class__.__name__}")
-        print(f"Model device: {next(model.parameters()).device}")
-        with torch.no_grad():
-            predictions = []
-            targets = []
-            # Create DataLoader with batch size 16
-            from torch.utils.data import DataLoader
-            # Define a custom collate function
-            def collate_fn(batch):
-                input_ids = torch.stack([sample['input_ids'] for sample in batch])
-                attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
-                targets = torch.stack([sample['target'] for sample in batch])
-                return {
-                    'input_ids': input_ids,
-                    'attention_mask': attention_mask,
-                    'target': targets
-                }
-            dataloader = DataLoader(
-                dataset,
-                batch_size=16,
-                shuffle=False,
-                collate_fn=collate_fn
-            )
-            for i, batch in enumerate(dataloader):
-                if i % 10 == 0 :
-                    print("\nProcessing first batch...")
-                    print(f"Batch keys: {list(batch.keys())}")
-                    print(f"Target shape: {batch['target'].shape}")
-                inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
-                target = batch['target'].to(device)
-                before = time.time()
-                outputs = model(**inputs)
-                # print(f"\nBatch {i} output type: {type(outputs)}")
-                # Handle different model output formats
-                if isinstance(outputs, dict):
-                    # print(f"Output keys: {list(outputs.keys())}")
-                    if 'logits' in outputs:
-                        logits = outputs['logits']
-                    elif 'prediction_logits' in outputs:
-                        logits = outputs['prediction_logits']
-                    else:
-                        raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
-                elif isinstance(outputs, tuple):
-                    print(f"Output tuple length: {len(outputs)}")
-                    logits = outputs[0]
-                else:
-                    logits = outputs
-                # print(f"Logits shape: {logits.shape}")
-                # For sequence classification, we typically use the [CLS] token's prediction
-                if len(logits.shape) == 3:  # [batch_size, sequence_length, num_classes]
-                    logits = logits[:, 0, :]  # Take the [CLS] token prediction
-                # print(f"Final logits shape: {logits.shape}")
-                batch_predictions = logits.argmax(dim=-1).cpu().tolist()
-                batch_targets = target.cpu().tolist()
-                predictions.extend(batch_predictions)
-                targets.extend(batch_targets)
-                if i % 10 == 0:
-                    print("\nFirst batch predictions:")
-                    print(f"Predictions: {batch_predictions[:5]}")
-                    print(f"Targets: {batch_targets[:5]}")
-            print(f"\nTotal predictions: {len(predictions)}")
-            print(f"Total targets: {len(targets)}")
-            # Calculate accuracy
-            correct = sum(p == t for p, t in zip(predictions, targets))
-            total = len(predictions)
-            accuracy = correct / total if total > 0 else 0.0
-            print(f"\nEvaluation results:")
-            print(f"Correct predictions: {correct}")
-            print(f"Total predictions: {total}")
-            print(f"Accuracy: {accuracy:.4f}")
-            return {"fbougares/tsac": accuracy}
-    except Exception as e:
-        print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
-        print(f"Full traceback: {traceback.format_exc()}")
-        raise e

src/evaluators/madar_tun.py DELETED Viewed

@@ -1,108 +0,0 @@
-import torch
-from datasets import load_dataset
-# from transformers import AutoTokenizer, AutoModel
-from sklearn.metrics import accuracy_score
-# import argparse
-import warnings
-warnings.filterwarnings("ignore")
-def load_and_prepare_data():
-    """Load MADAR-TUN and prepare normalization & transliteration pairs."""
-    print("Loading MADAR-TUN dataset...")
-    ds = load_dataset("tunis-ai/MADAR-TUN", split="train")
-    valid_examples = [
-        ex for ex in ds
-        if ex["arabish"] != "<eos>"
-        and ex["words"] != "<eos>"
-        and ex["lem"] != "<eos>"
-        and ex["arabish"] is not None
-        and ex["arabish"].strip()
-        and ex["words"] is not None
-        and ex["words"].strip()
-        and ex["lem"] is not None
-        and ex["lem"].strip()
-    ]
-    print(f"Loaded {len(valid_examples)} valid token entries.")
-    # Build unique pairs (deduplicate)
-    norm_pairs = {}  # arabish -> canonical lemma
-    trans_pairs = {}  # arabish <-> arabic
-    for ex in valid_examples:
-        arabizi = ex["arabish"]
-        arabic = ex["words"]
-        lemma = ex["lem"]
-        # For normalization: use lemma as canonical form
-        if arabizi not in norm_pairs:
-            norm_pairs[arabizi] = lemma
-        if arabizi not in trans_pairs:
-            trans_pairs[arabizi] = arabic
-    print(f"Normalization pairs: {len(norm_pairs)}")
-    print(f"Transliteration pairs: {len(trans_pairs)}")
-    return norm_pairs, trans_pairs
-def evaluate_word_classification(model, tokenizer, word_pairs, device, task_name):
-    """
-    Evaluate word-level classification (normalization or transliteration).
-    Treats it as closed-vocabulary classification via embedding similarity.
-    """
-    words = list(word_pairs.keys())
-    targets = list(word_pairs.values())
-    # Build target vocabulary
-    unique_targets = sorted(set(targets))
-    target_to_id = {t: i for i, t in enumerate(unique_targets)}
-    _target_ids = [target_to_id[t] for t in targets]
-    print(f"\n[{task_name}] Vocabulary size: {len(unique_targets)}")
-    print(f"[{task_name}] Evaluation samples: {len(words)}")
-    # Get embeddings for all target forms
-    print(f"[{task_name}] Encoding target vocabulary...")
-    target_encodings = tokenizer(
-        unique_targets,
-        padding=True,
-        truncation=True,
-        max_length=32,
-        return_tensors="pt"
-    ).to(device)
-    with torch.no_grad():
-        target_embeds = model(**target_encodings).last_hidden_state[:, 0]  # [V, H]
-    # Predict for each input word
-    predictions = []
-    batch_size = 32
-    print(f"[{task_name}] Predicting...")
-    for i in range(0, len(words), batch_size):
-        batch_words = words[i:i+batch_size]
-        inputs = tokenizer(
-            batch_words,
-            padding=True,
-            truncation=True,
-            max_length=32,
-            return_tensors="pt"
-        ).to(device)
-        with torch.no_grad():
-            word_embeds = model(**inputs).last_hidden_state[:, 0]  # [B, H]
-            logits = torch.matmul(word_embeds, target_embeds.T)  # [B, V]
-            preds = logits.argmax(dim=1).cpu().tolist()
-            predictions.extend(preds)
-    # Map back to target IDs
-    true_labels = [target_to_id[t] for t in targets]
-    acc = accuracy_score(true_labels, predictions)
-    print(f"[{task_name}] Accuracy: {acc:.4f}")
-    return acc

src/leaderboard/read_evals.py CHANGED Viewed

@@ -33,7 +33,7 @@ class EvalResult:
         try:
             with open(json_filepath) as fp:
                 data = json.load(fp)
             # Extract model information from the JSON data
             full_model_name = data.get('model')
             org_and_model = full_model_name.split("/", 1)

         try:
             with open(json_filepath) as fp:
                 data = json.load(fp)
+            print(data)
             # Extract model information from the JSON data
             full_model_name = data.get('model')
             org_and_model = full_model_name.split("/", 1)

src/populate.py CHANGED Viewed

@@ -12,8 +12,10 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)
     if df.empty:
         print("No evaluation results found. Returning empty DataFrame with correct columns.")
         return pd.DataFrame(columns=cols)

 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    # print(raw_data)
     all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)
+    # print(df)
     if df.empty:
         print("No evaluation results found. Returning empty DataFrame with correct columns.")
         return pd.DataFrame(columns=cols)