# OptimAbstract
It aims at building a meta-model on top of T5 model in order to adapt the model choice relatively to the complextity of the text to compress.

Several steps. During learning phase:
1. Find relevant features that represents the complexity with low computational time
2. Apply the candidate models and select the best with regard with a fixed criteria (BertScore)
3. Fit a classifier to predict, from the features, the best model
In the inference: simply predict the classifier, and choose the right model.

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from bert_score import score
from sklearn.ensemble import RandomForestClassifier

from model import MetaModel, save_object
import time
from model import T5Model, extract_features, get_best_model

## Data loading

For the first idea, let us work on a very small amount of data

In [None]:
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train")

In [None]:
# I want a wide diversity of complexity
train_dataset = dataset.map(lambda x: {"text_length": len(x["article"])})
train_dataset = train_dataset.sort("text_length")
num_samples = 500
indices = np.linspace(0, len(train_dataset) - 1, num_samples, dtype=int)
selected_samples = train_dataset.select(indices)
print([ex["text_length"] for ex in selected_samples])

In [None]:
selected_samples

In [None]:
model_names = ["google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large"]

## Exploring features and classifier

In [None]:
models = {name: T5Model(name) for name in model_names}
train_texts = selected_samples["article"]
train_summaries = selected_samples["highlights"]

In [None]:
features_name = list(extract_features(train_texts[0]).keys())

In [None]:
X = np.array([list(extract_features(text).values()) for text in train_texts])

In [None]:
y = get_best_model(models, train_texts, train_summaries, tolerance=0)

In [None]:
import pandas as pd

df = pd.DataFrame(
 columns=["best_model_name"] + features_name, data=np.concatenate((y.reshape(-1, 1), X), axis=1)
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 30))
for i, feature in enumerate(features_name):
 plt.subplot(len(features_name) // 2, len(features_name) // 2, i + 1)
 sns.boxplot(x="best_model_name", y=feature, data=df)
 plt.xticks(rotation=45)
 plt.yticks(rotation=0)
 plt.locator_params(axis="y", nbins=6)
 plt.title(feature)
plt.tight_layout()
plt.show()

The features do not seem to be relevant, I have to work further.

## MetaModel

In [None]:
import warnings
warnings.filterwarnings("ignore", message=".*Some weights of RobertaModel.*")
meta_model = MetaModel(model_names, base_classifier=RandomForestClassifier(), tolerance=0.01)
meta_model.fit(selected_samples["article"], selected_samples["highlights"])
save_object(meta_model, "first_model.pkl")

In [None]:
test_dataset = dataset.shuffle(seed=42).select(range(100))

In [None]:
meta_model_scores = []
meta_model_times = []
model_scores = {name: [] for name in model_names}
model_times = {name: [] for name in model_names}

for i, dataset_ in enumerate(test_dataset):
 predicted_summary, meta_time = meta_model.summarize(dataset_["article"])
 P, R, F1 = score([predicted_summary], [dataset_["highlights"]], lang="en", verbose=False)
 meta_model_scores.append(F1.item())
 meta_model_times.append(meta_time)

 model_results = []
 for model_name in model_names:
 model = meta_model.models[model_name]
 summary, elapsed_time = model.summarize(dataset_["article"])
 P, R, F1 = score([summary], [dataset_["highlights"]], lang="en", verbose=False)
 f1_score = F1.item()

 model_scores[model_name].append(f1_score)
 model_times[model_name].append(elapsed_time)
 model_results.append((model_name, f1_score, elapsed_time))



In [9]:
print("\n===== Model Evaluation =====")
for model_name in model_names:
 avg_score = np.mean(model_scores[model_name])
 avg_time = np.mean(model_times[model_name])
 print(f"{model_name}: BERTScore={avg_score:.4f}, Time={avg_time:.4f}s")

print(
 f" MetaModel : BERTScore={np.mean(meta_model_scores):.4f}, "
 f"Time={np.mean(meta_model_times):.4f}s"
)


===== Model Evaluation =====
google-t5/t5-small: BERTScore=0.8639, Time=2.0048s
google-t5/t5-base: BERTScore=0.8720, Time=5.2173s
google-t5/t5-large: BERTScore=0.8664, Time=15.8678s
 MetaModel : BERTScore=0.8681, Time=3.2380s


17/02/25 : The results are better with tol at 1%. I should rerun with :
- add the feature computation in the meta model time cost
- analyze more deeply the features and the classifier performances
- Should change the MetaModel structure because it is too large to be commited (4GB)