LLMDH
/

French-TV-Headline-Classification

# Classify every row of a parquet dataset with the
# "PleIAs/French-TV-Headline-Classification" model and write one CSV line per
# (text, label) pair to "transcript_classification.csv".
#
# Input columns read from the parquet file: "corrected_text" (the text that is
# classified) and "identifier" (copied through to the output unchanged).
# Output columns: text_id, label, score (percentage, 2 decimals), identifier.
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm

# Constants
# Number of DataFrame rows handed to the pipeline per call. It only controls
# how the dataset is chunked here; it is not forwarded to the pipeline.
batch_size = 1000
# Column order of the output table, also used when the dataset is empty.
output_columns = ["text_id", "label", "score", "identifier"]

# Load tokenizer and model
model_checkpoint = "PleIAs/French-TV-Headline-Classification"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Read the dataset
val_classification = pd.read_parquet("[file]")
# Positional index so that text_id below is simply the row position.
val_classification.reset_index(drop=True, inplace=True)

# Calculate the number of batches needed (ceiling division)
num_batches = (len(val_classification) + batch_size - 1) // batch_size

# Initialize the list to collect one DataFrame per processed batch
list_df = []
for i in tqdm(range(num_batches), desc="Processing batches"):
    start_index = i * batch_size
    end_index = min((i + 1) * batch_size, len(val_classification))
    batch = val_classification.iloc[start_index:end_index]

    # Pull both columns out once per batch instead of doing a row lookup
    # for every (text, label) pair.
    texts = batch["corrected_text"].tolist()
    identifiers = batch["identifier"].tolist()

    # top_k=None requests the score of every label for each text, so each
    # element of `classifications` is a list of {'label', 'score'} dicts.
    classifications = classification_pipeline(texts, truncation=True, padding=True, top_k=None)

    # One output row per (text, label) pair; text_id is the global row position.
    rows = [
        {
            'text_id': start_index + text_index,
            'label': entry['label'],
            'score': round(entry['score'] * 100, 2),
            'identifier': identifier,
        }
        for text_index, (identifier, class_results) in enumerate(zip(identifiers, classifications))
        for entry in class_results
    ]
    list_df.append(pd.DataFrame(rows, columns=output_columns))

# Concatenate all DataFrames in the list. pd.concat raises ValueError on an
# empty list, so an empty dataset yields an empty table with the same columns.
if list_df:
    final_df = pd.concat(list_df, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=output_columns)
print(final_df)

# Save the resulting DataFrame to a CSV file
final_df.to_csv("transcript_classification.csv", index=False)