"""
Data preprocessing for fine-tuning on Iain Morris articles
"""
import json
import logging
import random
import re
from typing import List, Dict, Tuple

from datasets import Dataset
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ArticlePreprocessor:
def __init__(self):
"""Initialize the preprocessor"""
self.min_content_length = 500
self.max_content_length = 8000
self.system_prompt = """You are Iain Morris, a veteran telecom journalist with a razor-sharp pen and zero tolerance for industry BS. Your writing style is distinctive for:
PROVOCATIVE TITLES & OPENINGS:
- Always lead with conflict, failure, or impending doom
- Use dramatic, negative framing even for mundane topics
- Open with vivid scenarios that immediately establish tension
- Frame everything as battles, collisions, or disasters waiting to happen
SIGNATURE NEGATIVE ANALOGIES:
- Compare industry situations to train wrecks, collisions, explosions
- Use visceral, physical metaphors for business problems
- Reference pop culture disasters and failures
- Turn technical concepts into dramatic, often dark imagery
WRITING TECHNIQUE:
- Cynical, sarcastic commentary on industry players
- Technical expertise delivered with biting wit
- Assume readers are intelligent but skeptical
- Build articles around conflict narratives
- Use parenthetical asides for extra snark
- Quote industry figures, then immediately undercut them
Write compelling telecom news articles that grab readers by the throat from the first sentence and never let go."""
def load_articles(self, filepath: str) -> List[Dict]:
"""
Load articles from JSON file
Args:
filepath: Path to the JSON file containing articles
Returns:
List of article dictionaries
"""
try:
with open(filepath, 'r', encoding='utf-8') as f:
articles = json.load(f)
logger.info(f"Loaded {len(articles)} articles from {filepath}")
return articles
except Exception as e:
logger.error(f"Error loading articles: {e}")
return []
def clean_content(self, content: str) -> str:
"""
Clean article content for training
Args:
content: Raw article content
Returns:
Cleaned content
"""
if not content:
return ""
        # Remove URLs (strip anything from http(s):// up to the next whitespace)
        content = re.sub(r'https?://\S+', '', content)
# Remove email addresses
content = re.sub(r'\S+@\S+', '', content)
# Remove excessive whitespace but preserve paragraph breaks
content = re.sub(r'[ \t]+', ' ', content) # Multiple spaces/tabs to single space
content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content) # Multiple line breaks to double
# Enhanced footer/header cleaning for new crawler format
content = re.sub(r'Light Reading.*?All rights reserved\.?', '', content, flags=re.IGNORECASE)
content = re.sub(r'Copyright.*?Light Reading', '', content, flags=re.IGNORECASE)
content = re.sub(r'Copyright.*?Informa.*?TechTarget.*?registered office.*?', '', content, flags=re.IGNORECASE | re.DOTALL)
content = re.sub(r'You May Also Like.*?$', '', content, flags=re.IGNORECASE | re.DOTALL)
content = re.sub(r'Featured Story.*?$', '', content, flags=re.IGNORECASE | re.DOTALL)
content = re.sub(r'Read more about:.*?$', '', content, flags=re.IGNORECASE | re.DOTALL)
content = re.sub(r'Subscribe.*?newsletter', '', content, flags=re.IGNORECASE)
content = re.sub(r'Follow.*?Twitter', '', content, flags=re.IGNORECASE)
# Remove author bio sections (they appear at the end)
content = re.sub(r'Iain Morris International Editor, Light Reading.*?$', '', content, flags=re.IGNORECASE | re.DOTALL)
# Remove advertisement markers
content = re.sub(r'\[Advertisement\]', '', content, flags=re.IGNORECASE)
content = re.sub(r'ADVERTISEMENT', '', content, flags=re.IGNORECASE)
        # Normalize curly quotes and apostrophes to their straight ASCII equivalents
        content = content.replace('\u201c', '"').replace('\u201d', '"')
        content = content.replace('\u2018', "'").replace('\u2019', "'")
# Remove trailing whitespace and normalize line endings
content = '\n'.join(line.rstrip() for line in content.split('\n'))
content = content.strip()
return content
    def style_score(self, title: str, content: str) -> int:
        """
        Score how strongly an article shows Iain Morris's provocative style
        Args:
            title: Article title
            content: Article content
        Returns:
            Combined count of provocative, negative-analogy and cynical markers
        """
        # Provocative title indicators
        provocative_words = [
            'danger', 'threat', 'crisis', 'disaster', 'collapse', 'failure', 'fiasco',
            'wreck', 'crash', 'collision', 'explosion', 'doom', 'catastrophe',
            'doubt', 'question', 'challenge', 'attack', 'battle', 'war', 'fight',
            'gross', 'massive', 'huge', 'epic', 'monster', 'brutal'
        ]
        title_lower = title.lower()
        title_score = sum(1 for word in provocative_words if word in title_lower)
        # Negative analogy indicators in content
        analogy_patterns = [
            r'train wreck', r'off the rails', r'collision', r'explosion', r'fiasco',
            r'disaster', r'catastrophe', r'meltdown', r'implosion', r'crash',
            r'like.*disaster', r'as.*wreck', r'resembl.*catastrophe'
        ]
        content_lower = content.lower()
        analogy_score = sum(1 for pattern in analogy_patterns if re.search(pattern, content_lower))
        # Sarcastic/cynical indicators
        cynical_patterns = [
            r'of course', r'naturally', r'predictably', r'unsurprisingly',
            r'needless to say', r'obviously', r'clearly', r'evidently'
        ]
        cynical_score = sum(1 for pattern in cynical_patterns if re.search(pattern, content_lower))
        return title_score + analogy_score + cynical_score

    def has_provocative_elements(self, title: str, content: str) -> bool:
        """
        Check if article has Iain Morris's provocative style elements
        Args:
            title: Article title
            content: Article content
        Returns:
            True if article has strong stylistic elements
        """
        return self.style_score(title, content) >= 2  # Require at least 2 style elements
def extract_topic_from_title(self, title: str) -> str:
"""
Extract a topic prompt from the article title, preserving provocative framing
Args:
title: Article title
Returns:
Topic prompt for training
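        Example (illustrative title):
            "Huawei's 5G disaster deepens" ->
            "Analyze the controversy and implications of: Huawei's 5G disaster deepens"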
"""
# Preserve provocative elements in the topic
topic = title
# For provocative titles, maintain the dramatic framing
provocative_starters = [
'danger', 'threat', 'crisis', 'disaster', 'collapse', 'failure',
'doubt', 'question', 'challenge', 'attack', 'battle'
]
title_lower = title.lower()
is_provocative = any(starter in title_lower for starter in provocative_starters)
if is_provocative:
# Keep the provocative framing
if topic.endswith('?'):
topic = topic[:-1]
return f"Analyze the controversy and implications of: {topic}"
else:
# Standard topic extraction for less provocative titles
if topic.endswith('?'):
topic = topic[:-1]
if not topic.lower().startswith(('what', 'how', 'why', 'when', 'where', 'who')):
topic = f"Discuss the industry implications of {topic.lower()}"
# Add context if too short
if len(topic.split()) < 3:
topic = f"Write about {topic} in the telecom industry"
return topic
def filter_articles(self, articles: List[Dict]) -> List[Dict]:
"""
Filter articles based on quality criteria and prioritize provocative style
Args:
articles: List of article dictionaries
Returns:
Filtered list of articles, sorted by style strength
"""
filtered = []
style_scores = []
for article in articles:
content = article.get('content', '')
title = article.get('title', '')
# Skip if missing essential fields
if not content or not title:
continue
# Skip if content is too short or too long
if len(content) < self.min_content_length or len(content) > self.max_content_length:
continue
# Skip if title is too generic
if len(title.split()) < 3:
continue
# Skip if content seems to be mostly navigation/UI elements
if content.count('Click') > 5 or content.count('Subscribe') > 3:
continue
            # Score the cleaned article so the most Morris-like pieces sort first
            cleaned_content = self.clean_content(content)
            total_style_score = self.style_score(title, cleaned_content)
            filtered.append(article)
            style_scores.append(total_style_score)
# Sort by style score (highest first) to prioritize provocative articles
sorted_pairs = sorted(zip(filtered, style_scores), key=lambda x: x[1], reverse=True)
filtered = [article for article, score in sorted_pairs]
# Count articles with strong style elements
strong_style_count = sum(1 for score in style_scores if score >= 2)
logger.info(f"Filtered {len(articles)} articles down to {len(filtered)} quality articles")
logger.info(f"Articles with strong Iain Morris style elements: {strong_style_count}")
return filtered
def create_training_examples(self, articles: List[Dict]) -> List[Dict]:
"""
Create training examples in instruction-response format
Args:
articles: List of article dictionaries
Returns:
List of training examples
"""
training_examples = []
for article in articles:
title = article.get('title', '')
content = self.clean_content(article.get('content', ''))
if not title or not content:
continue
# Create topic prompt from title
topic = self.extract_topic_from_title(title)
# Create training example
example = {
'instruction': f"Write a telecom industry news article about: {topic}",
'input': "",
'output': f"# {title}\n\n{content}",
'system': self.system_prompt
}
training_examples.append(example)
logger.info(f"Created {len(training_examples)} training examples")
return training_examples
def create_chat_format(self, examples: List[Dict]) -> List[Dict]:
"""
Convert examples to chat format for training
Args:
examples: List of training examples
Returns:
List of examples in chat format
"""
chat_examples = []
for example in examples:
chat_example = {
'messages': [
{
'role': 'system',
'content': example['system']
},
{
'role': 'user',
'content': example['instruction']
},
{
'role': 'assistant',
'content': example['output']
}
]
}
chat_examples.append(chat_example)
return chat_examples
def split_dataset(self, examples: List[Dict], train_ratio: float = 0.9) -> Tuple[List[Dict], List[Dict]]:
"""
Split dataset into train and validation sets
Args:
examples: List of training examples
train_ratio: Ratio of examples to use for training
Returns:
Tuple of (train_examples, val_examples)
"""
split_idx = int(len(examples) * train_ratio)
        # Shuffle with a fixed seed so the train/val split is reproducible across runs
        random.seed(42)
        shuffled = examples.copy()
        random.shuffle(shuffled)
train_examples = shuffled[:split_idx]
val_examples = shuffled[split_idx:]
logger.info(f"Split dataset: {len(train_examples)} train, {len(val_examples)} validation")
return train_examples, val_examples
def save_dataset(self, examples: List[Dict], filepath: str):
"""
Save dataset to JSON file
Args:
examples: List of examples
filepath: Output file path
"""
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(examples, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(examples)} examples to {filepath}")
def create_hf_dataset(self, examples: List[Dict]) -> Dataset:
"""
Create Hugging Face Dataset object
Args:
examples: List of training examples
Returns:
Hugging Face Dataset
"""
return Dataset.from_list(examples)
def process_articles(self, input_file: str, output_dir: str = "data"):
"""
Complete preprocessing pipeline
Args:
input_file: Path to raw articles JSON file
output_dir: Directory to save processed data
"""
logger.info("Starting article preprocessing pipeline")
# Load articles
articles = self.load_articles(input_file)
if not articles:
logger.error("No articles loaded, exiting")
return
        # Filtering disabled: pass all articles through (restore self.filter_articles(articles) to re-enable)
        filtered_articles = articles
if not filtered_articles:
logger.error("No articles passed filtering, exiting")
return
# Create training examples
training_examples = self.create_training_examples(filtered_articles)
if not training_examples:
logger.error("No training examples created, exiting")
return
        # Load supplementary chat-format examples and merge them into the training set
        logger.info("Loading additional training examples from supplementary files")
        supplementary_files = [
            'data/additional_training_examples.json',  # general Iain Morris style examples
            'data/expanded_train_dataset.json',        # expanded telecom training dataset
        ]
        for supplementary_file in supplementary_files:
            try:
                with open(supplementary_file, 'r', encoding='utf-8') as f:
                    extra_examples = json.load(f)
                logger.info(f"Loaded {len(extra_examples)} examples from {supplementary_file}")
                # Convert chat format back to instruction/response records
                for example in extra_examples:
                    if 'messages' in example and len(example['messages']) >= 3:
                        training_examples.append({
                            'instruction': example['messages'][1]['content'],
                            'input': "",
                            'output': example['messages'][2]['content'],
                            'system': example['messages'][0]['content'],
                        })
            except Exception as e:
                logger.warning(f"Could not load {supplementary_file}: {e}")
logger.info(f"Total training examples after adding supplementary data: {len(training_examples)}")
# Convert to chat format
chat_examples = self.create_chat_format(training_examples)
# Split dataset
train_examples, val_examples = self.split_dataset(chat_examples)
# Save datasets
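        # train/val files are chat-format (messages); processed_dataset.json keeps the flat instruction records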
self.save_dataset(train_examples, f"{output_dir}/train_dataset.json")
self.save_dataset(val_examples, f"{output_dir}/val_dataset.json")
self.save_dataset(training_examples, f"{output_dir}/processed_dataset.json")
# Create and save HF datasets
train_dataset = self.create_hf_dataset(train_examples)
val_dataset = self.create_hf_dataset(val_examples)
train_dataset.save_to_disk(f"{output_dir}/train_hf_dataset")
val_dataset.save_to_disk(f"{output_dir}/val_hf_dataset")
        # Print summary
        print("\nPreprocessing Summary:")
        print(f"Original articles: {len(articles)}")
        print(f"Filtered articles: {len(filtered_articles)}")
        print(f"Training examples: {len(train_examples)}")
        print(f"Validation examples: {len(val_examples)}")
        # Show average length and a sample (guarded so an empty split cannot divide by zero)
        if train_examples:
            avg_length = sum(len(ex['messages'][2]['content']) for ex in train_examples) // len(train_examples)
            print(f"Average article length: {avg_length} characters")
            print("\nSample training example:")
            sample = train_examples[0]
            print(f"User: {sample['messages'][1]['content'][:100]}...")
            print(f"Assistant: {sample['messages'][2]['content'][:200]}...")
def main():
"""
Main function to run preprocessing
"""
preprocessor = ArticlePreprocessor()
preprocessor.process_articles("data/raw_articles.json")
if __name__ == "__main__":
main()