# File size: 6,157 Bytes
import pandas as pd
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
import torch
import openai
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class GenreClassifier(nn.Module):
    """Multi-label genre scorer on top of a pretrained GPT-2 encoder.

    The first element of the GPT-2 output is averaged over the sequence
    dimension, passed through dropout and a linear layer, and squashed with
    a sigmoid so that every genre gets its own independent score in [0, 1].

    Args:
        num_genres: Number of scores produced per input (width of the
            linear head).
    """

    def __init__(self, num_genres=20):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        # Take the width from the loaded checkpoint instead of hard-coding
        # 768, so the head always matches the encoder.
        self.genre_classifier = nn.Linear(self.gpt2.config.hidden_size, num_genres)
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask=None):
        """Average ``hidden_states`` over dim 1, skipping masked-out positions.

        Args:
            hidden_states: Tensor indexed as (batch, position, feature).
            attention_mask: Tensor indexed as (batch, position) whose non-zero
                entries mark positions to include, or ``None`` to average
                over every position.

        Returns:
            Tensor indexed as (batch, feature). A row whose mask is all
            zeros yields zeros rather than NaN.
        """
        if attention_mask is None:
            return hidden_states.mean(dim=1)
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        totals = (hidden_states * weights).sum(dim=1)
        # Clamp so a fully masked row divides by 1 instead of 0.
        counts = weights.sum(dim=1).clamp(min=1.0)
        return totals / counts

    def forward(self, input_ids, attention_mask):
        """Return per-genre sigmoid scores for a batch of token ids.

        Args:
            input_ids: Token ids forwarded to the GPT-2 encoder.
            attention_mask: Mask forwarded to the encoder and also used to
                exclude padded positions from the pooled average.

        Returns:
            Tensor of sigmoid scores with one column per genre.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        # Padding must not dilute the average: callers may pad short texts
        # up to a fixed length, so pool only over real tokens.
        pooled_output = self._masked_mean(outputs[0], attention_mask)
        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)

class BookGenreAnalyzer:
    """Score a book's text against a fixed list of genre labels.

    Scores come from a local ``GenreClassifier``; genres that an OpenAI
    completion also names get their score multiplied by ``GPT3_BOOST``.

    NOTE(review): nothing in this class trains the classifier head or loads
    trained weights for it, so its scores are those of a freshly initialised
    head unless weights are loaded elsewhere.
    NOTE(review): ``openai.Completion`` / ``openai.FineTune`` / ``openai.File``
    look like the pre-1.0 ``openai`` package interface - confirm the
    installed version.
    """

    # Multiplier applied to a genre's score when the completion names it too.
    GPT3_BOOST = 1.2
    # NOTE(review): "gpt-3" does not look like a concrete OpenAI model id;
    # override this attribute with a real base or fine-tuned model name.
    GPT3_MODEL = "gpt-3"

    def __init__(self, api_key):
        """Initialize the analyzer with OpenAI API key.

        Loads the GPT-2 tokenizer, builds the classifier, and downloads the
        NLTK 'punkt' and 'stopwords' resources.

        Args:
            api_key: OpenAI API key, stored on the ``openai`` module.
        """
        # The key is a module-level setting of the openai package; this
        # object has no ``openai`` attribute to assign through.
        openai.api_key = api_key
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 ships without a padding token, and padding='max_length'
        # refuses to run without one; reuse the end-of-text token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.genre_labels = self._load_genre_labels()
        # Size the head from the label list so the two cannot drift apart.
        self.model = GenreClassifier(num_genres=len(self.genre_labels))
        # Inference only: switch dropout off so repeated calls agree.
        self.model.eval()
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Load predefined genre labels.

        Returns:
            List of genre names; position i names the i-th classifier output.
        """
        # You would typically load these from a file or database
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's"
        ]

    def preprocess_text(self, text):
        """Preprocess the book text.

        Lower-cases the text, drops English stop words, and encodes the
        remainder with the GPT-2 tokenizer, truncated and padded to 1024
        tokens.

        Args:
            text: Raw book text.

        Returns:
            Tokenizer output holding PyTorch tensors (``input_ids`` and
            ``attention_mask`` are read by ``extract_features``).
        """
        # Tokenize and remove stop words
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]

        # Convert to GPT2 tokens
        encodings = self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt'
        )
        return encodings

    def extract_features(self, text):
        """Run the classifier on ``text`` without tracking gradients.

        Args:
            text: Raw book text.

        Returns:
            The classifier's output tensor of per-genre sigmoid scores.
        """
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            features = self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask']
            )
        return features

    def fine_tune_with_gpt3(self, training_data):
        """Create an OpenAI fine-tuning job from labelled book texts.

        Args:
            training_data: Iterable of ``(book_text, genres)`` pairs, where
                ``genres`` is an iterable of genre-name strings.

        Returns:
            The response of ``openai.FineTune.create``, or ``None`` when
            there are no examples or the upload or job creation fails (the
            error is printed).
        """
        # Prepare training data in the format expected by OpenAI
        formatted_data = [
            {
                "prompt": f"Book text: {book_text[:1000]}...\nGenres:",
                "completion": f" {', '.join(genres)}"
            }
            for book_text, genres in training_data
        ]
        if not formatted_data:
            # Nothing to learn from; do not upload an empty file.
            print("Fine-tuning error: no training examples provided")
            return None

        # Create fine-tuning job
        try:
            response = openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model=self.GPT3_MODEL,
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1
            )
            return response
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Upload training data to OpenAI.

        Writes one JSON object per line to ``training_data.jsonl`` in the
        current working directory, then uploads that file.

        Args:
            formatted_data: List of JSON-serialisable prompt/completion dicts.

        Returns:
            The ``id`` attribute of the ``openai.File.create`` response.
        """
        import json
        with open('training_data.jsonl', 'w') as f:
            for entry in formatted_data:
                json.dump(entry, f)
                f.write('\n')

        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(
                file=f,
                purpose='fine-tune'
            )
        return response.id

    @staticmethod
    def _parse_gpt3_genres(text):
        """Split a completion's text into clean genre names.

        Accepts comma- or newline-separated names, trims whitespace and a
        trailing period, and drops empty entries.
        """
        pieces = text.replace('\n', ',').split(',')
        cleaned = (piece.strip().rstrip('.').strip() for piece in pieces)
        return [name for name in cleaned if name]

    @staticmethod
    def _boost_scores(genres_with_scores, gpt3_genres, boost=1.2):
        """Return a new score list with completion-named genres boosted.

        Matching ignores case and surrounding whitespace. A boosted score is
        capped at 1.0.
        """
        mentioned = {name.strip().casefold() for name in gpt3_genres}
        return [
            (genre, min(1.0, score * boost) if genre.casefold() in mentioned else score)
            for genre, score in genres_with_scores
        ]

    def analyze_book(self, book_text):
        """Analyze a book and return top 20 genres with confidence scores.

        Args:
            book_text: Raw book text; only the first 1000 characters are sent
                in the completion prompt.

        Returns:
            List of ``(genre, score)`` tuples sorted by descending score, at
            most 20 long. If the completion call fails the local scores are
            returned unboosted.
        """
        # Get base predictions from our model
        features = self.extract_features(book_text)
        predictions = features.detach().cpu().numpy()[0]

        # Use GPT-3 to enhance predictions
        try:
            response = openai.Completion.create(
                model=self.GPT3_MODEL,  # Use fine-tuned model ID if available
                prompt=f"Book text: {book_text[:1000]}...\nGenres:",
                max_tokens=100,
                temperature=0.3
            )
            gpt3_genres = self._parse_gpt3_genres(response.choices[0].text)
        except Exception as e:
            # Best effort: the boost is optional, so report and carry on.
            print(f"GPT-3 genre lookup error: {e}")
            gpt3_genres = []

        # Combine both predictions
        genres_with_scores = [
            (genre, float(score))
            for genre, score in zip(self.genre_labels, predictions)
        ]

        # Boost scores for genres mentioned by GPT-3. Tuples are immutable,
        # so the boosted list has to be rebuilt rather than edited in a loop.
        genres_with_scores = self._boost_scores(
            genres_with_scores, gpt3_genres, self.GPT3_BOOST
        )

        # Sort and return top 20
        return sorted(genres_with_scores, key=lambda x: x[1], reverse=True)[:20]

# Example usage
def main():
    """Demonstrate genre analysis and fine-tuning with placeholder inputs."""
    # Placeholder inputs for the demonstration
    sample_text = """
    [Your book text here]
    """
    labelled_examples = [
        ("Book 1 text...", ["Mystery", "Thriller"]),
        ("Book 2 text...", ["Science Fiction", "Adventure"]),
        # Add more training examples
    ]

    book_analyzer = BookGenreAnalyzer('your-api-key')

    # Rank the genres for the sample text and show them as percentages
    ranked = book_analyzer.analyze_book(sample_text)
    print("\nTop 20 Genres:")
    for name, score in ranked:
        print(f"{name}: {score:.2%}")

    # Start a fine-tuning job; a falsy result means it was not created
    job = book_analyzer.fine_tune_with_gpt3(labelled_examples)
    if not job:
        return
    print("\nFine-tuning job created successfully!")

# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()