import json
import os
from collections import Counter

import nltk
import numpy as np
import openai
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model

class GenreClassifier(nn.Module):
    """Multi-label genre classifier: a GPT-2 backbone with a linear head.

    The backbone's per-token hidden states are mean-pooled into one vector
    per sequence, passed through dropout and a linear layer, and squashed
    with a sigmoid so every genre gets an independent score in [0, 1].
    """

    def __init__(self, num_genres=20):
        """Build the backbone and the classification head.

        Args:
            num_genres: Number of output scores (one per genre label).
        """
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        # Read the width from the backbone config (768 for 'gpt2') so the
        # head always matches the loaded checkpoint.
        self.genre_classifier = nn.Linear(self.gpt2.config.n_embd, num_genres)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        """Return per-genre sigmoid scores for a batch of token sequences.

        Args:
            input_ids: Token ids forwarded to the GPT-2 backbone.
            attention_mask: Optional mask forwarded to the backbone and used
                to exclude masked-out (padding) positions from the pooling.

        Returns:
            Tensor with one row per sequence and one column per genre.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self._masked_mean(outputs[0], attention_mask)
        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average hidden states over dim 1, ignoring masked-out positions.

        Without a mask this is a plain mean over dim 1. With a mask, only
        positions whose mask value is non-zero contribute, so padding does
        not dilute the pooled vector. A fully masked row yields zeros
        instead of dividing by zero.
        """
        if attention_mask is None:
            return hidden_states.mean(dim=1)
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        return (hidden_states * mask).sum(dim=1) / token_counts

class BookGenreAnalyzer:
    """Score a book against a fixed genre list.

    Scores come from a local GPT-2 based ``GenreClassifier``; genres that an
    OpenAI completion also names receive a multiplicative boost.

    NOTE(review): the ``openai.FineTune`` / ``openai.Completion`` /
    ``openai.File`` module-level resources used here belong to the pre-1.0
    ``openai`` SDK interface -- confirm the pinned SDK version.
    """

    # NOTE(review): "gpt-3" does not look like a concrete OpenAI model id;
    # confirm the intended base model before relying on the remote calls.
    GPT3_BASE_MODEL = "gpt-3"
    PROMPT_CHARS = 1000  # leading characters of the book sent in a prompt
    GPT3_BOOST = 1.2     # multiplier for genres the completion also names
    TOP_K = 20           # maximum number of genres returned

    def __init__(self, api_key, completion_model=None):
        """Initialize the analyzer with OpenAI API key.

        Args:
            api_key: OpenAI API key, installed on the ``openai`` module.
            completion_model: Model id used by ``analyze_book`` (e.g. a
                fine-tuned model id). Defaults to ``GPT3_BASE_MODEL``.
        """
        openai.api_key = api_key
        self.completion_model = completion_model or self.GPT3_BASE_MODEL
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 ships without a padding token, but preprocess_text pads to a
        # fixed length; reuse the end-of-text token for padding.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.genre_labels = self._load_genre_labels()
        # Size the head from the label list so scores and labels line up.
        self.model = GenreClassifier(num_genres=len(self.genre_labels))
        # Inference only: eval() switches dropout off so repeated calls on
        # the same text give the same scores.
        self.model.eval()
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Load predefined genre labels"""
        # You would typically load these from a file or database
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's"
        ]

    @classmethod
    def _build_prompt(cls, book_text):
        """Return the prompt shared by fine-tuning examples and inference."""
        return f"Book text: {book_text[:cls.PROMPT_CHARS]}...\nGenres:"

    @staticmethod
    def _parse_genre_list(completion_text):
        """Split a completion into lower-cased genre names.

        Entries may be separated by commas or newlines; surrounding
        whitespace and trailing periods are dropped, empty entries skipped.
        """
        names = []
        for part in completion_text.replace('\n', ',').split(','):
            name = part.strip().rstrip('.').strip().casefold()
            if name:
                names.append(name)
        return names

    def _rank_genres(self, predictions, gpt3_genres):
        """Pair labels with scores, boost GPT-3 matches, return the top ones.

        Args:
            predictions: One score per entry of ``self.genre_labels``.
            gpt3_genres: Genre names suggested by GPT-3 (matched
                case-insensitively against the labels).

        Returns:
            Up to ``TOP_K`` ``(genre, score)`` tuples, highest score first.
            Boosted scores are capped at 1.0 so they remain confidences.
        """
        suggested = {name.casefold() for name in gpt3_genres}
        scored = []
        for genre, score in zip(self.genre_labels, predictions):
            score = float(score)
            if genre.casefold() in suggested:
                score = min(score * self.GPT3_BOOST, 1.0)
            scored.append((genre, score))
        scored.sort(key=lambda item: item[1], reverse=True)
        return scored[:self.TOP_K]

    def preprocess_text(self, text):
        """Preprocess the book text.

        Lower-cases the text, tokenizes it with NLTK, drops English stop
        words, re-joins the remaining tokens with spaces and encodes the
        result with the GPT-2 tokenizer, truncated/padded to 1024 tokens.

        Returns:
            The tokenizer output (PyTorch tensors), including ``input_ids``
            and ``attention_mask``.
        """
        # Tokenize and remove stop words
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]

        # Convert to GPT2 tokens
        encodings = self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt'
        )
        return encodings

    def extract_features(self, text):
        """Run the classifier on ``text`` and return its per-genre scores.

        Gradients are disabled; the result is the model's output tensor for
        a batch containing the single preprocessed text.
        """
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            features = self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask']
            )
        return features

    def fine_tune_with_gpt3(self, training_data):
        """Create an OpenAI fine-tuning job from labelled books.

        Args:
            training_data: Iterable of ``(book_text, genres)`` pairs, where
                ``genres`` is an iterable of genre names.

        Returns:
            The response of the fine-tune creation call, or ``None`` if the
            upload or the job creation raised (the error is printed).
        """
        # Prepare training data in the format expected by OpenAI
        formatted_data = [
            {
                "prompt": self._build_prompt(book_text),
                "completion": f" {', '.join(genres)}",
            }
            for book_text, genres in training_data
        ]

        # Create fine-tuning job; the upload runs inside the try so that
        # file and network errors are reported the same way.
        try:
            response = openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model=self.GPT3_BASE_MODEL,
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1
            )
            return response
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Write the examples to ``training_data.jsonl`` and upload the file.

        Returns:
            The id of the uploaded OpenAI file.
        """
        with open('training_data.jsonl', 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(entry) + '\n' for entry in formatted_data)

        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(
                file=f,
                purpose='fine-tune'
            )
        return response.id

    def analyze_book(self, book_text):
        """Analyze a book and return top 20 genres with confidence scores.

        The local classifier provides the base scores. An OpenAI completion
        is then requested on a best-effort basis: if it fails, the error is
        printed and the unboosted scores are used.

        Returns:
            List of ``(genre, score)`` tuples sorted by descending score.
        """
        # Get base predictions from our model
        features = self.extract_features(book_text)
        predictions = features.detach().cpu().numpy()[0]

        # Use GPT-3 to enhance predictions
        try:
            response = openai.Completion.create(
                model=self.completion_model,
                prompt=self._build_prompt(book_text),
                max_tokens=100,
                temperature=0.3
            )
            gpt3_genres = self._parse_genre_list(response.choices[0].text)
        except Exception as e:
            # Best effort: fall back to the local scores, but say why.
            print(f"GPT-3 enhancement error: {e}")
            gpt3_genres = []

        # Combine both predictions, boost GPT-3 matches, keep the top ones
        return self._rank_genres(predictions, gpt3_genres)

# Example usage

def main():
    """Demo: score a placeholder book text, then start a fine-tuning job.

    The API key is read from the ``OPENAI_API_KEY`` environment variable,
    falling back to the ``'your-api-key'`` placeholder when it is unset.
    """
    # Initialize analyzer
    analyzer = BookGenreAnalyzer(os.environ.get('OPENAI_API_KEY', 'your-api-key'))

    # Example book text
    book_text = """
[Your book text here]
"""

    # Get genre predictions
    genres = analyzer.analyze_book(book_text)

    # Print results
    print("\nTop 20 Genres:")
    for genre, confidence in genres:
        print(f"{genre}: {confidence:.2%}")

    # Example of fine-tuning
    training_data = [
        ("Book 1 text...", ["Mystery", "Thriller"]),
        ("Book 2 text...", ["Science Fiction", "Adventure"]),
        # Add more training examples
    ]

    fine_tune_response = analyzer.fine_tune_with_gpt3(training_data)
    if fine_tune_response:
        print("\nFine-tuning job created successfully!")

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()