import json
import os
from collections import Counter

import nltk
import numpy as np
import openai
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
from transformers import GPT2Model, GPT2Tokenizer
class GenreClassifier(nn.Module):
    """Multi-label genre classifier on top of a pretrained GPT-2 encoder.

    GPT-2 hidden states are mean-pooled over the sequence dimension, passed
    through dropout and a linear layer, and squashed with a sigmoid so that
    every genre gets an independent score in ``[0, 1]``.

    Args:
        num_genres: Number of output genres (width of the linear head).
    """

    def __init__(self, num_genres=20):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        # Take the width from the loaded config rather than hard-coding 768,
        # so the head still fits if a different GPT-2 checkpoint is loaded.
        self.genre_classifier = nn.Linear(self.gpt2.config.hidden_size, num_genres)
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask=None):
        """Average ``hidden_states`` over dim 1, skipping masked-out positions.

        Args:
            hidden_states: Float tensor indexed as (batch, position, feature).
            attention_mask: Tensor indexed as (batch, position) with non-zero
                entries for real tokens, or ``None`` for a plain mean.

        Returns:
            Tensor indexed as (batch, feature).
        """
        if attention_mask is None:
            return hidden_states.mean(dim=1)
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        # Clamp so a fully masked row yields zeros instead of NaN.
        token_count = weights.sum(dim=1).clamp(min=1.0)
        return (hidden_states * weights).sum(dim=1) / token_count

    def forward(self, input_ids, attention_mask):
        """Return per-genre sigmoid scores for a batch of token ids.

        Args:
            input_ids: Token ids passed straight to GPT-2.
            attention_mask: Mask passed to GPT-2 and reused for pooling, so
                padded positions do not dilute the pooled representation.

        Returns:
            Tensor of scores with one column per genre.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self._masked_mean(outputs[0], attention_mask)
        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)
class BookGenreAnalyzer:
    """Score a book's text against a fixed list of genres.

    Scores come from a local GPT-2 based ``GenreClassifier``; genres that an
    OpenAI completion model also names for the text get their score boosted.

    NOTE(review): no trained classifier weights are loaded anywhere in this
    class, so the linear head is freshly initialised unless weights are
    loaded elsewhere - confirm before trusting the scores.
    """

    def __init__(self, api_key):
        """Initialize the analyzer with OpenAI API key"""
        # The key is a module-level setting of the openai package, not an
        # attribute of this instance.
        openai.api_key = api_key
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 has no padding token by default; reuse EOS so that
        # padding='max_length' in preprocess_text does not raise.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.genre_labels = self._load_genre_labels()
        # Size the head from the label list so scores and labels always align.
        self.model = GenreClassifier(num_genres=len(self.genre_labels))
        # Inference only: switch dropout off (torch.no_grad alone does not).
        self.model.eval()
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Load predefined genre labels"""
        # You would typically load these from a file or database
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's",
        ]

    def preprocess_text(self, text):
        """Lower-case, drop stop words, and encode the text for GPT-2.

        Args:
            text: Raw book text.

        Returns:
            Tokenizer output (PyTorch tensors) truncated and padded to
            1024 tokens.
        """
        # Tokenize and remove stop words
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]
        # Convert to GPT2 tokens
        return self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt',
        )

    def extract_features(self, text):
        """Run the classifier on ``text`` and return its per-genre scores."""
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            return self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask'],
            )

    def fine_tune_with_gpt3(self, training_data, base_model="davinci"):
        """Start an OpenAI fine-tuning job from labelled books.

        Args:
            training_data: Iterable of ``(book_text, genres)`` pairs, where
                ``genres`` is an iterable of genre names.
            base_model: Name of the OpenAI base model to fine-tune.

        Returns:
            The response of ``openai.FineTune.create``, or ``None`` when
            there is no training data or the request fails.
        """
        # Prepare training data in the format expected by OpenAI
        formatted_data = [
            {
                "prompt": f"Book text: {book_text[:1000]}...\nGenres:",
                "completion": f" {', '.join(genres)}",
            }
            for book_text, genres in training_data
        ]
        if not formatted_data:
            print("Fine-tuning error: no training data provided")
            return None
        # Create fine-tuning job
        try:
            return openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model=base_model,
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1,
            )
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Write the examples to ``training_data.jsonl`` and upload the file.

        Args:
            formatted_data: List of prompt/completion dicts.

        Returns:
            The id of the uploaded OpenAI file.
        """
        with open('training_data.jsonl', 'w') as f:
            f.writelines(json.dumps(entry) + '\n' for entry in formatted_data)
        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(file=f, purpose='fine-tune')
        return response.id

    @staticmethod
    def _parse_genres(completion_text):
        """Split a comma/newline separated completion into casefolded names."""
        pieces = completion_text.replace('\n', ',').split(',')
        return {piece.strip().casefold() for piece in pieces if piece.strip()}

    @staticmethod
    def _rank_genres(labels, scores, boosted, top_k=20, boost=1.2):
        """Pair labels with scores, boost the named genres, and rank them.

        Args:
            labels: Genre names, aligned with ``scores``.
            scores: One numeric score per label.
            boosted: Casefolded genre names whose score is multiplied by
                ``boost``.
            top_k: Maximum number of entries to return.
            boost: Multiplier applied to boosted genres.

        Returns:
            List of ``(genre, score)`` tuples, highest score first.
        """
        ranked = []
        for genre, score in zip(labels, scores):
            score = float(score)
            if genre.casefold() in boosted:
                # Cap at 1.0 so a boosted score still reads as a confidence.
                score = min(score * boost, 1.0)
            ranked.append((genre, score))
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:top_k]

    def analyze_book(self, book_text, top_k=20, completion_model="davinci"):
        """Analyze a book and return its top genres with confidence scores.

        Args:
            book_text: Raw book text; only the first 1000 characters are
                sent to the completion model.
            top_k: Maximum number of genres to return.
            completion_model: OpenAI model used for the genre suggestions
                (use the fine-tuned model id if available).

        Returns:
            List of ``(genre, score)`` tuples sorted by descending score.
        """
        # Get base predictions from our model
        features = self.extract_features(book_text)
        predictions = features.detach().cpu().numpy()[0]
        # Use GPT-3 to enhance predictions; this step is best-effort, so a
        # failed request falls back to the local scores alone.
        try:
            response = openai.Completion.create(
                model=completion_model,
                prompt=f"Book text: {book_text[:1000]}...\nGenres:",
                max_tokens=100,
                temperature=0.3,
            )
            gpt3_genres = self._parse_genres(response.choices[0].text)
        except Exception as e:
            print(f"GPT-3 enhancement skipped: {e}")
            gpt3_genres = set()
        # Combine both predictions, boosting genres mentioned by GPT-3
        return self._rank_genres(
            self.genre_labels, predictions, gpt3_genres, top_k=top_k
        )
# Example usage
def main():
    """Demo: score a placeholder book text, then start a fine-tuning job."""
    # Initialize analyzer. Prefer a key from the environment so a real key
    # never has to be written into the source; fall back to the placeholder.
    api_key = os.environ.get('OPENAI_API_KEY', 'your-api-key')
    analyzer = BookGenreAnalyzer(api_key)

    # Example book text
    book_text = """
    [Your book text here]
    """

    # Get genre predictions
    genres = analyzer.analyze_book(book_text)

    # Print results
    print("\nTop 20 Genres:")
    for genre, confidence in genres:
        print(f"{genre}: {confidence:.2%}")

    # Example of fine-tuning
    training_data = [
        ("Book 1 text...", ["Mystery", "Thriller"]),
        ("Book 2 text...", ["Science Fiction", "Adventure"]),
        # Add more training examples
    ]
    fine_tune_response = analyzer.fine_tune_with_gpt3(training_data)
    if fine_tune_response:
        print("\nFine-tuning job created successfully!")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()