|
import json
from collections import Counter

import nltk
import numpy as np
import openai
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model
|
|
|
class GenreClassifier(nn.Module):
    """Multi-label genre classifier: a GPT-2 encoder plus a linear head.

    The encoder's per-token hidden states are mean-pooled over the real
    (non-padding) tokens, passed through dropout and a linear layer, and
    squashed with a sigmoid so every genre gets an independent score in
    the range [0, 1].
    """

    def __init__(self, num_genres=20):
        """Build the pretrained encoder and a fresh classification head.

        Args:
            num_genres: Number of genre scores produced per input.
        """
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        # Size the head from the loaded config rather than hard-coding 768,
        # so the layer stays correct if another GPT-2 checkpoint is used.
        self.genre_classifier = nn.Linear(
            self.gpt2.config.hidden_size, num_genres
        )
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average hidden states over dim 1, ignoring masked-out positions.

        Args:
            hidden_states: Float tensor of shape (batch, seq_len, hidden).
            attention_mask: Tensor of shape (batch, seq_len) holding 1 for
                real tokens and 0 for padding, or None to average over
                every position.

        Returns:
            Tensor of shape (batch, hidden). Rows whose mask is entirely
            zero come back as zeros instead of NaN.
        """
        if attention_mask is None:
            return hidden_states.mean(dim=1)
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * weights).sum(dim=1)
        # Clamp so a fully padded row divides by 1 rather than by 0.
        token_counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / token_counts

    def forward(self, input_ids, attention_mask):
        """Return per-genre sigmoid scores for a batch of token ids.

        Args:
            input_ids: Token id tensor passed straight to GPT-2.
            attention_mask: Mask passed to GPT-2 and reused for pooling;
                None falls back to an unmasked mean.

        Returns:
            Tensor with one row per input and ``num_genres`` columns.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        # A plain mean would also average the padding positions, which swamp
        # short inputs that were padded out to a long fixed length.
        pooled_output = self._masked_mean(outputs[0], attention_mask)
        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)
|
|
|
class BookGenreAnalyzer:
    """Score a book's text against a fixed list of genre labels.

    A local GPT-2 based ``GenreClassifier`` produces one score per genre.
    An OpenAI completion is then asked for genres as well, and any label it
    also names gets its score boosted.

    NOTE(review): the OpenAI calls use the module-level ``openai.Completion``,
    ``openai.FineTune`` and ``openai.File`` objects, which looks like the
    legacy (pre-1.0) SDK interface -- confirm against the installed version.
    NOTE(review): no trained weights are loaded for the classifier head in
    this class, so its scores are presumably only meaningful after training.
    """

    # Multiplier applied to the score of a genre the completion also named.
    GPT3_BOOST = 1.2
    # Number of leading characters of the book text placed in a prompt.
    PROMPT_CHARS = 1000

    def __init__(self, api_key, completion_model="davinci",
                 fine_tune_model="davinci"):
        """Set up the OpenAI key, tokenizer, classifier and stop words.

        Args:
            api_key: OpenAI API key, stored on the ``openai`` module.
            completion_model: Model name used by ``analyze_book``; pass a
                fine-tuned model id here once one exists.
            fine_tune_model: Base model name used by ``fine_tune_with_gpt3``.

        NOTE(review): the previous hard-coded name "gpt-3" is not an OpenAI
        model identifier; "davinci" is the GPT-3 base model name -- confirm
        it is the one intended.
        """
        # The key lives on the openai module, not on this instance
        # (``self.openai`` does not exist and raised AttributeError).
        openai.api_key = api_key
        self.completion_model = completion_model
        self.fine_tune_model = fine_tune_model

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 ships without a padding token; reuse EOS so that
        # padding='max_length' in preprocess_text does not raise.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = GenreClassifier()
        # Inference only: switch dropout off so scores are deterministic.
        self.model.eval()

        self.genre_labels = self._load_genre_labels()
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Return the predefined genre labels, in classifier output order."""
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's",
        ]

    def preprocess_text(self, text):
        """Lower-case, drop stop words and tokenize ``text`` for GPT-2.

        Args:
            text: Raw book text.

        Returns:
            The tokenizer output (PyTorch tensors), truncated or padded to
            1024 tokens.
        """
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]
        encodings = self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt',
        )
        return encodings

    def extract_features(self, text):
        """Run the classifier on ``text`` without tracking gradients.

        Args:
            text: Raw book text.

        Returns:
            The classifier output for the single input: a tensor of sigmoid
            genre scores.
        """
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            features = self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask'],
            )
        return features

    @classmethod
    def _build_prompt(cls, book_text):
        """Return the prompt shared by fine-tuning and inference."""
        return f"Book text: {book_text[:cls.PROMPT_CHARS]}...\nGenres:"

    @classmethod
    def _format_training_data(cls, training_data):
        """Convert (book_text, genres) pairs into prompt/completion dicts."""
        return [
            {
                "prompt": cls._build_prompt(book_text),
                "completion": f" {', '.join(genres)}",
            }
            for book_text, genres in training_data
        ]

    @staticmethod
    def _parse_genres(completion_text):
        """Split a comma-separated completion into clean genre names."""
        pieces = (piece.strip() for piece in completion_text.split(','))
        return [piece for piece in pieces if piece]

    def _boost_scores(self, genres_with_scores, gpt3_genres):
        """Return new (genre, score) pairs with matching genres boosted.

        Matching ignores case and surrounding whitespace. Boosted scores are
        capped at 1.0 so they still read as a confidence.
        """
        suggested = {name.strip().casefold() for name in gpt3_genres}
        return [
            (
                genre,
                min(score * self.GPT3_BOOST, 1.0)
                if genre.casefold() in suggested
                else score,
            )
            for genre, score in genres_with_scores
        ]

    def fine_tune_with_gpt3(self, training_data):
        """Create an OpenAI fine-tuning job from labelled books.

        Args:
            training_data: Iterable of ``(book_text, genres)`` pairs, where
                ``genres`` is an iterable of genre names.

        Returns:
            The response from ``openai.FineTune.create``, or None when there
            is no training data or the request fails (the error is printed).
        """
        formatted_data = self._format_training_data(training_data)
        if not formatted_data:
            # Nothing to learn from; do not upload an empty file.
            print("Fine-tuning error: no training data supplied")
            return None

        try:
            response = openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model=self.fine_tune_model,
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1,
            )
            return response
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Write the examples to ``training_data.jsonl`` and upload the file.

        Args:
            formatted_data: List of JSON-serialisable prompt/completion dicts.

        Returns:
            The ``id`` of the file created by ``openai.File.create``.
        """
        with open('training_data.jsonl', 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(entry) + '\n' for entry in formatted_data)

        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(
                file=f,
                purpose='fine-tune',
            )
        return response.id

    def analyze_book(self, book_text, top_k=20):
        """Analyze a book and return its top genres with confidence scores.

        Args:
            book_text: Raw book text.
            top_k: Maximum number of (genre, score) pairs to return.

        Returns:
            List of ``(genre, score)`` tuples sorted by descending score.
        """
        features = self.extract_features(book_text)
        predictions = features.numpy()[0]

        # The OpenAI suggestion is best-effort: on any failure, report it and
        # fall back to the classifier scores alone.
        try:
            response = openai.Completion.create(
                model=self.completion_model,
                prompt=self._build_prompt(book_text),
                max_tokens=100,
                temperature=0.3,
            )
            gpt3_genres = self._parse_genres(response.choices[0].text)
        except Exception as e:
            print(f"GPT-3 genre lookup failed; using classifier scores only: {e}")
            gpt3_genres = []

        genres_with_scores = [
            (genre, float(score))
            for genre, score in zip(self.genre_labels, predictions)
        ]
        # Rebuild the list: multiplying the loop variable in place never
        # changed the stored tuples, so the boost used to be a no-op.
        genres_with_scores = self._boost_scores(genres_with_scores, gpt3_genres)

        return sorted(genres_with_scores, key=lambda x: x[1], reverse=True)[:top_k]
|
|
|
|
|
def main():
    """Demo driver: rank genres for a sample text, then start a fine-tune.

    Builds a ``BookGenreAnalyzer`` with a placeholder API key, prints the
    ranked genres for a placeholder book text, and finally submits two toy
    training examples for fine-tuning, reporting success if a response
    comes back.
    """
    genre_analyzer = BookGenreAnalyzer('your-api-key')

    # Placeholder input; replace with the real book text.
    sample_text = """
    [Your book text here]
    """

    ranking = genre_analyzer.analyze_book(sample_text)

    print("\nTop 20 Genres:")
    for label, score in ranking:
        print(f"{label}: {score:.2%}")

    # Toy (text, genres) examples for the fine-tuning request.
    first_example = ("Book 1 text...", ["Mystery", "Thriller"])
    second_example = ("Book 2 text...", ["Science Fiction", "Adventure"])
    examples = [first_example, second_example]

    job = genre_analyzer.fine_tune_with_gpt3(examples)
    if not job:
        return
    print("\nFine-tuning job created successfully!")
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()