import json

import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model
import openai
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class GenreClassifier(nn.Module):
    def __init__(self, num_genres=20):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        self.genre_classifier = nn.Linear(768, num_genres)  # 768 is GPT-2's hidden size
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[0].mean(dim=1)  # Average pooling over the sequence
        pooled_output = self.dropout(pooled_output)
        genre_logits = self.genre_classifier(pooled_output)
        return self.sigmoid(genre_logits)


class BookGenreAnalyzer:
    def __init__(self, api_key):
        """Initialize the analyzer with an OpenAI API key."""
        openai.api_key = api_key
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 has no pad token by default; reuse the EOS token so padding works
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = GenreClassifier()
        self.model.eval()  # disable dropout for inference
        self.genre_labels = self._load_genre_labels()
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english'))

    def _load_genre_labels(self):
        """Load predefined genre labels."""
        # You would typically load these from a file or database
        return [
            "Fiction", "Non-fiction", "Mystery", "Romance", "Science Fiction",
            "Fantasy", "Thriller", "Horror", "Historical Fiction", "Biography",
            "Self-help", "Business", "Science", "Philosophy", "Poetry",
            "Drama", "Adventure", "Literary Fiction", "Young Adult", "Children's"
        ]

    def preprocess_text(self, text):
        """Preprocess the book text."""
        # Tokenize and remove stop words
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t not in self.stop_words]

        # Convert to GPT-2 token IDs
        encodings = self.tokenizer(
            ' '.join(tokens),
            truncation=True,
            max_length=1024,
            padding='max_length',
            return_tensors='pt'
        )
        return encodings

    def extract_features(self, text):
        """Extract genre probabilities from the text."""
        encodings = self.preprocess_text(text)
        with torch.no_grad():
            features = self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask']
            )
        return features

    def fine_tune_with_gpt3(self, training_data):
        """Fine-tune a GPT-3 base model via the OpenAI fine-tuning API."""
        # Prepare training data in the prompt/completion format expected by OpenAI
        formatted_data = []
        for book_text, genres in training_data:
            formatted_data.append({
                "prompt": f"Book text: {book_text[:1000]}...\nGenres:",
                "completion": f" {', '.join(genres)}"
            })

        # Create the fine-tuning job
        try:
            response = openai.FineTune.create(
                training_file=self._upload_training_data(formatted_data),
                model="davinci",  # "gpt-3" is not a valid model name; use a fine-tunable base model
                n_epochs=3,
                batch_size=4,
                learning_rate_multiplier=0.1
            )
            return response
        except Exception as e:
            print(f"Fine-tuning error: {e}")
            return None

    def _upload_training_data(self, formatted_data):
        """Upload training data to OpenAI and return the file ID."""
        with open('training_data.jsonl', 'w') as f:
            for entry in formatted_data:
                json.dump(entry, f)
                f.write('\n')

        with open('training_data.jsonl', 'rb') as f:
            response = openai.File.create(
                file=f,
                purpose='fine-tune'
            )
        return response.id

    def analyze_book(self, book_text):
        """Analyze a book and return the top 20 genres with confidence scores."""
        # Get base predictions from the local classifier
        features = self.extract_features(book_text)
        predictions = features.numpy()[0]

        # Use GPT-3 to enhance the predictions
        try:
            response = openai.Completion.create(
                model="davinci",  # use your fine-tuned model ID if available
                prompt=f"Book text: {book_text[:1000]}...\nGenres:",
                max_tokens=100,
                temperature=0.3
            )
            gpt3_genres = response.choices[0].text.strip().split(', ')
        except Exception:
            gpt3_genres = []

        # Combine both predictions, boosting genres that GPT-3 also mentioned
        genres_with_scores = [
            (genre, float(score) * (1.2 if genre in gpt3_genres else 1.0))
            for genre, score in zip(self.genre_labels, predictions)
        ]

        # Sort and return the top 20
        return sorted(genres_with_scores, key=lambda x: x[1], reverse=True)[:20]


# Example usage
def main():
    # Initialize the analyzer
    analyzer = BookGenreAnalyzer('your-api-key')

    # Example book text
    book_text = """
    [Your book text here]
    """

    # Get genre predictions
    genres = analyzer.analyze_book(book_text)

    # Print results
    print("\nTop 20 Genres:")
    for genre, confidence in genres:
        print(f"{genre}: {confidence:.2%}")

    # Example of fine-tuning
    training_data = [
        ("Book 1 text...", ["Mystery", "Thriller"]),
        ("Book 2 text...", ["Science Fiction", "Adventure"]),
        # Add more training examples
    ]
    fine_tune_response = analyzer.fine_tune_with_gpt3(training_data)
    if fine_tune_response:
        print("\nFine-tuning job created successfully!")


if __name__ == "__main__":
    main()