Files changed (1) hide show
  1. app.py +316 -8
app.py CHANGED
@@ -9,10 +9,9 @@ from dotenv import load_dotenv
9
  load_dotenv()
10
 
11
  class TweetDatasetProcessor:
12
- def __init__(self, fine_tuned_model_name, pdf_path):
13
  self.tweets = []
14
  self.personality_profile = {}
15
- self.vectorizer = None # No need for vectorizer here since we're not clustering
16
  self.used_tweets = set() # Track used tweets to avoid repetition
17
  self.pdf_path = pdf_path
18
 
@@ -35,6 +34,8 @@ class TweetDatasetProcessor:
35
 
36
  def extract_text_from_pdf(self):
37
  """Extract text content from PDF file."""
 
 
38
  reader = PdfReader(self.pdf_path)
39
  text = ""
40
  for page in reader.pages:
@@ -111,24 +112,331 @@ class TweetDatasetProcessor:
111
  return generated_tweet
112
 
113
  # Gradio Interface Function
114
- def gradio_interface():
115
- # Path to the PDF with tweets
116
- pdf_path = 'Dataset (4).pdf' # Replace with your PDF file path
117
  fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets' # Replace with the path to your fine-tuned model
 
118
 
119
- processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  text = processor.extract_text_from_pdf()
122
  tweets = processor.process_pdf_content(text)
 
 
123
  personality_analysis = processor.analyze_personality(max_tweets=50)
124
- generated_tweet = processor.generate_tweet(context="AI-powered tweet generation", sample_size=3)
 
 
125
 
126
  return personality_analysis, generated_tweet
127
 
128
  # Gradio app setup
129
  iface = gr.Interface(
130
  fn=gradio_interface,
131
- inputs=[],
 
 
 
132
  outputs=[
133
  gr.Textbox(label="Personality Analysis"),
134
  gr.Textbox(label="Generated Tweet")
 
9
  load_dotenv()
10
 
11
  class TweetDatasetProcessor:
12
+ def __init__(self, fine_tuned_model_name, pdf_path=None):
13
  self.tweets = []
14
  self.personality_profile = {}
 
15
  self.used_tweets = set() # Track used tweets to avoid repetition
16
  self.pdf_path = pdf_path
17
 
 
34
 
35
  def extract_text_from_pdf(self):
36
  """Extract text content from PDF file."""
37
+ if not self.pdf_path:
38
+ return ""
39
  reader = PdfReader(self.pdf_path)
40
  text = ""
41
  for page in reader.pages:
 
112
  return generated_tweet
113
 
114
  # Gradio Interface Function
115
def gradio_interface(pdf_file, context="AI-powered tweet generation"):
    """Analyze an uploaded PDF of tweets and generate a new tweet from it.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. Handled shapes:
            an object exposing the upload path as ``.name``, a plain path
            string, or ``None`` when nothing has been uploaded yet.
            NOTE(review): which shape Gradio passes depends on the installed
            Gradio version/config -- confirm against the deployed version.
        context: Topic used to steer the generated tweet. Blank values fall
            back to the default topic.

    Returns:
        A ``(personality_analysis, generated_tweet)`` tuple of strings. When
        no file is available, or processing raises ``ValueError``, the first
        element carries a message for the user and the second is empty.
    """
    default_context = "AI-powered tweet generation"

    # The interface can invoke this callback before a file has been chosen;
    # bail out early instead of failing on ``None.name``.
    if pdf_file is None:
        return "Please upload a PDF containing tweets.", ""

    # Accept both file-like wrappers (path in ``.name``) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # An empty textbox arrives as "" and would otherwise override the default.
    context = (context or "").strip() or default_context

    fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
    processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path=pdf_path)

    try:
        # Extract text from PDF and turn it into tweet records
        text = processor.extract_text_from_pdf()
        processor.process_pdf_content(text)

        # Analyze personality based on tweets
        personality_analysis = processor.analyze_personality(max_tweets=50)

        # Generate tweet based on the personality analysis and context
        generated_tweet = processor.generate_tweet(context=context, sample_size=3)
    except ValueError as error:
        # Report unusable input in the UI, in the same "Error: ..." style
        # that generate_tweet uses, rather than raising out of the callback.
        return f"Error: {error}", ""

    return personality_analysis, generated_tweet
131
+
132
+ # Gradio app setup
133
# Gradio app setup: a PDF upload and an optional topic go in; the personality
# analysis and one generated tweet come out.
iface = gr.Interface(
    fn=gradio_interface,
    title="AI Personality and Tweet Generation",
    description="Automatically analyze personality and generate tweets based on a provided PDF of tweets.",
    inputs=[
        gr.File(label="Upload PDF with Tweets"),
        gr.Textbox(
            label="Context for Tweet Generation (optional)",
            placeholder="e.g., AI-powered tweet generation",
        ),
    ],
    outputs=[
        gr.Textbox(label="Personality Analysis"),
        gr.Textbox(label="Generated Tweet"),
    ],
    live=True,
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
151
+ import gradio as gr
152
+ from transformers import AutoModelForCausalLM, AutoTokenizer
153
+ import random
154
+ from datetime import datetime
155
+ from PyPDF2 import PdfReader
156
+ import json
157
+ from dotenv import load_dotenv
158
+
159
+ load_dotenv()
160
+
161
class TweetDatasetProcessor:
    """Turn a PDF of tweets into a personality summary and newly generated tweets.

    Intended call order: ``extract_text_from_pdf`` -> ``process_pdf_content``
    -> ``analyze_personality`` -> ``generate_tweet``. Text generation is
    delegated to the causal language model and tokenizer loaded in ``__init__``.
    """

    # Used when the model config does not report a context size. 1024 is the
    # standard GPT-2 window; the app's model is named as a GPT-2 fine-tune.
    _DEFAULT_CONTEXT_WINDOW = 1024
    # Budgets for *newly generated* tokens (the prompt is not counted).
    _ANALYSIS_NEW_TOKENS = 256
    _TWEET_NEW_TOKENS = 80
    # Maximum number of profile characters copied into the tweet prompt.
    _PROFILE_EXCERPT_CHARS = 400

    def __init__(self, fine_tuned_model_name, pdf_path=None):
        """Load the model/tokenizer and reset all per-dataset state.

        Args:
            fine_tuned_model_name: Name or path passed to ``from_pretrained``.
            pdf_path: Path of the PDF holding the tweets; may be ``None``.
        """
        self.tweets = []
        # Kept as ``{}`` for backward compatibility; analyze_personality
        # replaces it with the generated analysis string.
        self.personality_profile = {}
        self.used_tweets = set()  # Track used tweets to avoid repetition
        self.pdf_path = pdf_path

        # Load fine-tuned model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)

    @staticmethod
    def _process_line(line):
        """Convert one line of extracted text into a tweet record.

        Returns:
            ``None`` for blank lines and lines starting with ``http``;
            otherwise a dict with ``content``, ``timestamp`` (time of
            processing, not of the tweet), ``mentions`` and ``hashtags``.
        """
        line = line.strip()
        if not line or line.startswith('http'):  # Skip empty lines and URLs
            return None
        words = line.split()
        return {
            'content': line,
            'timestamp': datetime.now(),
            'mentions': [word for word in words if word.startswith('@')],
            'hashtags': [word for word in words if word.startswith('#')],
        }

    def extract_text_from_pdf(self):
        """Return the concatenated text of every page, or ``""`` if no path is set."""
        if not self.pdf_path:
            return ""
        reader = PdfReader(self.pdf_path)
        # ``or ""`` guards against a page yielding no text (e.g. image-only
        # pages), which would otherwise break the concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def process_pdf_content(self, text):
        """Split extracted text into lines and keep the ones that look like tweets.

        Args:
            text: Raw text as returned by ``extract_text_from_pdf``.

        Returns:
            The list of tweet records, also stored on ``self.tweets``.

        Raises:
            ValueError: If ``text`` is empty/blank or no line survives filtering.
        """
        if not text or not text.strip():
            raise ValueError("The provided PDF appears to be empty.")

        records = (TweetDatasetProcessor._process_line(line) for line in text.split('\n'))
        self.tweets = [record for record in records if record]

        if not self.tweets:
            raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")

        return self.tweets

    def _generate(self, prompt, max_new_tokens, temperature):
        """Run the model on ``prompt`` and return only the newly generated text.

        The prompt is truncated so that prompt plus ``max_new_tokens`` fits in
        the model's context window. Sampling is enabled so ``temperature``
        takes effect. The prompt tokens, which a causal LM echoes at the start
        of its output, are dropped before decoding.
        """
        context_window = (
            getattr(self.model.config, "max_position_embeddings", None)
            or self._DEFAULT_CONTEXT_WINDOW
        )
        max_prompt_tokens = max(1, context_window - max_new_tokens)

        encoded = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=max_prompt_tokens,
        )
        output = self.model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,
            temperature=temperature,
            # Reuse EOS for padding when the tokenizer defines no pad token.
            pad_token_id=self.tokenizer.eos_token_id,
        )
        prompt_length = encoded['input_ids'].shape[-1]
        return self.tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    def analyze_personality(self, max_tweets=50):
        """Generate a personality analysis from at most ``max_tweets`` tweets.

        Returns:
            The generated analysis text, also stored on ``self.personality_profile``.

        Raises:
            ValueError: If no tweets have been loaded.
        """
        if not self.tweets:
            raise ValueError("No tweets available for personality analysis.")

        all_tweets = [tweet['content'] for tweet in self.tweets[:max_tweets]]
        analysis_prompt = (
            "Perform a deep psychological analysis of the author based on these tweets:\n"
            "Core beliefs, emotional tendencies, cognitive patterns, etc.\n"
            "Tweets for analysis:\n"
            f"{json.dumps(all_tweets, indent=2)}\n"
        )

        self.personality_profile = self._generate(
            analysis_prompt, max_new_tokens=self._ANALYSIS_NEW_TOKENS, temperature=0.7
        )
        return self.personality_profile

    def _sample_tweets(self, sample_size):
        """Pick up to ``sample_size`` tweet texts at random, preferring unused ones.

        When fewer unused tweets remain than requested, the used-tweet tracker
        is reset and the whole dataset becomes eligible again. The sample is
        capped at the number of eligible tweets so small datasets do not make
        ``random.sample`` raise.
        """
        candidates = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
        if len(candidates) < sample_size:
            self.used_tweets.clear()  # Reset used tweets if all have been used
            candidates = self.tweets

        count = max(0, min(sample_size, len(candidates)))
        contents = [tweet['content'] for tweet in random.sample(candidates, count)]

        # Update the used tweets tracker
        self.used_tweets.update(contents)
        return contents

    def generate_tweet(self, context="", sample_size=3):
        """Generate a new tweet from the stored profile, a topic and sample tweets.

        Args:
            context: Optional topic inserted into the prompt.
            sample_size: How many existing tweets to include as examples.

        Returns:
            The generated tweet text, or an ``"Error: ..."`` message when no
            tweets have been loaded.
        """
        if not self.tweets:
            return "Error: No tweets available for generation."

        sampled_contents = self._sample_tweets(sample_size)

        # Truncate personality profile to avoid token overflow. ``str`` covers
        # the initial ``{}`` placeholder, which renders as an empty excerpt.
        personality_profile_excerpt = str(self.personality_profile or "")[:self._PROFILE_EXCERPT_CHARS]

        # Construct the prompt
        prompt = (
            "Based on this personality profile:\n"
            f"{personality_profile_excerpt}\n"
            "Current context or topic (if any):\n"
            f"{context}\n"
            "Tweets for context:\n"
            f"{', '.join(sampled_contents)}\n"
            "**Only generate the tweet. Do not include analysis, explanation, or any other content.**\n"
        )

        return self._generate(prompt, max_new_tokens=self._TWEET_NEW_TOKENS, temperature=1.0)
263
+
264
+ # Gradio Interface Function
265
def gradio_interface(pdf_file, context="AI-powered tweet generation"):
    """Analyze an uploaded PDF of tweets and generate a new tweet from it.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. Handled shapes:
            an object exposing the upload path as ``.name``, a plain path
            string, or ``None`` when nothing has been uploaded yet.
            NOTE(review): which shape Gradio passes depends on the installed
            Gradio version/config -- confirm against the deployed version.
        context: Topic used to steer the generated tweet. Blank values fall
            back to the default topic.

    Returns:
        A ``(personality_analysis, generated_tweet)`` tuple of strings. When
        no file is available, or processing raises ``ValueError``, the first
        element carries a message for the user and the second is empty.
    """
    default_context = "AI-powered tweet generation"

    # The interface can invoke this callback before a file has been chosen;
    # bail out early instead of failing on ``None.name``.
    if pdf_file is None:
        return "Please upload a PDF containing tweets.", ""

    # Accept both file-like wrappers (path in ``.name``) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # An empty textbox arrives as "" and would otherwise override the default.
    context = (context or "").strip() or default_context

    fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
    processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path=pdf_path)

    try:
        # Extract text from PDF and turn it into tweet records
        text = processor.extract_text_from_pdf()
        processor.process_pdf_content(text)

        # Analyze personality based on tweets
        personality_analysis = processor.analyze_personality(max_tweets=50)

        # Generate tweet based on the personality analysis and context
        generated_tweet = processor.generate_tweet(context=context, sample_size=3)
    except ValueError as error:
        # Report unusable input in the UI, in the same "Error: ..." style
        # that generate_tweet uses, rather than raising out of the callback.
        return f"Error: {error}", ""

    return personality_analysis, generated_tweet
281
+
282
+ # Gradio app setup
283
# Gradio app setup: a PDF upload and an optional topic go in; the personality
# analysis and one generated tweet come out.
iface = gr.Interface(
    fn=gradio_interface,
    title="AI Personality and Tweet Generation",
    description="Automatically analyze personality and generate tweets based on a provided PDF of tweets.",
    inputs=[
        gr.File(label="Upload PDF with Tweets"),
        gr.Textbox(
            label="Context for Tweet Generation (optional)",
            placeholder="e.g., AI-powered tweet generation",
        ),
    ],
    outputs=[
        gr.Textbox(label="Personality Analysis"),
        gr.Textbox(label="Generated Tweet"),
    ],
    live=True,
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
301
+ import gradio as gr
302
+ from transformers import AutoModelForCausalLM, AutoTokenizer
303
+ import random
304
+ from datetime import datetime
305
+ from PyPDF2 import PdfReader
306
+ import json
307
+ from dotenv import load_dotenv
308
+
309
+ load_dotenv()
310
+
311
class TweetDatasetProcessor:
    """Turn a PDF of tweets into a personality summary and newly generated tweets.

    Intended call order: ``extract_text_from_pdf`` -> ``process_pdf_content``
    -> ``analyze_personality`` -> ``generate_tweet``. Text generation is
    delegated to the causal language model and tokenizer loaded in ``__init__``.
    """

    # Used when the model config does not report a context size. 1024 is the
    # standard GPT-2 window; the app's model is named as a GPT-2 fine-tune.
    _DEFAULT_CONTEXT_WINDOW = 1024
    # Budgets for *newly generated* tokens (the prompt is not counted).
    _ANALYSIS_NEW_TOKENS = 256
    _TWEET_NEW_TOKENS = 80
    # Maximum number of profile characters copied into the tweet prompt.
    _PROFILE_EXCERPT_CHARS = 400

    def __init__(self, fine_tuned_model_name, pdf_path=None):
        """Load the model/tokenizer and reset all per-dataset state.

        Args:
            fine_tuned_model_name: Name or path passed to ``from_pretrained``.
            pdf_path: Path of the PDF holding the tweets; may be ``None``.
        """
        self.tweets = []
        # Kept as ``{}`` for backward compatibility; analyze_personality
        # replaces it with the generated analysis string.
        self.personality_profile = {}
        self.used_tweets = set()  # Track used tweets to avoid repetition
        self.pdf_path = pdf_path

        # Load fine-tuned model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)

    @staticmethod
    def _process_line(line):
        """Convert one line of extracted text into a tweet record.

        Returns:
            ``None`` for blank lines and lines starting with ``http``;
            otherwise a dict with ``content``, ``timestamp`` (time of
            processing, not of the tweet), ``mentions`` and ``hashtags``.
        """
        line = line.strip()
        if not line or line.startswith('http'):  # Skip empty lines and URLs
            return None
        words = line.split()
        return {
            'content': line,
            'timestamp': datetime.now(),
            'mentions': [word for word in words if word.startswith('@')],
            'hashtags': [word for word in words if word.startswith('#')],
        }

    def extract_text_from_pdf(self):
        """Return the concatenated text of every page, or ``""`` if no path is set."""
        if not self.pdf_path:
            return ""
        reader = PdfReader(self.pdf_path)
        # ``or ""`` guards against a page yielding no text (e.g. image-only
        # pages), which would otherwise break the concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def process_pdf_content(self, text):
        """Split extracted text into lines and keep the ones that look like tweets.

        Args:
            text: Raw text as returned by ``extract_text_from_pdf``.

        Returns:
            The list of tweet records, also stored on ``self.tweets``.

        Raises:
            ValueError: If ``text`` is empty/blank or no line survives filtering.
        """
        if not text or not text.strip():
            raise ValueError("The provided PDF appears to be empty.")

        records = (TweetDatasetProcessor._process_line(line) for line in text.split('\n'))
        self.tweets = [record for record in records if record]

        if not self.tweets:
            raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")

        return self.tweets

    def _generate(self, prompt, max_new_tokens, temperature):
        """Run the model on ``prompt`` and return only the newly generated text.

        The prompt is truncated so that prompt plus ``max_new_tokens`` fits in
        the model's context window. Sampling is enabled so ``temperature``
        takes effect. The prompt tokens, which a causal LM echoes at the start
        of its output, are dropped before decoding.
        """
        context_window = (
            getattr(self.model.config, "max_position_embeddings", None)
            or self._DEFAULT_CONTEXT_WINDOW
        )
        max_prompt_tokens = max(1, context_window - max_new_tokens)

        encoded = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=max_prompt_tokens,
        )
        output = self.model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,
            temperature=temperature,
            # Reuse EOS for padding when the tokenizer defines no pad token.
            pad_token_id=self.tokenizer.eos_token_id,
        )
        prompt_length = encoded['input_ids'].shape[-1]
        return self.tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    def analyze_personality(self, max_tweets=50):
        """Generate a personality analysis from at most ``max_tweets`` tweets.

        Returns:
            The generated analysis text, also stored on ``self.personality_profile``.

        Raises:
            ValueError: If no tweets have been loaded.
        """
        if not self.tweets:
            raise ValueError("No tweets available for personality analysis.")

        all_tweets = [tweet['content'] for tweet in self.tweets[:max_tweets]]
        analysis_prompt = (
            "Perform a deep psychological analysis of the author based on these tweets:\n"
            "Core beliefs, emotional tendencies, cognitive patterns, etc.\n"
            "Tweets for analysis:\n"
            f"{json.dumps(all_tweets, indent=2)}\n"
        )

        self.personality_profile = self._generate(
            analysis_prompt, max_new_tokens=self._ANALYSIS_NEW_TOKENS, temperature=0.7
        )
        return self.personality_profile

    def _sample_tweets(self, sample_size):
        """Pick up to ``sample_size`` tweet texts at random, preferring unused ones.

        When fewer unused tweets remain than requested, the used-tweet tracker
        is reset and the whole dataset becomes eligible again. The sample is
        capped at the number of eligible tweets so small datasets do not make
        ``random.sample`` raise.
        """
        candidates = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
        if len(candidates) < sample_size:
            self.used_tweets.clear()  # Reset used tweets if all have been used
            candidates = self.tweets

        count = max(0, min(sample_size, len(candidates)))
        contents = [tweet['content'] for tweet in random.sample(candidates, count)]

        # Update the used tweets tracker
        self.used_tweets.update(contents)
        return contents

    def generate_tweet(self, context="", sample_size=3):
        """Generate a new tweet from the stored profile, a topic and sample tweets.

        Args:
            context: Optional topic inserted into the prompt.
            sample_size: How many existing tweets to include as examples.

        Returns:
            The generated tweet text, or an ``"Error: ..."`` message when no
            tweets have been loaded.
        """
        if not self.tweets:
            return "Error: No tweets available for generation."

        sampled_contents = self._sample_tweets(sample_size)

        # Truncate personality profile to avoid token overflow. ``str`` covers
        # the initial ``{}`` placeholder, which renders as an empty excerpt.
        personality_profile_excerpt = str(self.personality_profile or "")[:self._PROFILE_EXCERPT_CHARS]

        # Construct the prompt
        prompt = (
            "Based on this personality profile:\n"
            f"{personality_profile_excerpt}\n"
            "Current context or topic (if any):\n"
            f"{context}\n"
            "Tweets for context:\n"
            f"{', '.join(sampled_contents)}\n"
            "**Only generate the tweet. Do not include analysis, explanation, or any other content.**\n"
        )

        return self._generate(prompt, max_new_tokens=self._TWEET_NEW_TOKENS, temperature=1.0)
413
+
414
+ # Gradio Interface Function
415
def gradio_interface(pdf_file, context="AI-powered tweet generation"):
    """Analyze a PDF of tweets and generate a new tweet from it.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. Handled shapes:
            an object exposing the upload path as ``.name``, a plain path
            string, or ``None``. With ``None`` the bundled dataset
            ``'Dataset (4).pdf'`` is used, as before.
            NOTE(review): which shape Gradio passes depends on the installed
            Gradio version/config -- confirm against the deployed version.
        context: Topic used to steer the generated tweet. Blank values fall
            back to the default topic.

    Returns:
        A ``(personality_analysis, generated_tweet)`` tuple of strings. When
        processing raises ``ValueError``, the first element carries the
        message for the user and the second is empty.
    """
    default_context = "AI-powered tweet generation"
    default_pdf_path = 'Dataset (4).pdf'

    # Honour the upload when there is one; previously the parameter was
    # accepted but ignored in favour of the hard-coded dataset.
    if pdf_file is None:
        pdf_path = default_pdf_path
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    # An empty textbox arrives as "" and would otherwise override the default.
    context = (context or "").strip() or default_context

    # Initialize the processor with the selected PDF path
    fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
    processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path=pdf_path)

    try:
        # Extract text from PDF and process it
        text = processor.extract_text_from_pdf()
        processor.process_pdf_content(text)

        # Analyze personality based on tweets
        personality_analysis = processor.analyze_personality(max_tweets=50)

        # Generate tweet based on the personality analysis and context
        generated_tweet = processor.generate_tweet(context=context, sample_size=3)
    except ValueError as error:
        # Report unusable input in the UI, in the same "Error: ..." style
        # that generate_tweet uses, rather than raising out of the callback.
        return f"Error: {error}", ""

    return personality_analysis, generated_tweet
432
 
433
  # Gradio app setup
434
  iface = gr.Interface(
435
  fn=gradio_interface,
436
+ inputs=[
437
+ gr.File(label="Upload PDF with Tweets"),
438
+ gr.Textbox(label="Context for Tweet Generation (optional)", placeholder="e.g., AI-powered tweet generation")
439
+ ],
440
  outputs=[
441
  gr.Textbox(label="Personality Analysis"),
442
  gr.Textbox(label="Generated Tweet")