Aniruddha21 committed on
Commit
56ec760
1 Parent(s): e7947f0

Upload app.py

Files changed (1): app.py (+510 / -0)
app.py ADDED
@@ -0,0 +1,510 @@
# -*- coding: utf-8 -*-
"""Fine-Tuned Llama 2 for Comment Analysis

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1NX5z-wVpsEp8UigB0q7vZSZMFRa6nnEE

##**Extract YouTube Comments**
"""

# !pip uninstall gradio
# !pip3 install gradio -q
# !pip install --upgrade fastapi -q
# !pip install typing-extensions --upgrade

# import locale
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

# !pip3 install typing-extensions==4.2.0
# !pip3 install gradio -q
# !pip3 install --upgrade tensorflow

import pandas as pd
import gradio as gr
from googleapiclient.discovery import build
import csv
# import gradio as gr
from PIL import Image
import io

api_key = 'AIzaSyANfQYiumNUfJ8_YaDg-Hfr0BRXFhXnbvQ'
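
# Editor's note: a minimal, hedged sketch — on a hosted Space it is safer to take
# the API key from an environment variable (e.g. a repository secret named
# YOUTUBE_API_KEY, an assumed name) and fall back to the value above if unset.
import os
api_key = os.environ.get("YOUTUBE_API_KEY", api_key)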

def video_comments(video_id):
    # Create a CSV file to store comments
    with open('comments.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Comment']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Counter to limit the number of comments
        comment_count = 0

        # Create the YouTube API resource object
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Retrieve the first page of comment threads for the video
        video_response = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100  # Adjust the number of comments per page as needed
        ).execute()

        # Iterate over the pages of the response
        while video_response:

            # Extract the required info from each result object
            for item in video_response['items']:

                # Extract the top-level comment text
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']

                # Write the comment to the CSV file
                writer.writerow({'Comment': comment})

                comment_count += 1

                # Stop once the maximum comment count is reached
                if comment_count >= 50:
                    return

            # Fetch the next page of comments, if any
            if 'nextPageToken' in video_response:
                video_response = youtube.commentThreads().list(
                    part='snippet,replies',
                    videoId=video_id,
                    pageToken=video_response['nextPageToken'],
                    maxResults=100  # Adjust the number of comments per page as needed
                ).execute()
            else:
                break
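
# Editor's sketch (hypothetical helper, not used elsewhere in this app):
# video_comments() expects a bare video ID such as "6ydFDwv-n8w". If you want to
# accept full YouTube URLs as well, something like this could pull the ID out of
# the common URL shapes before calling video_comments().
import re

def extract_video_id(url_or_id):
    # Matches watch?v=<id>, youtu.be/<id>, shorts/<id>, or falls back to the bare ID.
    match = re.search(r'(?:v=|youtu\.be/|shorts/)([A-Za-z0-9_-]{11})', url_or_id)
    return match.group(1) if match else url_or_id.strip()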

def execution_function(video_id):
    # Initialize a counter for deleted rows
    deleted_row_count = 0

    video_comments(video_id)

    # Read back the comment file created above (written to the working directory)
    file_path = "comments.csv"
    df = pd.read_csv(file_path)

    # Rename the column to 'comments'
    df.rename(columns={'Comment': 'comments'}, inplace=True)

    # Keep only the first 10 comments for quick analysis
    df = df.head(10)

    return df
    # return_distribution()

# comments_df = execution_function("6ydFDwv-n8w")
# comments_df = comments_df.head(20)

# comments_df.head()

"""##**Fine-tune Llama 2**

IMP: This notebook runs on a T4 GPU.
"""

# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
# dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
# new_model = "llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1
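
# Editor's note: with these values the effective LoRA scaling factor is
# lora_alpha / lora_r = 16 / 64 = 0.25, i.e. the low-rank update is down-weighted
# relative to the frozen base weights.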

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to the learning rate)
warmup_ratio = 0.03

# Group sequences into batches of the same length
# (saves memory and speeds up training considerably)
group_by_length = True

# Save a checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
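
# Editor's note: with load_in_4bit=True and NF4, the 7B base weights are stored at
# roughly 4 bits each (on the order of 3.5 GB) while matmuls run in float16, which
# is what lets the model fit alongside activations on a single T4.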

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)
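
# Editor's note: the script imports SFTTrainer and defines peft_config /
# training_arguments but never actually runs a fine-tuning pass (the dataset load
# above is commented out). A minimal sketch of how those pieces would fit together,
# assuming a dataset with a "text" column had been loaded (argument names follow
# trl 0.4.7's SFTTrainer):
#
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,
#     peft_config=peft_config,
#     dataset_text_field="text",
#     max_seq_length=max_seq_length,
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing=packing,
# )
# trainer.train()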

def extract_between_inst_and_newline(text):
    start_tag = "[/INST]"
    end_char = "\n"

    start_index = text.find(start_tag)

    if start_index != -1:
        end_index = text.find(end_char, start_index)
        if end_index != -1:
            extracted_text = text[start_index + len(start_tag):end_index]
            return extracted_text.strip()

    return None

import re
from functools import lru_cache

@lru_cache
def extract_classification_and_remark(output):
    classification_match = re.search(r'Classification: (.*?)\n', output)
    remark_match = re.search(r'Remark: (.*?)$', output)

    classification = classification_match.group(1) if classification_match else None
    remark = remark_match.group(1) if remark_match else None

    return classification, remark
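
# Editor's note: a quick illustration of the parsing above (hypothetical model
# output; actual generations may not follow this format exactly):
#
#   extract_classification_and_remark("Classification: happy\nRemark: The viewer is excited.")
#   -> ("happy", "The viewer is excited.")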

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run the text generation pipeline with our model
prompt = '''Can you classify the human input as either happy, sad, angry, surprised, confused or neutral and tell me why it was classified as such in one short sentence.
Don't reply with anything besides the classification and the remark. Separate the classification and remark with :
Human input: {}'''

def process_comment(comment):
    formatted_prompt = prompt.format(comment)
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)
    result = pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    extract_output = result[0]['generated_text']
    classification, remark = extract_classification_and_remark(extract_output)
    return comment, classification, remark
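
# Editor's note: process_comment() rebuilds the generation pipeline on every call,
# which is wasteful when it is mapped over many comments below. A minimal sketch of
# the same logic with a single shared pipeline (assumes the global `model`,
# `tokenizer`, and `prompt` above are the ones to reuse):
#
# shared_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)
#
# def process_comment_shared(comment):
#     result = shared_pipe(f"<s>[INST] {prompt.format(comment)} [/INST]")
#     return (comment, *extract_classification_and_remark(result[0]['generated_text']))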

import matplotlib.pyplot as plt
import seaborn as sns

def return_distribution(new_formatted_df):
    # Count how many comments fell into each sentiment class
    sentiment_counts = new_formatted_df['classification'].value_counts()
    fig = plt.figure()
    sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.title('Sentiment Distribution')
    return fig

from wordcloud import WordCloud

def return_highest_sentiment_wordcloud(new_formatted_df, sentiment):
    # Create a word cloud for a specific sentiment, e.g. 'happy'
    sentiment_comments = new_formatted_df[new_formatted_df['classification'] == sentiment]['comments']
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(sentiment_comments))
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud for the Strongest Sentiment')
    return fig

import pandas as pd

def concatenate_remarks_based_on_classification(dataset):

    # Create an empty dictionary to store concatenated remarks for each classification type.
    concatenated_remarks = {}

    # Iterate through the dataset to concatenate remarks.
    for index, row in dataset.iterrows():
        classification = row['classification']
        remarks = row['remark']

        # Check if the classification exists in the dictionary.
        if classification in concatenated_remarks:
            if remarks is not None:
                concatenated_remarks[classification] += ' ' + str(remarks)
        else:
            if remarks is not None:
                concatenated_remarks[classification] = str(remarks)

    # Create a new DataFrame with the concatenated remarks.
    concatenated_remarks_df = pd.DataFrame(list(concatenated_remarks.items()), columns=['classification', 'concatenated_remarks'])

    return concatenated_remarks_df
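
# Editor's note: an equivalent, more idiomatic pandas sketch of the function above
# (assuming the 'remark' column holds strings or NaN):
#
# def concatenate_remarks_based_on_classification(dataset):
#     grouped = (dataset.dropna(subset=['remark'])
#                       .groupby('classification')['remark']
#                       .agg(' '.join)
#                       .reset_index())
#     return grouped.rename(columns={'remark': 'concatenated_remarks'})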

# !pip install dask -q

# Run the text generation pipeline with our model to summarize the remarks
prompt1 = '''Can you summarize the following text in a paragraph of no more than 100 words. Don't respond with anything besides the summary.
Human input: {}'''

def summarize_text(comment):
    formatted_prompt = prompt1.format(comment)
    new_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=3000)
    new_result = new_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    return new_result

## Function for first tab

import numpy as np
from concurrent.futures import ThreadPoolExecutor
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
# from multiprocessing import Pool
# num_processes = 4


# Import necessary libraries and functions here
# return_df = pd.DataFrame()
# final_analysed_df = pd.DataFrame()  # Initialize as None at the global scope

# Define the Gradio interface function for the first tab
def sentiment_distribution_interface(video_id):
    # global final_analysed_df
    # global unique_classifications

    return_df = pd.DataFrame()
    # Call the execution function with the video_id
    return_df = execution_function(video_id)
    print(return_df.head())

    def process_row(row):  # ~3.9s
        comment, classification, remark = process_comment(row.comments)
        return comment, classification, remark

    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust the number of workers as needed
        results = list(executor.map(process_row, return_df.itertuples()))

    print(type(results))
    print(results)

    print("__________________________________________________________________")

    comments, classification, remark = zip(*results)

    # Create a DataFrame from the separated data
    df = pd.DataFrame({'comments': comments, 'classification': classification, 'remark': remark})

    print(df.head())

    print("__________________________________________________________________")

    plot = return_distribution(df)  # Capture the distribution plot

    word_cloud = return_highest_sentiment_wordcloud(df, df['classification'].value_counts().idxmax())

    df.to_csv('processed_comments.csv', index=False)  # index=False prevents writing the row numbers as a column

    # Concatenating remarks for different sentiments
    # concatenated_remarks_df = concatenate_remarks_based_on_classification(df)
    # print(concatenated_remarks_df)

    # final_analysed_df = df

    return plot, word_cloud  # Return the plot and the word cloud

# Function for the second tab

def function_for_second_tab(input_val):

    final_analysed_df = pd.read_csv('processed_comments.csv')
    print(final_analysed_df.head())

    word_cloud = return_highest_sentiment_wordcloud(final_analysed_df, input_val)

    concatenated_remarks_df = concatenate_remarks_based_on_classification(final_analysed_df)

    # Pull the concatenated remarks for the requested sentiment
    comments = concatenated_remarks_df.loc[concatenated_remarks_df['classification'] == input_val, 'concatenated_remarks'].values[0]

    summarized_text = summarize_text(comments)

    extract_output_summary = summarized_text[0]['generated_text']

    final_extract = extract_output_summary.split('[/INST]')[1].strip()

    return word_cloud, final_extract

# Define the first tab
outputs = [gr.Plot(), gr.Plot()]
iface = gr.Interface(fn=sentiment_distribution_interface, inputs="text", outputs=outputs)


# Define the second tab
output_second_tab = [gr.Plot(), "text"]
inputs = "text"

description = "Enter the sentiment for which you want a detailed report"
app2 = gr.Interface(fn=function_for_second_tab, inputs=inputs, outputs=output_second_tab, description=description)

# Launch the app
demo = gr.TabbedInterface([iface, app2], ["Welcome page", "Visualization page"])

if __name__ == "__main__":
    demo.queue().launch()