Aniruddha21 committed
Commit 56ec760
Parent(s): e7947f0
Upload app.py
app.py ADDED (510 lines)
# -*- coding: utf-8 -*-
"""Fine-Tuned Llama 2 for Comment Analysis

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1NX5z-wVpsEp8UigB0q7vZSZMFRa6nnEE

## **Extract YouTube Comments**
"""

# !pip uninstall gradio
# !pip3 install gradio -q
# !pip install --upgrade fastapi -q
# !pip install typing-extensions --upgrade

# import locale
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

# !pip3 install typing-extensions==4.2.0
# !pip3 install gradio -q
# !pip3 install --upgrade tensorflow

import pandas as pd
import gradio as gr
from googleapiclient.discovery import build
import csv
from PIL import Image
import io

# NOTE: an API key like this belongs in an environment variable or a Space
# secret rather than in source control
api_key = 'AIzaSyANfQYiumNUfJ8_YaDg-Hfr0BRXFhXnbvQ'

def video_comments(video_id):
    # Create a CSV file to store comments
    with open('comments.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Comment']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Counter to limit the number of comments
        comment_count = 0

        # Create the YouTube API resource object
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Retrieve the first page of comment threads for the video
        video_response = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100  # Adjust the number of comments per page as needed
        ).execute()

        # Iterate over the paginated responses
        while video_response:

            # Extract the required info from each result object
            for item in video_response['items']:

                # Extract the top-level comment text
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']

                # Write the comment to the CSV file
                writer.writerow({'Comment': comment})

                comment_count += 1

                # Stop once the maximum comment count is reached
                if comment_count >= 50:
                    return

            # Fetch the next page of comments, if any
            if 'nextPageToken' in video_response:
                video_response = youtube.commentThreads().list(
                    part='snippet,replies',
                    videoId=video_id,
                    pageToken=video_response['nextPageToken'],
                    maxResults=100  # Adjust the number of comments per page as needed
                ).execute()
            else:
                break
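
# A minimal sanity check, kept commented out like the other scratch calls in
# this file ("6ydFDwv-n8w" is the sample video ID used further below):
# video_comments("6ydFDwv-n8w")  # writes up to 50 top-level comments to comments.csv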

def execution_function(video_id):
    # Fetch the comments for the given video ID
    video_comments(video_id)

    # Read back the comment file created above (relative path; the Colab-only
    # "/content/" prefix breaks outside Colab)
    file_path = "comments.csv"
    df = pd.read_csv(file_path)

    # Rename the column to 'comments'
    df.rename(columns={'Comment': 'comments'}, inplace=True)

    # Keep only the first 10 comments for quick analysis
    df = df.head(10)

    return df
# return_distribution()

# comments_df = execution_function("6ydFDwv-n8w")
# comments_df = comments_df.head(20)

# comments_df.head()

"""## **Fine-tune Llama 2**

IMPORTANT: This notebook runs on a T4 GPU.
"""

# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# The model that you want to train, from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
# dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
# new_model = "llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to the learning rate)
warmup_ratio = 0.03

# Group sequences into batches of the same length
# (saves memory and speeds up training considerably)
group_by_length = True

# Save a checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples into the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

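# A quick arithmetic check on the settings above: the effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 1 = 4
# sequences per optimizer step, all on the single GPU selected by device_map.
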
# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

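# Optional sanity check (commented out): get_memory_footprint() reports the
# model's size in bytes; with 4-bit NF4 weights a 7B model should come to
# roughly 4 GB, versus ~14 GB in fp16.
# print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
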
# Load the LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set the training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)
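
# The fine-tuning step itself is not executed in this app. As a sketch, the
# pieces above would wire into trl's SFTTrainer roughly like this, assuming
# `dataset` had been loaded via the commented-out load_dataset call above
# (dataset_text_field="text" assumes a guanaco-style dataset layout):
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,
#     peft_config=peft_config,
#     dataset_text_field="text",
#     max_seq_length=max_seq_length,
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing=packing,
# )
# trainer.train()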

def extract_between_inst_and_newline(text):
    # Return the text between the closing [/INST] tag and the next newline
    start_tag = "[/INST]"
    end_char = "\n"

    start_index = text.find(start_tag)

    if start_index != -1:
        end_index = text.find(end_char, start_index)
        if end_index != -1:
            extracted_text = text[start_index + len(start_tag):end_index]
            return extracted_text.strip()

    return None

import re
from functools import lru_cache

@lru_cache
def extract_classification_and_remark(output):
    # Parse the 'Classification: ...' and 'Remark: ...' fields out of the
    # model's reply
    classification_match = re.search(r'Classification: (.*?)\n', output)
    remark_match = re.search(r'Remark: (.*?)$', output)

    classification = classification_match.group(1) if classification_match else None
    remark = remark_match.group(1) if remark_match else None

    return classification, remark
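
# Illustrative only: for a reply of the form
# "Classification: Happy\nRemark: Upbeat tone.", this parser returns
# ("Happy", "Upbeat tone.").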

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Prompt for the classification pass of the fine-tuned model
prompt = '''Can you classify the human input as either happy, sad, angry, surprised, confused or neutral and tell me why it was classified as such in one short sentence.
Don't reply with anything besides the classification and the remark. Separate the classification and remark with :
Human input: {}'''

def process_comment(comment):
    formatted_prompt = prompt.format(comment)
    # Llama 2 is a causal LM, so "text-generation" is the appropriate pipeline
    # task; rebuilding the pipeline on every call is expensive but mirrors the
    # per-request flow of this app
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)
    result = pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    extract_output = result[0]['generated_text']
    classification, remark = extract_classification_and_remark(extract_output)
    return comment, classification, remark
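
# Example call (commented out; the comment text and the model's labels here
# are made up purely for illustration):
# process_comment("Loved this video!")
# # might return ("Loved this video!", "Happy", "The commenter expresses enthusiasm.")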

import matplotlib.pyplot as plt
import seaborn as sns

def return_distribution(new_formatted_df):
    # Bar chart of the count of each predicted sentiment class
    sentiment_counts = new_formatted_df['classification'].value_counts()
    fig = plt.figure()
    sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.title('Sentiment Distribution')
    return fig

from wordcloud import WordCloud

def return_highest_sentiment_worldcloud(new_formatted_df, sentiment):
    # Create a word cloud from the comments labelled with the given sentiment
    sentiment_comments = new_formatted_df[new_formatted_df['classification'] == sentiment]['comments']
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(sentiment_comments))
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud for the Strongest Sentiment')
    return fig

def concatenate_remarks_based_on_classification(dataset):

    # Dictionary mapping each classification type to its concatenated remarks
    concatenated_remarks = {}

    # Iterate through the dataset and concatenate the remarks
    for index, row in dataset.iterrows():
        classification = row['classification']
        remarks = row['remark']

        # Append to an existing entry, or create a new one
        if classification in concatenated_remarks:
            if remarks is not None:
                concatenated_remarks[classification] += ' ' + str(remarks)
        else:
            if remarks is not None:
                concatenated_remarks[classification] = str(remarks)

    # Build a new DataFrame with the concatenated remarks
    concatenated_remarks_df = pd.DataFrame(list(concatenated_remarks.items()), columns=['classification', 'concatenated_remarks'])

    return concatenated_remarks_df

# !pip install dask -q

# Prompt for the summarization pass of the fine-tuned model
prompt1 = '''Can you summarize the following text in a paragraph of no more than 100 words. Don't respond with anything besides the summary.
Human input: {}'''

def summarize_text(comment):
    formatted_prompt = prompt1.format(comment)
    # Llama 2 is a causal LM, so use the "text-generation" pipeline task
    new_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=3000)
    new_result = new_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    return new_result

## Function for the first tab

import numpy as np
from concurrent.futures import ThreadPoolExecutor
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
# from multiprocessing import Pool
# num_processes = 4


# Import necessary libraries and functions here
# return_df = pd.DataFrame()
# final_analysed_df = pd.DataFrame()  # Initialize at the global scope

# Define the Gradio callback for the first tab
def sentiment_distribution_interface(video_id):
    # global final_analysed_df
    # global unique_classifications

    # Fetch and prepare the comments for the given video ID
    return_df = execution_function(video_id)
    print(return_df.head())

    def process_row(row):  # ~3.9 s per comment
        comment, classification, remark = process_comment(row.comments)
        return comment, classification, remark

    # Classify the comments in parallel
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust the number of workers as needed
        results = list(executor.map(process_row, return_df.itertuples()))

    print(type(results))
    print(results)

    print("__________________________________________________________________")

    comments, classification, remark = zip(*results)

    # Create a DataFrame from the separated data
    df = pd.DataFrame({'comments': comments, 'classification': classification, 'remark': remark})

    print(df.head())

    print("__________________________________________________________________")

    plot = return_distribution(df)

    # Word cloud for the most frequent sentiment
    word_cloud = return_highest_sentiment_worldcloud(df, df['classification'].value_counts().idxmax())

    # Persist the processed comments for the second tab
    df.to_csv('processed_comments.csv', index=False)  # index=False prevents writing row numbers as a column

    # Concatenating remarks for the different sentiments
    # concatenated_remarks_df = concatenate_remarks_based_on_classification(df)
    # print(concatenated_remarks_df)

    # final_analysed_df = df

    return plot, word_cloud

# Function for the second tab

def function_for_second_tab(input_val):

    # Reload the comments processed on the first tab
    final_analysed_df = pd.read_csv('processed_comments.csv')
    print(final_analysed_df.head())

    word_cloud = return_highest_sentiment_worldcloud(final_analysed_df, input_val)

    concatenated_remarks_df = concatenate_remarks_based_on_classification(final_analysed_df)

    # Pull the concatenated remarks for the requested sentiment
    comments = concatenated_remarks_df.loc[concatenated_remarks_df['classification'] == input_val, 'concatenated_remarks'].values[0]

    summarized_text = summarize_text(comments)

    extract_output_summary = summarized_text[0]['generated_text']

    final_extract = extract_output_summary.split('[/INST]')[1].strip()

    return word_cloud, final_extract

# Define the first tab
outputs = [gr.Plot(), gr.Plot()]
iface = gr.Interface(fn=sentiment_distribution_interface, inputs="text", outputs=outputs)


# Define the second tab
output_second_tab = [gr.Plot(), "text"]
inputs = "text"

description = "Enter the sentiment for which you want a detailed report"
app2 = gr.Interface(fn=function_for_second_tab, inputs=inputs, outputs=output_second_tab, description=description)

# Combine the two tabs and launch the app
demo = gr.TabbedInterface([iface, app2], ["Welcome page", "Visualization page"])

if __name__ == "__main__":
    demo.queue().launch()
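
# Running `python app.py` starts the Gradio server locally; a Hugging Face
# Space using the Gradio SDK picks up the same app.py entry point automatically.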