# -*- coding: utf-8 -*-
"""Fine Tuned Llama 2 for Comment Analysis

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1NX5z-wVpsEp8UigB0q7vZSZMFRa6nnEE

##**Extract YouTube Comments**
"""

# !pip uninstall gradio
# !pip3 install gradio -q
# !pip install --upgrade fastapi -q
# !pip install typing-extensions --upgrade

# import locale
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

# !pip3 install typing-extensions==4.2.0
# !pip3 install gradio -q
# !pip3 install --upgrade tensorflow

import csv
import io

import pandas as pd
import gradio as gr
from googleapiclient.discovery import build
from PIL import Image

# NOTE: hard-coding an API key in source is insecure; consider reading it from an
# environment variable instead.
api_key = 'AIzaSyANfQYiumNUfJ8_YaDg-Hfr0BRXFhXnbvQ'

def video_comments(video_id):
    # Create a CSV file to store comments
    with open('comments.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Comment']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Counter to limit the number of comments
        comment_count = 0

        # Create the YouTube resource object
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Retrieve the first page of comment threads for the video
        video_response = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100  # Adjust the number of comments per page as needed
        ).execute()

        # Iterate over the paginated responses
        while video_response:
            # Extract the required info from each result object
            for item in video_response['items']:
                # Extract the top-level comment text
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']

                # Write the comment to the CSV file
                writer.writerow({'Comment': comment})
                comment_count += 1

                # Stop once the maximum comment count is reached
                if comment_count >= 50:
                    return

            # Fetch the next page of results, if any
            if 'nextPageToken' in video_response:
                video_response = youtube.commentThreads().list(
                    part='snippet,replies',
                    videoId=video_id,
                    pageToken=video_response['nextPageToken'],
                    maxResults=100  # Adjust the number of comments per page as needed
                ).execute()
            else:
                break
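
# A minimal usage sketch for the extractor above; the video ID below is a
# hypothetical placeholder, not one referenced elsewhere in this notebook.
# video_comments("dQw4w9WgXcQ")  # writes up to 50 top-level comments to comments.csv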

def execution_function(input):
    # Initialize a counter for deleted rows
    deleted_row_count = 0

    # Extract the comments for the given video ID
    video_comments(input)

    # Read the comment file created above
    file_path = "/content/comments.csv"
    df = pd.read_csv(file_path)

    # Rename the column to 'comments'
    df.rename(columns={'Comment': 'comments'}, inplace=True)

    # Keep only the first 10 comments for a quick analysis
    df = df.head(10)
    return df

# return_distribution()
# comments_df = execution_function("6ydFDwv-n8w")
# comments_df = comments_df.head(20)
# comments_df.head()
"""##**Fine - tune Llama 2** | |
IMP: This notebook runs on a T4 GPU. | |
""" | |
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 | |
import os | |
import torch | |
from datasets import load_dataset | |
from transformers import ( | |
AutoModelForCausalLM, | |
AutoTokenizer, | |
BitsAndBytesConfig, | |
HfArgumentParser, | |
TrainingArguments, | |
pipeline, | |
logging, | |
) | |
from peft import LoraConfig, PeftModel | |
from trl import SFTTrainer | |
# The model that you want to train from the Hugging Face hub | |
model_name = "NousResearch/Llama-2-7b-chat-hf" | |
# The instruction dataset to use | |
# dataset_name = "mlabonne/guanaco-llama2-1k" | |
# Fine-tuned model name | |
# new_model = "llama-2-7b-miniguanaco" | |

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# (saves memory and speeds up training considerably)
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)
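
# The notebook defines training hyperparameters but never instantiates a trainer.
# A minimal sketch of how these pieces would feed trl's SFTTrainer (assuming a
# `dataset` loaded via the commented-out load_dataset call above) is left here,
# commented out, for reference:
#
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,
#     peft_config=peft_config,
#     dataset_text_field="text",
#     max_seq_length=max_seq_length,
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing=packing,
# )
# trainer.train()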

def extract_between_inst_and_newline(text):
    """Return the text between the closing [/INST] tag and the next newline."""
    start_tag = "[/INST]"
    end_char = "\n"
    start_index = text.find(start_tag)
    if start_index != -1:
        end_index = text.find(end_char, start_index)
        if end_index != -1:
            extracted_text = text[start_index + len(start_tag):end_index]
            return extracted_text.strip()
    return None
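
# Illustrative behaviour only (hypothetical model output):
# extract_between_inst_and_newline("<s>[INST] ... [/INST] Happy: upbeat tone\nmore text")
# would return "Happy: upbeat tone".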

import re
from functools import lru_cache

def extract_classification_and_remark(output):
    """Pull the 'Classification:' and 'Remark:' fields out of the model output."""
    classification_match = re.search(r'Classification: (.*?)\n', output)
    remark_match = re.search(r'Remark: (.*?)$', output)
    classification = classification_match.group(1) if classification_match else None
    remark = remark_match.group(1) if remark_match else None
    return classification, remark
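
# This parser assumes the model answers in roughly this shape (illustrative only):
#   "... [/INST] Classification: Happy\nRemark: The commenter sounds excited."
# on which it returns ("Happy", "The commenter sounds excited.").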

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run the text-generation pipeline with the model loaded above
prompt = '''Can you classify the human input as either happy, sad, angry, surprised, confused or neutral and tell me why it was classified as such in one short sentence.
Don't reply anything besides the classification and the remark. Separate the classification and remark with :
Human input: {}'''

def process_comment(comment):
    formatted_prompt = prompt.format(comment)
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)
    result = pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    extract_output = result[0]['generated_text']
    classification, remark = extract_classification_and_remark(extract_output)
    return comment, classification, remark
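
# Note: process_comment builds a fresh pipeline on every call, which is slow. One
# possible alternative (same arguments, created once and reused) is sketched below:
# shared_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)
# def process_comment(comment):
#     result = shared_pipe(f"<s>[INST] {prompt.format(comment)} [/INST]")
#     return comment, *extract_classification_and_remark(result[0]['generated_text'])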

import matplotlib.pyplot as plt
import seaborn as sns

def return_distribution(new_formatted_df):
    # Plot the count of each sentiment class in the 'classification' column
    sentiment_counts = new_formatted_df['classification'].value_counts()
    fig = plt.figure()
    sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.title('Sentiment Distribution')
    return fig

from wordcloud import WordCloud

def return_highest_sentiment_wordcloud(new_formatted_df, sentiment):
    # Create a word cloud for a specific sentiment, e.g. 'happy'
    sentiment_comments = new_formatted_df[new_formatted_df['classification'] == sentiment]['comments']
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(sentiment_comments))
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud for the Strongest Sentiment')
    return fig

import pandas as pd

def concatenate_remarks_based_on_classification(dataset):
    # Dictionary mapping each classification type to its concatenated remarks
    concatenated_remarks = {}

    # Iterate through the dataset to concatenate remarks
    for index, row in dataset.iterrows():
        classification = row['classification']
        remarks = row['remark']

        # Check if the classification already exists in the dictionary
        if classification in concatenated_remarks:
            if remarks is not None:
                concatenated_remarks[classification] += ' ' + str(remarks)
        else:
            if remarks is not None:
                concatenated_remarks[classification] = str(remarks)

    # Create a new DataFrame with the concatenated remarks
    concatenated_remarks_df = pd.DataFrame(list(concatenated_remarks.items()),
                                           columns=['classification', 'concatenated_remarks'])
    return concatenated_remarks_df
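
# A shorter pandas-only sketch of the same aggregation (assuming the 'remark'
# column holds strings or NaN) might look like:
# concatenated_remarks_df = (
#     dataset.dropna(subset=['remark'])
#            .groupby('classification')['remark']
#            .apply(' '.join)
#            .reset_index(name='concatenated_remarks')
# )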

# !pip install dask -q

# Run the text-generation pipeline again to summarize the concatenated remarks
prompt1 = '''Can you summarize the following text in a paragraph of no more than 100 words. Don't respond with anything besides the summary.
Human input: {}'''

def summarize_text(comment):
    formatted_prompt = prompt1.format(comment)
    new_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=3000)
    new_result = new_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    return new_result

## Function for the first tab
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
# from multiprocessing import Pool
# num_processes = 4

# Import necessary libraries and functions here
# return_df = pd.DataFrame()
# final_analysed_df = pd.DataFrame()  # Initialize as None at the global scope

# Define a Gradio interface
def sentiment_distribution_interface(video_id):
    # global final_analysed_df
    # global unique_classifications

    # Call the execution function with the video_id
    return_df = execution_function(video_id)
    print(return_df.head())

    def process_row(row):  # ~3.9s per comment
        comment, classification, remark = process_comment(row.comments)
        return comment, classification, remark

    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust the number of workers as needed
        results = list(executor.map(process_row, return_df.itertuples()))

    print(type(results))
    print(results)
    print("__________________________________________________________________")

    comments, classification, remark = zip(*results)

    # Create a DataFrame from the separated data
    df = pd.DataFrame({'comments': comments, 'classification': classification, 'remark': remark})
    print(df.head())
    print("__________________________________________________________________")

    plot = return_distribution(df)
    word_cloud = return_highest_sentiment_wordcloud(df, df['classification'].value_counts().idxmax())

    df.to_csv('processed_comments.csv', index=False)  # index=False prevents writing row numbers as a column

    # Concatenating remarks for the different sentiments
    # concatenated_remarks_df = concatenate_remarks_based_on_classification(df)
    # print(concatenated_remarks_df)
    # final_analysed_df = df

    return plot, word_cloud  # Return the distribution plot and the word cloud

# Function for the second tab
def function_for_second_tab(input_val):
    final_analysed_df = pd.read_csv('processed_comments.csv')
    print(final_analysed_df.head())

    word_cloud = return_highest_sentiment_wordcloud(final_analysed_df, input_val)

    # Concatenate remarks per sentiment and pull the ones for the requested sentiment
    concatenated_remarks_df = concatenate_remarks_based_on_classification(final_analysed_df)
    comments = concatenated_remarks_df.loc[concatenated_remarks_df['classification'] == input_val, 'concatenated_remarks'].values[0]

    summarized_text = summarize_text(comments)
    extract_output_summary = summarized_text[0]['generated_text']
    final_extract = extract_output_summary.split('[/INST]')[1].strip()
    return word_cloud, final_extract

# Define the first tab
outputs = [gr.Plot(), gr.Plot()]
iface = gr.Interface(fn=sentiment_distribution_interface, inputs="text", outputs=outputs)

# Define the second tab
output_second_tab = [gr.Plot(), "text"]
inputs = "text"
description = "Enter the sentiment for which you want a detailed report"
app2 = gr.Interface(fn=function_for_second_tab, inputs=inputs, outputs=output_second_tab, description=description)

# Launch the app
demo = gr.TabbedInterface([iface, app2], ["Welcome page", "Visualization page"])

if __name__ == "__main__":
    demo.queue().launch()