dejanseo committed on
Commit
de55574
1 Parent(s): f20c53e

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. goodies/data.csv +3 -0
  3. goodies/sentiment.py +142 -0
  4. goodies/synth.py +53 -0
  5. goodies/train.py +143 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ goodies/data.csv filter=lfs diff=lfs merge=lfs -text
goodies/data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:025591039882326919545ffe4e47a9285d3f567c617c7b061a4f335f8a3d3a2b
3
+ size 11089589
goodies/sentiment.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ import pandas as pd
7
+ import altair as alt
8
+ from collections import OrderedDict
9
+ import nltk
10
+ from nltk.tokenize import sent_tokenize
11
+
12
# Sentence splitting further down uses NLTK's Punkt data.
nltk.download('punkt')

# Load model and tokenizer
model_name = 'C:/projects/sentiment/albert_sentiment_model/checkpoint-3000'


@st.cache_resource
def _load_model_and_tokenizer(path):
    """Load the sequence classifier and its tokenizer from *path*.

    Streamlit re-executes the whole script on every user interaction, so the
    loaded objects are cached as a shared resource instead of being read from
    disk again on each rerun.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(path)
    loaded_tokenizer = AutoTokenizer.from_pretrained(path)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name)
18
+
19
+ # Sentiment labels as textual descriptions
20
# Maps the classifier's class index to a readable label; index 0 is the most
# positive class and index 6 the most negative.
sentiment_labels = dict(enumerate((
    "very positive",
    "positive",
    "somewhat positive",
    "neutral",
    "somewhat negative",
    "negative",
    "very negative",
)))
29
+
30
+ # Background colors for sentiments
31
# CSS background colour per sentiment label: green shades for positive,
# grey for neutral, red shades for negative; alpha grows with intensity.
background_colors = dict([
    ("very positive", "rgba(0, 255, 0, 0.5)"),
    ("positive", "rgba(0, 255, 0, 0.3)"),
    ("somewhat positive", "rgba(0, 255, 0, 0.1)"),
    ("neutral", "rgba(128, 128, 128, 0.1)"),
    ("somewhat negative", "rgba(255, 0, 0, 0.1)"),
    ("negative", "rgba(255, 0, 0, 0.3)"),
    ("very negative", "rgba(255, 0, 0, 0.5)"),
])
40
+
41
+ # Function to get text content from a URL
42
def get_text_from_url(url, timeout=10):
    """Fetch *url* and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The paragraph texts joined by single spaces, or ``""`` when the
        request fails (invalid URL, connection error, timeout) or the
        response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network/URL problems like any other "nothing extracted" case
        # so the caller can show its normal fallback message.
        return ""
    if response.status_code != 200:
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))
49
+
50
+ # Function to classify text
51
def classify_text(text, max_length):
    """Return the softmax class probabilities for *text* as a list of floats.

    The text is tokenized with truncation to *max_length* tokens and run
    through the module-level ``model`` without tracking gradients.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        # Only one text is encoded, so take the first (only) row.
        return probabilities[0].tolist()
57
+
58
+ # Function to handle long texts
59
def classify_long_text(text):
    """Classify *text* piecewise and average the per-chunk probabilities.

    The text is cut into consecutive slices of ``tokenizer.model_max_length``
    *characters* (not tokens); each slice is scored with ``classify_text``.

    Returns:
        A tuple ``(aggregate_scores, chunk_scores_list, chunks)`` where
        ``aggregate_scores`` is the per-class mean over all chunks,
        ``chunk_scores_list`` holds each chunk's scores and ``chunks`` the
        chunk strings. For empty *text* the result is all-zero scores and two
        empty lists.
    """
    max_length = tokenizer.model_max_length
    # Split the text into chunks
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
    if not chunks:
        # Nothing to classify; avoid dividing by zero below.
        return [0.0] * len(sentiment_labels), [], []

    chunk_scores_list = [classify_text(chunk, max_length) for chunk in chunks]
    # Average the scores class by class
    aggregate_scores = [
        sum(per_class) / len(chunks) for per_class in zip(*chunk_scores_list)
    ]
    return aggregate_scores, chunk_scores_list, chunks
72
+
73
+ # Function to classify each sentence in the text
74
def classify_sentences(text):
    """Label every sentence of *text* with its most likely sentiment.

    Returns:
        A list of ``(sentence, sentiment_label)`` tuples in sentence order.
    """
    limit = tokenizer.model_max_length
    labelled = []
    for sentence in sent_tokenize(text):
        probabilities = classify_text(sentence, limit)
        # Index of the highest probability (first one on ties).
        top_class = probabilities.index(max(probabilities))
        labelled.append((sentence, sentiment_labels[top_class]))
    return labelled
83
+
84
+ # Streamlit UI
85
import html

st.title("Sentiment Classification from URL")

# Ensure the exact order of labels in every graph
sentiment_order = [
    "very positive", "positive", "somewhat positive",
    "neutral",
    "somewhat negative", "negative", "very negative"
]


def _likelihood_chart(class_scores):
    """Build a bar chart of per-sentiment likelihoods in ``sentiment_order``.

    Args:
        class_scores: Sequence of scores indexed by class id, as returned by
            ``classify_text``.
    """
    by_label = {
        sentiment_labels[i]: class_scores[i]
        for i in range(len(sentiment_labels))
    }
    df = pd.DataFrame({
        'index': sentiment_order,
        'Likelihood': [by_label[label] for label in sentiment_order],
    })
    return alt.Chart(df).mark_bar().encode(
        x=alt.X('index', sort=sentiment_order, title='Sentiment'),
        y='Likelihood'
    ).properties(
        width=600,
        height=400
    )


url = st.text_input("Enter URL:")
if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)

        # Overall (chunk-averaged) distribution
        st.altair_chart(_likelihood_chart(scores), use_container_width=True)

        # Display each chunk and its own chart
        for number, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks), start=1):
            st.write(f"Chunk {number}:")
            st.write(chunk)
            st.altair_chart(_likelihood_chart(chunk_scores), use_container_width=True)

        # Sentence-level classification with background colors
        st.write("Extracted Text with Sentiment Highlights:")
        for sentence, sentiment in classify_sentences(text):
            bg_color = background_colors[sentiment]
            # The sentence comes from an arbitrary web page and is rendered as
            # raw HTML, so escape it to keep page text from being
            # interpreted as markup.
            safe_sentence = html.escape(sentence)
            st.markdown(f'<span style="background-color: {bg_color}">{safe_sentence}</span>', unsafe_allow_html=True)

    else:
        st.write("Could not extract text from the provided URL.")
goodies/synth.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ import torch
5
+
6
+ # Load the model and tokenizer from the local directory
7
# Raw string keeps the Windows path readable; the value is unchanged.
model_path = r"C:\models\llama-3-8b-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Parameters for generating data: number of rows to produce and the CSV
# they are appended to.
num_samples, output_file = 100_000, 'raw_data.csv'
14
+
15
+ # Sentiment labels as textual descriptions
16
# Numeric label written to the CSV -> wording inserted into the prompt.
sentiment_labels = dict(enumerate((
    "very positive",
    "positive",
    "somewhat positive",
    "neutral",
    "somewhat negative",
    "negative",
    "very negative",
)))
25
+
26
+ # Ensure output CSV file exists and create if not, with headers
27
header_needed = not os.path.exists(output_file)
if header_needed:
    # First run: create the file and write the column names once.
    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerow(['text', 'label'])
31
+
32
+ # Append raw generated data to the CSV file
33
for i in range(num_samples):
    label = i % len(sentiment_labels)  # Ensure labels cycle properly from 0 to 6
    sentiment = sentiment_labels[label]
    # Build the prompt with the sentiment wording for this sample
    prompt = f"Generate a short article on a random topic and writing style, ensuring the sentiment is {sentiment}. Write nothing but the article text. Do not include the sentiment in the text of the article."
    print(f"Generating sample {i+1}/{num_samples}: {prompt}")  # Output the prompt to console for verification
    # Keep the input on the same device as the model weights.
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

    # Generate response from the model (max_length counts prompt + new tokens)
    output = model.generate(input_ids, max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)

    # Get only the new tokens generated by the model. Slice by token count
    # rather than by len(prompt) characters: the decoded text is not
    # guaranteed to reproduce the prompt character-for-character, which would
    # leave prompt fragments in (or cut real text out of) the sample.
    generated_ids = output[0][input_ids.shape[-1]:]
    new_tokens = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Append the raw generated text and numeric label to the CSV file.
    # The file is reopened per row so finished samples survive an interruption.
    with open(output_file, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([new_tokens, label])  # Writing each row as it's generated

print(f"Data generation completed. Data appended to {output_file}")
goodies/train.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
5
+ from transformers import DataCollatorWithPadding
6
+ from datasets import load_metric, Dataset
7
+ import torch
8
+ import wandb
9
+
10
+ # Set tweakable parameters
11
# Model and data locations
model_name = 'albert-base-v2'
output_dir = './albert_sentiment_model'
data_file = 'data.csv'

# Task size: number of sentiment labels
num_labels = 7

# Optimisation hyper-parameters
batch_size = 8
num_train_epochs = 30
learning_rate = 5e-5

# Experiment tracking: start the wandb run under this account/team
wandb_entity = 'dejan'
wandb.init(project="sentiment_classification", entity=wandb_entity)
22
+
23
+ # Load and preprocess the dataset
24
raw_df = pd.read_csv(data_file, header=None, names=['text', 'label'])

# Labels must be integers. Coercing turns anything else (for example a
# leftover "text,label" header row) into NaN so it can be dropped, and rows
# whose text cell is empty are dropped too -- str.split would fail on them.
numeric_labels = pd.to_numeric(raw_df['label'], errors='coerce')
usable_rows = raw_df['text'].notna() & numeric_labels.notna()

# Remove leading instructions and prompts (assuming we know the prompt structure)
prompt_tail = 'Write nothing but the article text. Do not include the sentiment in the text of the article.'
df = pd.DataFrame({
    'text': raw_df.loc[usable_rows, 'text'].astype(str).apply(
        lambda x: x.split(prompt_tail)[-1].strip()
    ),
    'label': numeric_labels[usable_rows].astype(int),
}).reset_index(drop=True)

# Display the cleaned data
print(df.head())

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)
35
+
36
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Truncate only. Padding is left to the data collator defined below, which
# pads each batch to its own longest sequence; padding here would stretch
# every example to the longest text in the whole split.
train_encodings = tokenizer(train_texts, truncation=True)
val_encodings = tokenizer(val_texts, truncation=True)

train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})

# Define data collator (pads per batch)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
54
+
55
+ # Define metrics
56
# Load the four evaluation metrics in one pass, in the same order as before.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the `evaluate` package -- confirm against the pinned
# version.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)
60
+
61
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        The same values are also logged to wandb under ``eval_``-prefixed keys.
    """
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    common = {"predictions": predictions, "references": labels}

    results = {
        "accuracy": accuracy_metric.compute(**common)["accuracy"],
        "precision": precision_metric.compute(average='weighted', **common)["precision"],
        "recall": recall_metric.compute(average='weighted', **common)["recall"],
        "f1": f1_metric.compute(average='weighted', **common)["f1"],
    }

    wandb.log({f"eval_{name}": value for name, value in results.items()})
    return results
82
+
83
+ # Training arguments
84
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir=output_dir,
    logging_dir='./logs',
    report_to="wandb",
    # Optimisation schedule
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Logging cadence
    logging_strategy="steps",
    logging_steps=10,
    # Evaluate and checkpoint on the same 500-step cadence
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    # Reload the checkpoint with the best eval_loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)
104
+
105
+ # Early stopping callback
106
class EarlyStoppingCallback(TrainerCallback):
    """Stop training once the tracked evaluation metric stops improving.

    The metric name and its direction are taken from the ``args`` object the
    Trainer passes to the hook (``metric_for_best_model`` and
    ``greater_is_better``), so the callback does not depend on any
    module-level variable.

    Args:
        patience: Number of consecutive evaluations without improvement
            after which training is stopped.
    """

    def __init__(self, patience=2):
        self.patience = patience
        # Best metric value seen so far (None until the first evaluation).
        self.best_metric = None
        # Global step at which the best value was observed.
        self.best_model_checkpoint = None
        # Consecutive evaluations without improvement (name kept for
        # compatibility; it counts evaluations, not epochs).
        self.epochs_no_improve = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Update the best-metric bookkeeping and request a stop if stalled."""
        metrics = kwargs.get('metrics') or {}
        metric_name = args.metric_for_best_model or "eval_loss"
        if not metric_name.startswith("eval_"):
            metric_name = f"eval_{metric_name}"
        if metric_name not in metrics:
            # Nothing to compare against for this evaluation.
            return
        eval_metric = metrics[metric_name]

        if self.best_metric is None:
            improved = True
        elif getattr(args, "greater_is_better", False):
            improved = eval_metric > self.best_metric
        else:
            improved = eval_metric < self.best_metric

        if improved:
            self.best_metric = eval_metric
            self.best_model_checkpoint = state.global_step
            self.epochs_no_improve = 0
        else:
            self.epochs_no_improve += 1
            if self.epochs_no_improve >= self.patience:
                print(f"Stopping early after {self.epochs_no_improve} evaluations with no improvement.")
                control.should_training_stop = True
124
+
125
+ # Trainer
126
trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(patience=2)]
)

# Train and save the final model
trainer.train()
trainer.save_model(output_dir)
# Store the tokenizer next to the weights so that output_dir can be loaded on
# its own with AutoTokenizer.from_pretrained(output_dir).
tokenizer.save_pretrained(output_dir)

# Finalize wandb
wandb.finish()

print(f"Training completed. Model saved to {output_dir}")