Upload 4 files
- .gitattributes +1 -0
- goodies/data.csv +3 -0
- goodies/sentiment.py +142 -0
- goodies/synth.py +53 -0
- goodies/train.py +143 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+goodies/data.csv filter=lfs diff=lfs merge=lfs -text
goodies/data.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:025591039882326919545ffe4e47a9285d3f567c617c7b061a4f335f8a3d3a2b
+size 11089589
goodies/sentiment.py
ADDED
@@ -0,0 +1,142 @@
+import streamlit as st
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import altair as alt
+from collections import OrderedDict
+import nltk
+from nltk.tokenize import sent_tokenize
+
+nltk.download('punkt')
+
+# Load the fine-tuned model and tokenizer from a local checkpoint
+model_name = 'C:/projects/sentiment/albert_sentiment_model/checkpoint-3000'
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# Sentiment labels as textual descriptions
+sentiment_labels = {
+    0: "very positive",
+    1: "positive",
+    2: "somewhat positive",
+    3: "neutral",
+    4: "somewhat negative",
+    5: "negative",
+    6: "very negative"
+}
+
+# Background colors for sentiment highlighting
+background_colors = {
+    "very positive": "rgba(0, 255, 0, 0.5)",
+    "positive": "rgba(0, 255, 0, 0.3)",
+    "somewhat positive": "rgba(0, 255, 0, 0.1)",
+    "neutral": "rgba(128, 128, 128, 0.1)",
+    "somewhat negative": "rgba(255, 0, 0, 0.1)",
+    "negative": "rgba(255, 0, 0, 0.3)",
+    "very negative": "rgba(255, 0, 0, 0.5)"
+}
+
+# Fetch a URL and return the concatenated text of its <p> elements
+def get_text_from_url(url):
+    response = requests.get(url)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, 'html.parser')
+        paragraphs = soup.find_all('p')
+        return ' '.join(p.get_text() for p in paragraphs)
+    return ""
+
+# Run the classifier on one piece of text and return the softmax scores
+def classify_text(text, max_length):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
+    return scores
+
+# Handle long texts by scoring chunks and averaging the results
+def classify_long_text(text):
+    max_length = tokenizer.model_max_length
+    # Chunks are sliced by character count; the tokenizer then truncates each chunk to max_length tokens
+    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
+    aggregate_scores = [0] * len(sentiment_labels)
+    chunk_scores_list = []
+    for chunk in chunks:
+        chunk_scores = classify_text(chunk, max_length)
+        chunk_scores_list.append(chunk_scores)
+        aggregate_scores = [x + y for x, y in zip(aggregate_scores, chunk_scores)]
+    # Average the scores across chunks
+    aggregate_scores = [x / len(chunks) for x in aggregate_scores]
+    return aggregate_scores, chunk_scores_list, chunks
+
+# Classify each sentence in the text individually
+def classify_sentences(text):
+    sentences = sent_tokenize(text)
+    sentence_scores = []
+    for sentence in sentences:
+        scores = classify_text(sentence, tokenizer.model_max_length)
+        sentiment_idx = scores.index(max(scores))
+        sentiment = sentiment_labels[sentiment_idx]
+        sentence_scores.append((sentence, sentiment))
+    return sentence_scores
+
+# Streamlit UI
+st.title("Sentiment Classification from URL")
+
+url = st.text_input("Enter URL:")
+if url:
+    text = get_text_from_url(url)
+    if text:
+        scores, chunk_scores_list, chunks = classify_long_text(text)
+        scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))}
+
+        # Ensure the exact order of labels in the chart
+        sentiment_order = [
+            "very positive", "positive", "somewhat positive",
+            "neutral",
+            "somewhat negative", "negative", "very negative"
+        ]
+        ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order)
+
+        # Prepare the DataFrame and reindex
+        df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)
+
+        # Use Altair to plot the bar chart
+        chart = alt.Chart(df.reset_index()).mark_bar().encode(
+            x=alt.X('index', sort=sentiment_order, title='Sentiment'),
+            y='Likelihood'
+        ).properties(
+            width=600,
+            height=400
+        )
+
+        st.altair_chart(chart, use_container_width=True)
+
+        # Display each chunk and its own chart
+        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
+            chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))}
+            ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order)
+            df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)
+
+            chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode(
+                x=alt.X('index', sort=sentiment_order, title='Sentiment'),
+                y='Likelihood'
+            ).properties(
+                width=600,
+                height=400
+            )
+
+            st.write(f"Chunk {i + 1}:")
+            st.write(chunk)
+            st.altair_chart(chunk_chart, use_container_width=True)
+
+        # Sentence-level classification with background colors
+        st.write("Extracted Text with Sentiment Highlights:")
+        sentence_scores = classify_sentences(text)
+        for sentence, sentiment in sentence_scores:
+            bg_color = background_colors[sentiment]
+            st.markdown(f'<span style="background-color: {bg_color}">{sentence}</span>', unsafe_allow_html=True)
+
+    else:
+        st.write("Could not extract text from the provided URL.")
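Note: the chunk handling in classify_long_text weights every chunk equally, so the document-level score is simply the element-wise mean of the per-chunk softmax distributions. A minimal self-contained sketch of that aggregation step, using made-up per-chunk distributions in place of real model output:

import torch

# Hypothetical softmax outputs for three chunks over the 7 sentiment labels
chunk_scores = torch.tensor([
    [0.60, 0.20, 0.10, 0.05, 0.03, 0.01, 0.01],
    [0.10, 0.50, 0.20, 0.10, 0.05, 0.03, 0.02],
    [0.05, 0.25, 0.40, 0.15, 0.10, 0.03, 0.02],
])

# Element-wise mean across chunks, as in classify_long_text
document_scores = chunk_scores.mean(dim=0)
print(document_scores)  # one value per label, still summing to 1.0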
goodies/synth.py
ADDED
@@ -0,0 +1,53 @@
+import os
+import csv
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+# Load the model and tokenizer from the local directory
+model_path = "C:\\models\\llama-3-8b-Instruct-bnb-4bit"
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path)
+
+# Parameters for generating data
+num_samples = 100000
+output_file = 'raw_data.csv'
+
+# Sentiment labels as textual descriptions
+sentiment_labels = {
+    0: "very positive",
+    1: "positive",
+    2: "somewhat positive",
+    3: "neutral",
+    4: "somewhat negative",
+    5: "negative",
+    6: "very negative"
+}
+
+# Create the output CSV file with a header if it does not exist yet
+if not os.path.exists(output_file):
+    with open(output_file, 'w', newline='', encoding='utf-8') as file:
+        writer = csv.writer(file)
+        writer.writerow(['text', 'label'])  # Write the header
+
+# Append raw generated data to the CSV file
+for i in range(num_samples):
+    label = i % len(sentiment_labels)  # Cycle labels 0-6 for a balanced dataset
+    sentiment = sentiment_labels[label]
+    # Build the prompt with the current sentiment label
+    prompt = f"Generate a short article on a random topic and writing style, ensuring the sentiment is {sentiment}. Write nothing but the article text. Do not include the sentiment in the text of the article."
+    print(f"Generating sample {i+1}/{num_samples}: {prompt}")  # Echo the prompt to the console for verification
+    input_ids = tokenizer.encode(prompt, return_tensors='pt')
+
+    # Generate a response from the model
+    output = model.generate(input_ids, max_length=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)
+
+    # Keep only the newly generated tokens; slicing the token ids is more
+    # reliable than slicing the decoded string by the prompt's character length
+    new_tokens = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True).strip()
+
+    # Append the generated text and numeric label to the CSV file
+    with open(output_file, 'a', newline='', encoding='utf-8') as file:
+        writer = csv.writer(file)
+        writer.writerow([new_tokens, label])  # Write each row as it is generated
+
+print(f"Data generation completed. Data appended to {output_file}")
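Note: because the label is derived as i % len(sentiment_labels), the generated dataset is class-balanced by construction. A quick self-contained check of that distribution (no model required):

from collections import Counter

num_samples = 100000
num_labels = 7

# Reproduce the label-cycling scheme from synth.py and count each class
counts = Counter(i % num_labels for i in range(num_samples))
print(counts)  # labels 0-4 occur 14286 times each, labels 5 and 6 occur 14285 times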
goodies/train.py
ADDED
@@ -0,0 +1,143 @@
+import os
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
+from transformers import DataCollatorWithPadding
+from datasets import load_metric, Dataset
+import torch
+import wandb
+
+# Tweakable parameters
+model_name = 'albert-base-v2'
+num_labels = 7  # Number of sentiment labels
+output_dir = './albert_sentiment_model'
+data_file = 'data.csv'
+wandb_entity = 'dejan'
+batch_size = 8
+num_train_epochs = 30
+learning_rate = 5e-5
+
+# Initialize wandb
+wandb.init(entity=wandb_entity, project="sentiment_classification")
+
+# Load and preprocess the dataset
+df = pd.read_csv(data_file, header=None, names=['text', 'label'])
+
+# Remove leading instructions and prompts (assuming we know the prompt structure)
+df['text'] = df['text'].apply(lambda x: x.split('Write nothing but the article text. Do not include the sentiment in the text of the article.')[-1].strip())
+
+# Display the cleaned data
+print(df.head())
+
+train_texts, val_texts, train_labels, val_labels = train_test_split(
+    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+train_encodings = tokenizer(train_texts, truncation=True, padding=True)
+val_encodings = tokenizer(val_texts, truncation=True, padding=True)
+
+train_dataset = Dataset.from_dict({
+    'input_ids': train_encodings['input_ids'],
+    'attention_mask': train_encodings['attention_mask'],
+    'labels': train_labels
+})
+
+val_dataset = Dataset.from_dict({
+    'input_ids': val_encodings['input_ids'],
+    'attention_mask': val_encodings['attention_mask'],
+    'labels': val_labels
+})
+
+# Define the data collator
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+# Define metrics
+accuracy_metric = load_metric("accuracy")
+precision_metric = load_metric("precision")
+recall_metric = load_metric("recall")
+f1_metric = load_metric("f1")
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = torch.argmax(torch.tensor(logits), dim=-1)
+    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
+    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')
+    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
+    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
+
+    wandb.log({
+        "eval_accuracy": accuracy["accuracy"],
+        "eval_precision": precision["precision"],
+        "eval_recall": recall["recall"],
+        "eval_f1": f1["f1"],
+    })
+
+    return {
+        "accuracy": accuracy["accuracy"],
+        "precision": precision["precision"],
+        "recall": recall["recall"],
+        "f1": f1["f1"],
+    }
+
+# Training arguments
+training_args = TrainingArguments(
+    output_dir=output_dir,
+    num_train_epochs=num_train_epochs,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir='./logs',
+    logging_steps=10,
+    evaluation_strategy="steps",
+    eval_steps=500,
+    save_strategy="steps",
+    save_steps=500,
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_loss",
+    learning_rate=learning_rate,
+    report_to="wandb",
+    lr_scheduler_type="linear",
+    logging_strategy="steps",
+)
+
+# Custom early stopping callback (distinct from the built-in transformers one)
+class EarlyStoppingCallback(TrainerCallback):
+    def __init__(self, patience=2):
+        self.patience = patience
+        self.best_metric = None
+        self.best_model_checkpoint = None
+        self.epochs_no_improve = 0
+
+    def on_evaluate(self, args, state, control, **kwargs):
+        eval_metric = kwargs['metrics'][training_args.metric_for_best_model]  # "eval_loss": lower is better
+        if self.best_metric is None or eval_metric < self.best_metric:
+            self.best_metric = eval_metric
+            self.best_model_checkpoint = state.global_step
+            self.epochs_no_improve = 0
+        else:
+            self.epochs_no_improve += 1
+            if self.epochs_no_improve >= self.patience:
+                print(f"Stopping early after {self.epochs_no_improve} evaluations with no improvement.")
+                control.should_training_stop = True
+
+# Trainer
+trainer = Trainer(
+    model=AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels),
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+    callbacks=[EarlyStoppingCallback(patience=2)]
+)
+
+# Train and save the final model
+trainer.train()
+trainer.save_model(output_dir)
+
+# Finalize wandb
+wandb.finish()
+
+print(f"Training completed. Model saved to {output_dir}")
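Note: after training, the saved model can be loaded for inference the same way sentiment.py loads its checkpoint. A minimal sketch, assuming ./albert_sentiment_model exists locally; since train.py saves only the model, the tokenizer is loaded from the base albert-base-v2 checkpoint, and the input sentence is a made-up example:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_dir = './albert_sentiment_model'  # written by trainer.save_model(output_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')  # train.py does not save the tokenizer

text = "The product exceeded every expectation."  # hypothetical input
inputs = tokenizer(text, return_tensors='pt', truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.argmax(dim=-1).item())  # predicted class 0-6, per the sentiment_labels mapping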