dejanseo
/

sentiment-croatian

+import csv
+import torch
+from transformers import pipeline
+# Build the text-generation pipeline used to produce the synthetic sentences.
+# The weights are loaded as float16 and the pipeline is placed on device
+# index 0, so the script assumes a GPU is available.
+chatbot = pipeline(
+    task="text-generation",
+    model="mistralai/Mistral-7B-Instruct-v0.3",
+    device=0,
+    torch_dtype=torch.float16,
+)
+# Sentiment classes. The position of each entry is the integer label that the
+# generation loop writes to the CSV (0 = Positive, 1 = Neutral, 2 = Negative).
+sentiments = [
+    "Positive",
+    "Neutral",
+    "Negative",
+]
+# Content formats inserted into the prompt; the generation loop walks this
+# list in order and wraps around when it reaches the end.
+formats = [
+    "Feature Stories", "Instructional Manuals", "FAQs", "Policy Documents",
+    "Live Stream Descriptions", "Editorial Content", "Research Papers", "User Manuals",
+    "Commentaries", "Opinion Pieces", "Newsletters", "Online Courses",
+    "Photo Essays", "Annual Reports", "User-Generated Content", "Testimonials",
+    "DIY Content", "How-To Videos", "Campaign Reports", "Legal Briefs",
+    "Blog Posts", "Case Studies", "Tutorials", "Interviews",
+    "Press Releases", "eBooks", "Infographics", "Webinars",
+    "Podcast Descriptions", "Video Scripts", "Advertisements", "Forum Discussions",
+    "Whitepapers", "Surveys", "Product Reviews", "Event Summaries",
+    "Opinion Editorials", "Letters to the Editor", "Round-Up Posts", "Buying Guides",
+    "Checklists", "Cheat Sheets", "Recipes", "Travel Guides",
+    "Profiles", "Lists", "Q&A Sessions", "Debates",
+    "Polls",
+]
+# Topics inserted into the prompt; the generation loop walks this list in
+# order and wraps around when it reaches the end.
+topics = [
+    "Family", "Travel", "Politics", "Science", "Health",
+    "Technology", "Sports", "Education", "Environment", "Economics",
+    "Culture", "History", "Music", "Literature", "Food",
+    "Art", "Fashion", "Entertainment", "Business", "Relationships",
+    "Fitness", "Automotive", "Finance", "Real Estate", "Law",
+    "Psychology", "Philosophy", "Religion", "Gardening", "DIY",
+    "Hobbies", "Pets", "Career", "Marketing", "Customer Service",
+    "Networking", "Innovation", "Artificial Intelligence", "Sustainability", "Social Issues",
+    "Digital Media", "Programming", "Cybersecurity", "Astronomy", "Geography",
+    "Travel Tips", "Cooking", "Parenting", "Productivity", "Mindfulness",
+    "Mental Health", "Self-Improvement", "Leadership", "Teamwork", "Volunteering",
+    "Nonprofits", "Gaming", "E-commerce", "Photography", "Videography",
+    "Film", "Television", "Streaming Services", "Podcasts", "Public Speaking",
+    "Event Planning", "Interior Design", "Architecture", "Urban Development", "Agriculture",
+    "Climate Change", "Renewable Energy", "Space Exploration", "Biotechnology", "Cryptocurrency",
+    "Blockchain", "Robotics", "Automated Systems", "Genetics", "Medicine",
+    "Pharmacy", "Veterinary Science", "Marine Biology", "Ecology", "Conservation",
+    "Wildlife", "Botany", "Zoology", "Geology", "Meteorology",
+    "Aviation", "Maritime", "Logistics", "Supply Chain", "Human Resources",
+    "Diversity and Inclusion", "Ethics", "Corporate Governance", "Public Relations", "Journalism",
+    "Advertising", "Sales", "Customer Experience", "Retail", "Hospitality",
+    "Tourism", "Luxury Goods", "Consumer Electronics", "Fashion Design", "Textiles",
+    "Jewelry", "Cosmetics", "Skincare", "Perfume", "Toys",
+    "Gadgets", "Home Appliances", "Furniture", "Home Improvement", "Landscaping",
+    "Real Estate Investment",
+]
+# Start a fresh UTF-8 output file (mode 'w' truncates any earlier run) and
+# write the header row naming the two columns.
+csv_file = "sentences.csv"
+header_row = ["text", "label"]
+with open(csv_file, 'w', encoding='utf-8', newline='') as file:
+    csv.writer(
+        file,
+        quoting=csv.QUOTE_MINIMAL,
+        quotechar='"',
+        delimiter=',',
+    ).writerow(header_row)
+# Function to ensure correct quoting
+def ensure_correct_quoting(text):
+    """Return ``text`` wrapped in double quotes unless it is already wrapped.
+    A string counts as already wrapped only when it has at least two
+    characters and both its first and last characters are a double quote.
+    A lone double quote is one character that is both first and last, so it
+    is wrapped like any other text instead of being returned unbalanced.
+    Note: the added quotes are literal characters of the returned string.
+    When the result is handed to ``csv.writer`` they are escaped and stored
+    as part of the field value.
+    """
+    already_wrapped = len(text) >= 2 and text.startswith('"') and text.endswith('"')
+    if already_wrapped:
+        return text
+    return f'"{text}"'
+# Collect and save responses until reaching 100,000 rows.
+# The format and topic lists are advanced by one entry on every prompt and
+# wrap around when exhausted; the sentiment index doubles as the label.
+target_rows = 100000
+row_count = 0
+format_index = 0
+topic_index = 0
+# Keep one append handle open for the whole run instead of reopening the file
+# for every row; each row is flushed so an interrupted run keeps its progress.
+with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
+    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+    while row_count < target_rows:
+        for idx, sentiment in enumerate(sentiments):
+            format_type = formats[format_index % len(formats)]
+            format_index += 1
+            topic = topics[topic_index % len(topics)]
+            topic_index += 1
+            # Add the current sentiment prompt with the format and topic
+            prompt = f"Write a single sentence of web content in Croatian. Content type: {format_type}. Topic: {topic}. Sentiment: {sentiment}."
+            response = chatbot(prompt, max_new_tokens=100)  # Adjusted max_new_tokens for longer responses
+            # Debug print to check response format
+            print(f"Full model response: {response}")
+            # Extract the generated text from the response structure
+            generated_text = response[0]['generated_text']
+            # Drop the echoed prompt and keep only the first line of the reply
+            clean_text = generated_text.replace(prompt, "").strip().split('\n')[0].strip()
+            # Remove one pair of wrapping quotes if the model added them.
+            # csv.writer quotes a field itself whenever that is required, so
+            # adding literal quotes here would make them part of the stored
+            # sentence (written to disk as tripled quote characters).
+            if len(clean_text) >= 2 and clean_text.startswith('"') and clean_text.endswith('"'):
+                clean_text = clean_text[1:-1].strip()
+            # Skip empty generations so no row with an empty text field is
+            # written; the row counter only advances for rows actually saved.
+            if not clean_text:
+                print(f"Empty response for sentiment '{sentiment}' skipped.")
+                continue
+            # Append the clean response text to the CSV
+            writer.writerow([clean_text, idx])
+            file.flush()
+            row_count += 1
+            print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")
+            if row_count >= target_rows:
+                break
+print("All responses saved. Total rows:", row_count)

train.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import pandas as pd
+from sklearn.model_selection import train_test_split
+from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
+import torch
+from datasets import Dataset
+import wandb
+from sklearn.metrics import precision_recall_fscore_support, accuracy_score
+# Read the generated sentences from the CSV file.
+data = pd.read_csv('sentences.csv')
+# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
+# reproducible between runs.
+train_df, eval_df = train_test_split(data, random_state=42, test_size=0.2)
+# Wrap both DataFrames as Hugging Face datasets.
+train_dataset, eval_dataset = (Dataset.from_pandas(frame) for frame in (train_df, eval_df))
+# Load the same pretrained checkpoint as a sequence classifier with three
+# output labels and as the matching tokenizer.
+model_name = 'classla/bcms-bertic'
+model = ElectraForSequenceClassification.from_pretrained(
+    model_name,
+    num_labels=3,
+)
+tokenizer = ElectraTokenizer.from_pretrained(model_name)
+# Tokenize the datasets
+def tokenize_function(examples):
+    """Tokenize the 'text' entry of ``examples`` into fixed-length inputs.
+    Sequences are truncated and padded to a length of 128 tokens, and the
+    tokenizer's output is returned unchanged.
+    """
+    encode_options = {'truncation': True, 'padding': 'max_length', 'max_length': 128}
+    return tokenizer(examples['text'], **encode_options)
+# Tokenize both splits in batches (train first, then eval).
+train_dataset, eval_dataset = (
+    split.map(tokenize_function, batched=True) for split in (train_dataset, eval_dataset)
+)
+# Expose only the model inputs and the label, as PyTorch tensors.
+torch_columns = ['input_ids', 'attention_mask', 'label']
+train_dataset.set_format(type='torch', columns=torch_columns)
+eval_dataset.set_format(type='torch', columns=torch_columns)
+# Define the compute_metrics function
+def compute_metrics(p):
+    """Return accuracy and weighted precision, recall and F1 for ``p``.
+    ``p`` must expose ``predictions`` (scores whose last axis is reduced with
+    argmax to obtain the predicted class) and ``label_ids`` (the reference
+    labels). The result is a dict with the keys 'accuracy', 'precision',
+    'recall' and 'f1'.
+    """
+    predicted = p.predictions.argmax(-1)
+    labels = p.label_ids
+    precision, recall, f1, _support = precision_recall_fscore_support(
+        labels, predicted, average='weighted'
+    )
+    return {
+        'accuracy': accuracy_score(labels, predicted),
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+    }
+# Define the training arguments
+training_args = TrainingArguments(
+    # Output locations, run identity and reporting target
+    output_dir='./results',
+    logging_dir='./logs',
+    run_name='sentiment-classification',
+    report_to='wandb',
+    # Optimisation schedule
+    num_train_epochs=20,
+    learning_rate=1e-5,
+    warmup_steps=500,
+    weight_decay=0.01,
+    per_device_train_batch_size=128,
+    per_device_eval_batch_size=128,
+    # Evaluate and checkpoint once per epoch; the best checkpoint by accuracy
+    # is reloaded when training ends
+    evaluation_strategy='epoch',
+    save_strategy='epoch',
+    save_total_limit=20,
+    load_best_model_at_end=True,
+    metric_for_best_model='accuracy',
+    # Log every 10 steps
+    logging_steps=10,
+)
+# Initialize WandB. The run is created here, before the Trainer starts, so the
+# configured run name is passed explicitly along with the project and entity.
+wandb.init(
+    project="sentiment-classification",
+    entity="dejan",
+    name=training_args.run_name,
+)
+# Define Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=compute_metrics
+)
+# Train the model
+trainer.train()
+# Evaluate the model and show the final metrics instead of discarding them
+eval_metrics = trainer.evaluate()
+print("Final evaluation metrics:", eval_metrics)
+# Finish the WandB run
+wandb.finish()
+# Save the model and tokenizer side by side so the directory can be reloaded
+model.save_pretrained('./sentiment-model')
+tokenizer.save_pretrained('./sentiment-model')