# sentiment-croatian / synth.py
# Uploaded by dejanseo ("Upload 2 files", commit eb404b7, verified)
import csv
import torch
from transformers import pipeline
# Initialize the text-generation pipeline once for the whole script.
# On a CUDA machine the model runs on GPU 0 in half precision. Without CUDA,
# fall back to the CPU in float32 instead of failing on the hard-coded GPU
# ordinal.
_use_gpu = torch.cuda.is_available()
chatbot = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    torch_dtype=torch.float16 if _use_gpu else torch.float32,
    device=0 if _use_gpu else -1,  # -1 selects the CPU
)
# Sentiment classes requested from the model. A sentiment's position in this
# list is the integer label written next to the generated text in the CSV.
sentiments = [
    "Positive",
    "Neutral",
    "Negative",
]
# Content formats rotated through when building prompts (order matters: the
# generation loop indexes this list with a running counter).
formats = [
    "Feature Stories",
    "Instructional Manuals",
    "FAQs",
    "Policy Documents",
    "Live Stream Descriptions",
    "Editorial Content",
    "Research Papers",
    "User Manuals",
    "Commentaries",
    "Opinion Pieces",
    "Newsletters",
    "Online Courses",
    "Photo Essays",
    "Annual Reports",
    "User-Generated Content",
    "Testimonials",
    "DIY Content",
    "How-To Videos",
    "Campaign Reports",
    "Legal Briefs",
    "Blog Posts",
    "Case Studies",
    "Tutorials",
    "Interviews",
    "Press Releases",
    "eBooks",
    "Infographics",
    "Webinars",
    "Podcast Descriptions",
    "Video Scripts",
    "Advertisements",
    "Forum Discussions",
    "Whitepapers",
    "Surveys",
    "Product Reviews",
    "Event Summaries",
    "Opinion Editorials",
    "Letters to the Editor",
    "Round-Up Posts",
    "Buying Guides",
    "Checklists",
    "Cheat Sheets",
    "Recipes",
    "Travel Guides",
    "Profiles",
    "Lists",
    "Q&A Sessions",
    "Debates",
    "Polls",
]
# Topics rotated through when building prompts (order matters: the generation
# loop indexes this list with a running counter).
topics = [
    "Family",
    "Travel",
    "Politics",
    "Science",
    "Health",
    "Technology",
    "Sports",
    "Education",
    "Environment",
    "Economics",
    "Culture",
    "History",
    "Music",
    "Literature",
    "Food",
    "Art",
    "Fashion",
    "Entertainment",
    "Business",
    "Relationships",
    "Fitness",
    "Automotive",
    "Finance",
    "Real Estate",
    "Law",
    "Psychology",
    "Philosophy",
    "Religion",
    "Gardening",
    "DIY",
    "Hobbies",
    "Pets",
    "Career",
    "Marketing",
    "Customer Service",
    "Networking",
    "Innovation",
    "Artificial Intelligence",
    "Sustainability",
    "Social Issues",
    "Digital Media",
    "Programming",
    "Cybersecurity",
    "Astronomy",
    "Geography",
    "Travel Tips",
    "Cooking",
    "Parenting",
    "Productivity",
    "Mindfulness",
    "Mental Health",
    "Self-Improvement",
    "Leadership",
    "Teamwork",
    "Volunteering",
    "Nonprofits",
    "Gaming",
    "E-commerce",
    "Photography",
    "Videography",
    "Film",
    "Television",
    "Streaming Services",
    "Podcasts",
    "Public Speaking",
    "Event Planning",
    "Interior Design",
    "Architecture",
    "Urban Development",
    "Agriculture",
    "Climate Change",
    "Renewable Energy",
    "Space Exploration",
    "Biotechnology",
    "Cryptocurrency",
    "Blockchain",
    "Robotics",
    "Automated Systems",
    "Genetics",
    "Medicine",
    "Pharmacy",
    "Veterinary Science",
    "Marine Biology",
    "Ecology",
    "Conservation",
    "Wildlife",
    "Botany",
    "Zoology",
    "Geology",
    "Meteorology",
    "Aviation",
    "Maritime",
    "Logistics",
    "Supply Chain",
    "Human Resources",
    "Diversity and Inclusion",
    "Ethics",
    "Corporate Governance",
    "Public Relations",
    "Journalism",
    "Advertising",
    "Sales",
    "Customer Experience",
    "Retail",
    "Hospitality",
    "Tourism",
    "Luxury Goods",
    "Consumer Electronics",
    "Fashion Design",
    "Textiles",
    "Jewelry",
    "Cosmetics",
    "Skincare",
    "Perfume",
    "Toys",
    "Gadgets",
    "Home Appliances",
    "Furniture",
    "Home Improvement",
    "Landscaping",
    "Real Estate Investment",
]
# Output CSV: create (or truncate) the file and write the header row up front,
# so later code can simply append data rows.
csv_file = "sentences.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as header_file:
    header_writer = csv.writer(
        header_file,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    header_writer.writerow(["text", "label"])
# Function to ensure correct quoting
def ensure_correct_quoting(text):
    """Return *text* wrapped in double quotes unless it is already wrapped.

    A string counts as already wrapped only when it is at least two
    characters long and both starts and ends with a double quote; a lone
    '"' is therefore wrapped like any other text instead of being mistaken
    for a quoted (empty) string.

    NOTE(review): the caller passes the result to csv.writer, which applies
    its own quoting and escapes embedded quote characters, so the quotes
    added here become literal characters of the stored field -- confirm
    that is intended.
    """
    already_quoted = (
        len(text) >= 2 and text.startswith('"') and text.endswith('"')
    )
    if already_quoted:
        return text
    return f'"{text}"'
# Collect and save responses until the CSV holds TARGET_ROWS generated rows.
TARGET_ROWS = 100000
row_count = 0
# A single counter drives all three rotations: sentiment, content format and
# topic advance together on every prompt, each wrapping at its own list length.
prompt_index = 0

# Open the output once in append mode (the header row was written when the
# file was created) instead of reopening it for every row. Flushing after each
# write keeps the rows generated so far on disk if the run is interrupted.
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    while row_count < TARGET_ROWS:
        # idx is both the position in `sentiments` and the label stored in the CSV.
        idx = prompt_index % len(sentiments)
        sentiment = sentiments[idx]
        format_type = formats[prompt_index % len(formats)]
        topic = topics[prompt_index % len(topics)]
        prompt_index += 1

        # Build the prompt for the current sentiment / format / topic combination.
        prompt = f"Write a single sentence of web content in Croatian. Content type: {format_type}. Topic: {topic}. Sentiment: {sentiment}."
        response = chatbot(prompt, max_new_tokens=100)

        # Debug print to check response format
        print(f"Full model response: {response}")

        # Extract the generated text from the response structure.
        generated_text = response[0]['generated_text']

        # Drop any echo of the prompt and keep only the first line of what is left.
        clean_text = generated_text.replace(prompt, "").strip().split('\n')[0].strip()

        # An empty generation would become a labelled row with no text; skip it
        # and move on to the next prompt without counting it.
        if not clean_text:
            print(f"Empty generation for sentiment '{sentiment}' skipped.")
            continue

        # Wrap the text in double quotes unless it is already wrapped.
        correctly_quoted_text = ensure_correct_quoting(clean_text)

        writer.writerow([correctly_quoted_text, idx])
        file.flush()
        row_count += 1
        print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")

print("All responses saved. Total rows:", row_count)