"""Generate a sentiment-labelled CSV of Croatian sentences with an LLM.

Every sample is produced by prompting a text-generation pipeline for one
sentence with a given content format, topic and sentiment.  The sentence is
written to ``csv_file`` together with the position of its sentiment in
``sentiments``, which serves as the integer label.
"""

import csv
import itertools
from typing import Any, Callable, Iterator, TextIO, Tuple

# Sentiments and their labels (the list index is the label written to the CSV)
sentiments = ["Positive", "Neutral", "Negative"]

# List of content formats to cycle through
formats = [
    "Feature Stories", "Instructional Manuals", "FAQs", "Policy Documents",
    "Live Stream Descriptions", "Editorial Content", "Research Papers",
    "User Manuals", "Commentaries", "Opinion Pieces", "Newsletters",
    "Online Courses", "Photo Essays", "Annual Reports",
    "User-Generated Content", "Testimonials", "DIY Content", "How-To Videos",
    "Campaign Reports", "Legal Briefs", "Blog Posts", "Case Studies",
    "Tutorials", "Interviews", "Press Releases", "eBooks", "Infographics",
    "Webinars", "Podcast Descriptions", "Video Scripts", "Advertisements",
    "Forum Discussions", "Whitepapers", "Surveys", "Product Reviews",
    "Event Summaries", "Opinion Editorials", "Letters to the Editor",
    "Round-Up Posts", "Buying Guides", "Checklists", "Cheat Sheets",
    "Recipes", "Travel Guides", "Profiles", "Lists", "Q&A Sessions",
    "Debates", "Polls",
]

# List of topics to cycle through
topics = [
    "Family", "Travel", "Politics", "Science", "Health", "Technology",
    "Sports", "Education", "Environment", "Economics", "Culture", "History",
    "Music", "Literature", "Food", "Art", "Fashion", "Entertainment",
    "Business", "Relationships", "Fitness", "Automotive", "Finance",
    "Real Estate", "Law", "Psychology", "Philosophy", "Religion",
    "Gardening", "DIY", "Hobbies", "Pets", "Career", "Marketing",
    "Customer Service", "Networking", "Innovation",
    "Artificial Intelligence", "Sustainability", "Social Issues",
    "Digital Media", "Programming", "Cybersecurity", "Astronomy",
    "Geography", "Travel Tips", "Cooking", "Parenting", "Productivity",
    "Mindfulness", "Mental Health", "Self-Improvement", "Leadership",
    "Teamwork", "Volunteering", "Nonprofits", "Gaming", "E-commerce",
    "Photography", "Videography", "Film", "Television",
    "Streaming Services", "Podcasts", "Public Speaking", "Event Planning",
    "Interior Design", "Architecture", "Urban Development", "Agriculture",
    "Climate Change", "Renewable Energy", "Space Exploration",
    "Biotechnology", "Cryptocurrency", "Blockchain", "Robotics",
    "Automated Systems", "Genetics", "Medicine", "Pharmacy",
    "Veterinary Science", "Marine Biology", "Ecology", "Conservation",
    "Wildlife", "Botany", "Zoology", "Geology", "Meteorology", "Aviation",
    "Maritime", "Logistics", "Supply Chain", "Human Resources",
    "Diversity and Inclusion", "Ethics", "Corporate Governance",
    "Public Relations", "Journalism", "Advertising", "Sales",
    "Customer Experience", "Retail", "Hospitality", "Tourism",
    "Luxury Goods", "Consumer Electronics", "Fashion Design", "Textiles",
    "Jewelry", "Cosmetics", "Skincare", "Perfume", "Toys", "Gadgets",
    "Home Appliances", "Furniture", "Home Improvement", "Landscaping",
    "Real Estate Investment",
]

# Output file (written as UTF-8)
csv_file = "sentences.csv"

# Number of data rows to collect and the generation budget per sample
TARGET_ROWS = 100000
MAX_NEW_TOKENS = 100


def ensure_correct_quoting(text: str) -> str:
    """Return *text* wrapped in literal double quotes unless already wrapped.

    Kept for backward compatibility.  It is deliberately NOT applied to CSV
    fields any more: ``csv.writer`` does its own quoting, so pre-quoted text
    ends up on disk as ``\"\"\"text\"\"\"`` and is read back with stray quote
    characters inside the data.
    """
    # A lone '"' both starts and ends with a quote but is not a quoted string,
    # hence the explicit length check.
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        return text
    return f'"{text}"'


def _strip_wrapping_quotes(text: str) -> str:
    """Remove one pair of double quotes that encloses the whole of *text*."""
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        inner = text[1:-1]
        # '"A" and "B"' starts and ends with quotes that are not a pair;
        # leave such text untouched.
        if '"' not in inner:
            return inner.strip()
    return text


def build_prompt(format_type: str, topic: str, sentiment: str) -> str:
    """Return the generation prompt for one format/topic/sentiment triple."""
    return (
        "Write a single sentence of web content in Croatian. "
        f"Content type: {format_type}. Topic: {topic}. "
        f"Sentiment: {sentiment}."
    )


def extract_sentence(generated_text: str, prompt: str) -> str:
    """Return the first generated line, without prompt echo or wrapper quotes.

    The result is an empty string when the model produced nothing usable.
    """
    # The pipeline echoes the prompt in front of the completion; drop it and
    # keep only the first line of what remains.
    first_line = generated_text.replace(prompt, "").strip().split('\n')[0]
    return _strip_wrapping_quotes(first_line.strip())


def iter_samples(
    chatbot: Callable[..., Any],
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> Iterator[Tuple[str, int, str]]:
    """Yield ``(text, label, sentiment)`` tuples indefinitely.

    Sentiments, formats and topics are each cycled independently and advanced
    together, one step per model call.  Empty generations are skipped.

    Args:
        chatbot: Callable invoked as ``chatbot(prompt, max_new_tokens=...)``
            that returns a list whose first item has a ``'generated_text'``
            key.
        max_new_tokens: Generation budget forwarded to *chatbot*.
    """
    combos = zip(
        itertools.cycle(enumerate(sentiments)),
        itertools.cycle(formats),
        itertools.cycle(topics),
    )
    for (label, sentiment), format_type, topic in combos:
        prompt = build_prompt(format_type, topic, sentiment)
        response = chatbot(prompt, max_new_tokens=max_new_tokens)

        # Debug print to check response format
        print(f"Full model response: {response}")

        text = extract_sentence(response[0]['generated_text'], prompt)
        if not text:
            # An empty sentence carries no signal for its label; try the
            # next combination instead of writing a blank row.
            print(f"Empty response for sentiment '{sentiment}' skipped.")
            continue
        yield text, label, sentiment


def write_dataset(
    chatbot: Callable[..., Any],
    file: TextIO,
    target_rows: int = TARGET_ROWS,
    destination: str = csv_file,
) -> int:
    """Write a header plus *target_rows* generated rows to *file*.

    Args:
        chatbot: Generation callable, see :func:`iter_samples`.
        file: Writable text stream; for a real file open it with
            ``newline=''`` as the ``csv`` module requires.
        target_rows: Number of data rows to write.
        destination: Name shown in the progress messages.

    Returns:
        The number of data rows written.
    """
    # QUOTE_NONNUMERIC quotes every text field and leaves the integer label
    # bare, so no manual quoting of the sentence is needed.
    writer = csv.writer(
        file, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC
    )
    writer.writerow(["text", "label"])

    row_count = 0
    samples = itertools.islice(iter_samples(chatbot), target_rows)
    for text, label, sentiment in samples:
        writer.writerow([text, label])
        # Flush per row so an interrupted run keeps everything generated so
        # far (generation is slow; the flush cost is negligible).
        file.flush()
        row_count += 1
        print(
            f"Response for sentiment '{sentiment}' saved to {destination}. "
            f"Total rows: {row_count}"
        )
    return row_count


def build_chatbot() -> Callable[..., Any]:
    """Create the half-precision Mistral text-generation pipeline."""
    # Imported here rather than at module level so the helpers above stay
    # importable (and testable) without loading torch/transformers.
    import torch
    from transformers import pipeline

    return pipeline(
        "text-generation",
        model="mistralai/Mistral-7B-Instruct-v0.3",
        torch_dtype=torch.float16,
        device=0,  # Assuming you are using a GPU
    )


def main() -> None:
    """Generate ``TARGET_ROWS`` labelled sentences into ``csv_file``."""
    chatbot = build_chatbot()
    # Open once in write mode: truncates any previous run and avoids
    # reopening the file for every single row.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        row_count = write_dataset(chatbot, file, TARGET_ROWS, csv_file)
    print("All responses saved. Total rows:", row_count)


if __name__ == "__main__":
    main()