# File size: 10,188 Bytes
# d424f07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import csv
import torch
from transformers import pipeline
import random

# Build the text-generation pipeline used for every prompt below.
# The weights are requested in half precision (float16) and the pipeline is
# pinned to device 0, so this script assumes a GPU is available.
_chatbot_options = {
    "model": "mistralai/Mistral-7B-Instruct-v0.3",
    "torch_dtype": torch.float16,
    "device": 0,
}
chatbot = pipeline("text-generation", **_chatbot_options)

# Sentiment descriptions inserted into the prompt as the "Vibe".
# The position of each entry in this list is the integer label written to the
# CSV (0, 1, 2), so the order must not change.
sentiments = [
    "Positive or Encouraging",
    "Neutral or Factual",
    "Negative or Toxic",
]

# Content formats inserted into the prompt as "Format"; the generation loop
# walks this list round-robin (index modulo its length).
formats = [
    "Feature Stories",
    "Instructional Manuals",
    "FAQs",
    "Policy Documents",
    "Live Stream Descriptions",
    "Editorial Content",
    "Research Papers",
    "User Manuals",
    "Commentaries",
    "Opinion Pieces",
    "Newsletters",
    "Online Courses",
    "Photo Essays",
    "Annual Reports",
    "User-Generated Content",
    "Testimonials",
    "DIY Content",
    "How-To Videos",
    "Campaign Reports",
    "Legal Briefs",
    "Blog Posts",
    "Case Studies",
    "Tutorials",
    "Interviews",
    "Press Releases",
    "eBooks",
    "Infographics",
    "Webinars",
    "Podcast Descriptions",
    "Video Scripts",
    "Advertisements",
    "Forum Discussions",
    "Whitepapers",
    "Surveys",
    "Product Reviews",
    "Event Summaries",
    "Opinion Editorials",
    "Letters to the Editor",
    "Round-Up Posts",
    "Buying Guides",
    "Checklists",
    "Cheat Sheets",
    "Recipes",
    "Travel Guides",
    "Profiles",
    "Lists",
    "Q&A Sessions",
    "Debates",
    "Polls",
]

# Subject areas inserted into the prompt as "Topic"; the generation loop walks
# this list round-robin (index modulo its length).
topics = [
    "Family",
    "Travel",
    "Politics",
    "Science",
    "Health",
    "Technology",
    "Sports",
    "Education",
    "Environment",
    "Economics",
    "Culture",
    "History",
    "Music",
    "Literature",
    "Food",
    "Art",
    "Fashion",
    "Entertainment",
    "Business",
    "Relationships",
    "Fitness",
    "Automotive",
    "Finance",
    "Real Estate",
    "Law",
    "Psychology",
    "Philosophy",
    "Religion",
    "Gardening",
    "DIY",
    "Hobbies",
    "Pets",
    "Career",
    "Marketing",
    "Customer Service",
    "Networking",
    "Innovation",
    "Artificial Intelligence",
    "Sustainability",
    "Social Issues",
    "Digital Media",
    "Programming",
    "Cybersecurity",
    "Astronomy",
    "Geography",
    "Travel Tips",
    "Cooking",
    "Parenting",
    "Productivity",
    "Mindfulness",
    "Mental Health",
    "Self-Improvement",
    "Leadership",
    "Teamwork",
    "Volunteering",
    "Nonprofits",
    "Gaming",
    "E-commerce",
    "Photography",
    "Videography",
    "Film",
    "Television",
    "Streaming Services",
    "Podcasts",
    "Public Speaking",
    "Event Planning",
    "Interior Design",
    "Architecture",
    "Urban Development",
    "Agriculture",
    "Climate Change",
    "Renewable Energy",
    "Space Exploration",
    "Biotechnology",
    "Cryptocurrency",
    "Blockchain",
    "Robotics",
    "Automated Systems",
    "Genetics",
    "Medicine",
    "Pharmacy",
    "Veterinary Science",
    "Marine Biology",
    "Ecology",
    "Conservation",
    "Wildlife",
    "Botany",
    "Zoology",
    "Geology",
    "Meteorology",
    "Aviation",
    "Maritime",
    "Logistics",
    "Supply Chain",
    "Human Resources",
    "Diversity and Inclusion",
    "Ethics",
    "Corporate Governance",
    "Public Relations",
    "Journalism",
    "Advertising",
    "Sales",
    "Customer Experience",
    "Retail",
    "Hospitality",
    "Tourism",
    "Luxury Goods",
    "Consumer Electronics",
    "Fashion Design",
    "Textiles",
    "Jewelry",
    "Cosmetics",
    "Skincare",
    "Perfume",
    "Toys",
    "Gadgets",
    "Home Appliances",
    "Furniture",
    "Home Improvement",
    "Landscaping",
    "Real Estate Investment",
]

# Writing styles inserted into the prompt as "Style"; the generation loop walks
# this list round-robin (index modulo its length).
styles = [
    "Super Casual",
    "Internet Slang",
    "Every Day",
    "Formal",
    "Conversational",
    "Bad Grammar and Spelling",
    "Lazy typing",
    "Professional",
    "Academic",
    "Technical",
    "Narrative",
    "Descriptive",
    "Analytical",
    "Critical",
    "Objective",
    "Subjective",
    "Third Person",
    "First Person",
    "Persuasive",
    "Informative",
    "Journalistic",
    "Reflective",
    "DM",
    "Social",
    "Informal",
    "Casual",
    "Colloquial",
]

# Opening phrases; one is drawn with random.choice for every prompt and the
# model is told to start its paragraph with it.
# Every phrase appears exactly once so that each has the same chance of being
# drawn (a repeated entry would be picked twice as often).
starting_phrases = [
    "Have you ever wondered", "Let's talk about", "It's interesting how", 
    "Did you know", "The reality is", "Many people believe", 
    "It's surprising that", "You might not know", "Let's dive into", 
    "Here's the thing", "A common misconception is", "It's clear that", 
    "Most people don't realize", "One thing to note is", 
    "The fact is", "Consider this", "Here's an example", 
    "Think about", "For instance", "To illustrate", 
    "In my experience", "A key point is", "It's worth noting", 
    "Let's explore", "Interestingly enough", "I want to highlight", 
    "When it comes to", "The truth is", "Many experts agree", 
    "Research shows", "Statistics indicate", "It's often said", 
    "In reality", "From my perspective", "Surprisingly", 
    "One thing I've noticed", "In recent studies", "Let's break down", 
    "People often forget", "You should know", "Interestingly", 
    "It turns out", "As it happens", "Experts suggest", 
    "The surprising fact is", "It's commonly known", "Let's be honest", 
    "The reality of", "It's fascinating that", "Have you noticed", 
    "The thing is", "It's a fact that", "Let's not forget", 
    "Studies have shown", "A notable point is", "It's often overlooked", 
    "An important aspect is", "Let's take a closer look", 
    "It's essential to understand", "Interestingly, research suggests", 
    "One aspect to consider is", "It's beneficial to know", 
    "It's worth considering", "The interesting thing is", "Let's examine", 
    "A surprising fact is", "It's helpful to know", "One surprising element is", 
    "Imagine this", "Here's a thought", "You might be surprised", 
    "Think of it this way", "Here's an idea", "It's funny how", 
    "Let me tell you", "Picture this", "The question is", 
    "Believe it or not", "You won't believe", "Let's face it", 
    "The best part is", "What's interesting is", "I discovered that", 
    "It's amazing how", "The funny thing is", "Here's why", 
    "What if I told you", "It's worth mentioning", "This reminds me of", 
    "Let me explain", "Here's something new", "I realized that", 
    "Have you seen", "You might enjoy", "I learned that", 
    "It's clear to see", "What's fascinating is", "Here's a question", 
    "I heard that", "The cool part is", "Here's what happened", 
    "It appears that", "It's evident that", "Let me share", 
    "You'll find that", "What's notable is", "Consider the fact that", 
    "It's interesting to note", "Hello everyone", "Hi there", 
    "Greetings", "Hey folks", "Good morning", "Good afternoon", 
    "Good evening", "Hey", "What's up", "Hi", "Hello", 
    "Amazing!", "Serious?", "Wow...", "That's pretty cool.", 
    "Can you believe it?", "Unbelievable!", "Incredible!", "No way!", 
    "Check this out", "Guess what?", "Surprise!", "Fascinating!", 
    "Impressive!", "I don't get it?", "Really?", "What?", 
    "Why?", "How come?", "Is that so?", "Are you sure?", 
    "What do you think?", "By the way", "Just so you know", 
    "For your information", "Incidentally", "On a side note", 
    "As a reminder", "In addition", "Besides that", 
    "While we're on the subject", "Speaking of which", 
    "Have you", "Has anyone", "Would we", "Would it be", 
    "OK, now", "OK but", "OK you", "OK nobody",
    "Here's a quick fact", "To put it simply", "Here's why this matters", 
    "Let's consider", "Now, think about this", "Take this into account", 
    "Here's something to think about", "On that note", 
    "Just imagine", "That reminds me", 
    "As it turns out", "Here's a fun fact", "The reality of it is", 
    "By the way, did you know", "Speaking of", 
    "Now, let's dive in", "You'll be surprised to know", 
    "I recently discovered", "Would you believe", "Can you imagine", 
    "What's more", "Even more interesting is"
]

# Start the output file from scratch: opening in "w" mode truncates any
# existing sentences.csv, then the single header row is written.
# UTF-8 encoded; fields are quoted only when the csv module finds it necessary.
csv_file = "sentences.csv"
with open(csv_file, "w", encoding="utf-8", newline="") as header_out:
    csv.writer(
        header_out,
        quoting=csv.QUOTE_MINIMAL,
        quotechar='"',
        delimiter=",",
    ).writerow(["text", "label"])

# Function to ensure correct quoting
def ensure_correct_quoting(text):
    """Return *text* wrapped in one pair of double quotes.

    Text that already begins and ends with a double quote is returned
    unchanged; anything else gets a quote added at each end.

    Args:
        text: The string to wrap.

    Returns:
        The quoted string.

    Note:
        The result contains literal quote characters. A ``csv.writer`` escapes
        such characters when writing the field, so they end up as part of the
        stored value rather than acting as CSV field delimiters.
    """
    # A lone '"' both starts and ends with a quote, but it is not a quoted
    # pair, so at least two characters are required.
    already_quoted = len(text) >= 2 and text.startswith('"') and text.endswith('"')
    return text if already_quoted else f'"{text}"'

# Collect and save responses until reaching 100,000 rows
row_count = 0
format_index = 0
topic_index = 0
style_index = 0

# Keep a single append handle open for the whole run instead of reopening the
# file for every row; each row is flushed right after it is written so an
# interrupted run still keeps everything generated so far.
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    # The writer does all the CSV quoting. The text must NOT be pre-wrapped in
    # quotes: the writer would escape them and every stored text would then
    # begin and end with a literal quote character.
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    while row_count < 100000:
        # Choose the format/topic/style once per pass and reuse it for every
        # sentiment. Advancing these counters once per row kept them in
        # lockstep with the sentiment index, so any list whose length is a
        # multiple of len(sentiments) always paired a given entry with the
        # same label.
        format_type = formats[format_index % len(formats)]
        format_index += 1
        topic = topics[topic_index % len(topics)]
        topic_index += 1
        style = styles[style_index % len(styles)]
        style_index += 1

        # idx (the sentiment's position in the list) is the label column.
        for idx, sentiment in enumerate(sentiments):
            start_phrase = random.choice(starting_phrases)

            # Add the current sentiment prompt with the format, topic, and style
            prompt = f"Start your paragraph with '{start_phrase}'. Write a single paragraph of text. Format: {format_type}. Topic: {topic}. Vibe: {sentiment}. Style: {style}."

            response = chatbot(prompt, max_new_tokens=100)  # Adjusted max_new_tokens for longer responses

            # Debug print to check response format
            print(f"Full model response: {response}")

            # Extract the generated text from the response structure
            generated_text = response[0]['generated_text']

            # Drop the echoed prompt, then keep only the first line of what is left
            clean_text = generated_text.replace(prompt, "").strip().split('\n')[0]

            # An empty generation would become a labelled row with no text;
            # skip it so it does not count towards the target.
            if not clean_text:
                print(f"Empty response for sentiment '{sentiment}'; skipping.")
                continue

            # Append the clean response text and its label to the CSV
            writer.writerow([clean_text, idx])
            file.flush()

            row_count += 1
            print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")

            if row_count >= 100000:
                break

print("All responses saved. Total rows:", row_count)