dejanseo committed on
Commit
d424f07
1 Parent(s): 8edab9a

Upload synth.py

Browse files
Files changed (1) hide show
  1. synth.py +190 -0
synth.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import torch
3
+ from transformers import pipeline
4
+ import random
5
+
6
# Initialize the chatbot, using half-precision on the first GPU when CUDA is
# available. Hard-coding device=0 makes the script fail at start-up on a
# machine without a GPU, so fall back to the CPU (device=-1) and to float32
# there, because many CPU kernels do not support float16.
_use_gpu = torch.cuda.is_available()
chatbot = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    torch_dtype=torch.float16 if _use_gpu else torch.float32,
    device=0 if _use_gpu else -1,
)
13
+
14
# Sentiment classes requested from the model, one per generated row.
sentiments = [
    "Positive or Encouraging",
    "Neutral or Factual",
    "Negative or Toxic",
]
16
+
17
# Content formats that the prompts rotate through.
formats = [
    "Feature Stories",
    "Instructional Manuals",
    "FAQs",
    "Policy Documents",
    "Live Stream Descriptions",
    "Editorial Content",
    "Research Papers",
    "User Manuals",
    "Commentaries",
    "Opinion Pieces",
    "Newsletters",
    "Online Courses",
    "Photo Essays",
    "Annual Reports",
    "User-Generated Content",
    "Testimonials",
    "DIY Content",
    "How-To Videos",
    "Campaign Reports",
    "Legal Briefs",
    "Blog Posts",
    "Case Studies",
    "Tutorials",
    "Interviews",
    "Press Releases",
    "eBooks",
    "Infographics",
    "Webinars",
    "Podcast Descriptions",
    "Video Scripts",
    "Advertisements",
    "Forum Discussions",
    "Whitepapers",
    "Surveys",
    "Product Reviews",
    "Event Summaries",
    "Opinion Editorials",
    "Letters to the Editor",
    "Round-Up Posts",
    "Buying Guides",
    "Checklists",
    "Cheat Sheets",
    "Recipes",
    "Travel Guides",
    "Profiles",
    "Lists",
    "Q&A Sessions",
    "Debates",
    "Polls",
]
30
+
31
# Subject areas that the prompts rotate through.
topics = [
    "Family",
    "Travel",
    "Politics",
    "Science",
    "Health",
    "Technology",
    "Sports",
    "Education",
    "Environment",
    "Economics",
    "Culture",
    "History",
    "Music",
    "Literature",
    "Food",
    "Art",
    "Fashion",
    "Entertainment",
    "Business",
    "Relationships",
    "Fitness",
    "Automotive",
    "Finance",
    "Real Estate",
    "Law",
    "Psychology",
    "Philosophy",
    "Religion",
    "Gardening",
    "DIY",
    "Hobbies",
    "Pets",
    "Career",
    "Marketing",
    "Customer Service",
    "Networking",
    "Innovation",
    "Artificial Intelligence",
    "Sustainability",
    "Social Issues",
    "Digital Media",
    "Programming",
    "Cybersecurity",
    "Astronomy",
    "Geography",
    "Travel Tips",
    "Cooking",
    "Parenting",
    "Productivity",
    "Mindfulness",
    "Mental Health",
    "Self-Improvement",
    "Leadership",
    "Teamwork",
    "Volunteering",
    "Nonprofits",
    "Gaming",
    "E-commerce",
    "Photography",
    "Videography",
    "Film",
    "Television",
    "Streaming Services",
    "Podcasts",
    "Public Speaking",
    "Event Planning",
    "Interior Design",
    "Architecture",
    "Urban Development",
    "Agriculture",
    "Climate Change",
    "Renewable Energy",
    "Space Exploration",
    "Biotechnology",
    "Cryptocurrency",
    "Blockchain",
    "Robotics",
    "Automated Systems",
    "Genetics",
    "Medicine",
    "Pharmacy",
    "Veterinary Science",
    "Marine Biology",
    "Ecology",
    "Conservation",
    "Wildlife",
    "Botany",
    "Zoology",
    "Geology",
    "Meteorology",
    "Aviation",
    "Maritime",
    "Logistics",
    "Supply Chain",
    "Human Resources",
    "Diversity and Inclusion",
    "Ethics",
    "Corporate Governance",
    "Public Relations",
    "Journalism",
    "Advertising",
    "Sales",
    "Customer Experience",
    "Retail",
    "Hospitality",
    "Tourism",
    "Luxury Goods",
    "Consumer Electronics",
    "Fashion Design",
    "Textiles",
    "Jewelry",
    "Cosmetics",
    "Skincare",
    "Perfume",
    "Toys",
    "Gadgets",
    "Home Appliances",
    "Furniture",
    "Home Improvement",
    "Landscaping",
    "Real Estate Investment",
]
57
+
58
# Writing styles that the prompts rotate through.
styles = [
    "Super Casual",
    "Internet Slang",
    "Every Day",
    "Formal",
    "Conversational",
    "Bad Grammar and Spelling",
    "Lazy typing",
    "Professional",
    "Academic",
    "Technical",
    "Narrative",
    "Descriptive",
    "Analytical",
    "Critical",
    "Objective",
    "Subjective",
    "Third Person",
    "First Person",
    "Persuasive",
    "Informative",
    "Journalistic",
    "Reflective",
    "DM",
    "Social",
    "Informal",
    "Casual",
    "Colloquial",
]
67
+
68
# Opening phrases; one is drawn uniformly at random for every prompt.
# Each phrase appears exactly once so the draw is not skewed: the earlier
# version listed "Interestingly enough" and "Interestingly" twice, which
# doubled their sampling weight.
starting_phrases = [
    "Have you ever wondered", "Let's talk about", "It's interesting how",
    "Did you know", "The reality is", "Many people believe",
    "It's surprising that", "You might not know", "Let's dive into",
    "Here's the thing", "A common misconception is", "It's clear that",
    "Most people don't realize", "One thing to note is",
    "The fact is", "Consider this", "Here's an example",
    "Think about", "For instance", "To illustrate",
    "In my experience", "A key point is", "It's worth noting",
    "Let's explore", "Interestingly enough", "I want to highlight",
    "When it comes to", "The truth is", "Many experts agree",
    "Research shows", "Statistics indicate", "It's often said",
    "In reality", "From my perspective", "Surprisingly",
    "One thing I've noticed", "In recent studies", "Let's break down",
    "People often forget", "You should know", "Interestingly",
    "It turns out", "As it happens", "Experts suggest",
    "The surprising fact is", "It's commonly known", "Let's be honest",
    "The reality of", "It's fascinating that", "Have you noticed",
    "The thing is", "It's a fact that", "Let's not forget",
    "Studies have shown", "A notable point is", "It's often overlooked",
    "An important aspect is", "Let's take a closer look",
    "It's essential to understand", "Interestingly, research suggests",
    "One aspect to consider is", "It's beneficial to know",
    "It's worth considering", "The interesting thing is", "Let's examine",
    "A surprising fact is", "It's helpful to know", "One surprising element is",
    "Imagine this", "Here's a thought", "You might be surprised",
    "Think of it this way", "Here's an idea", "It's funny how",
    "Let me tell you", "Picture this", "The question is",
    "Believe it or not", "You won't believe", "Let's face it",
    "The best part is", "What's interesting is", "I discovered that",
    "It's amazing how", "The funny thing is", "Here's why",
    "What if I told you", "It's worth mentioning", "This reminds me of",
    "Let me explain", "Here's something new", "I realized that",
    "Have you seen", "You might enjoy", "I learned that",
    "It's clear to see", "What's fascinating is", "Here's a question",
    "I heard that", "The cool part is", "Here's what happened",
    "It appears that", "It's evident that", "Let me share",
    "You'll find that", "What's notable is", "Consider the fact that",
    "It's interesting to note", "Hello everyone", "Hi there",
    "Greetings", "Hey folks", "Good morning", "Good afternoon",
    "Good evening", "Hey", "What's up", "Hi", "Hello",
    "Amazing!", "Serious?", "Wow...", "That's pretty cool.",
    "Can you believe it?", "Unbelievable!", "Incredible!", "No way!",
    "Check this out", "Guess what?", "Surprise!", "Fascinating!",
    "Impressive!", "I don't get it?", "Really?", "What?",
    "Why?", "How come?", "Is that so?", "Are you sure?",
    "What do you think?", "By the way", "Just so you know",
    "For your information", "Incidentally", "On a side note",
    "As a reminder", "In addition", "Besides that",
    "While we're on the subject", "Speaking of which",
    "Have you", "Has anyone", "Would we", "Would it be",
    "OK, now", "OK but", "OK you", "OK nobody",
    "Here's a quick fact", "To put it simply", "Here's why this matters",
    "Let's consider", "Now, think about this", "Take this into account",
    "Here's something to think about", "On that note",
    "Just imagine", "That reminds me",
    "As it turns out", "Here's a fun fact", "The reality of it is",
    "By the way, did you know", "Speaking of",
    "Now, let's dive in", "You'll be surprised to know",
    "I recently discovered", "Would you believe", "Can you imagine",
    "What's more", "Even more interesting is",
]
131
+
132
# Create (or truncate) the output CSV and write its header row.
# The file is UTF-8 encoded and fields are quoted only when necessary.
csv_file = "sentences.csv"
with open(csv_file, "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(
        file,
        quoting=csv.QUOTE_MINIMAL,
        quotechar='"',
        delimiter=",",
    )
    writer.writerow(["text", "label"])
137
+
138
# Function to ensure correct quoting
def ensure_correct_quoting(text):
    """Return ``text`` wrapped in double quotes unless it is already wrapped.

    A string counts as already wrapped only when it is at least two
    characters long and both its first and last characters are ``"``.
    The length check matters: a lone ``"`` both starts and ends with a
    quote, yet it is not a quoted string, so it is wrapped like any other
    text.

    NOTE(review): ``csv.writer`` performs its own quoting and escapes
    embedded quote characters, so text returned by this function gains
    extra quotes when written through ``csv.writer``. Use this helper only
    for output that is not passed through the ``csv`` module.

    Args:
        text: The string to wrap.

    Returns:
        ``text`` unchanged if already wrapped, otherwise ``'"' + text + '"'``.
    """
    already_wrapped = (
        len(text) >= 2 and text.startswith('"') and text.endswith('"')
    )
    if already_wrapped:
        return text
    return f'"{text}"'
145
+
146
# Collect and save responses until reaching the target number of rows.
target_rows = 100000
row_count = 0
format_index = 0
topic_index = 0
style_index = 0

# Keep a single append handle open for the whole run instead of reopening the
# file for every row; each row is flushed so progress survives an interruption.
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    while row_count < target_rows:
        # Advance format/topic/style once per pass and reuse the combination
        # for every sentiment. Advancing them once per row kept them in
        # lockstep with the sentiment index, so any list whose length is a
        # multiple of len(sentiments) (the 27 styles, with 3 sentiments) was
        # only ever paired with a single label, leaking the label through
        # the style.
        format_type = formats[format_index % len(formats)]
        format_index += 1
        topic = topics[topic_index % len(topics)]
        topic_index += 1
        style = styles[style_index % len(styles)]
        style_index += 1

        for idx, sentiment in enumerate(sentiments):
            start_phrase = random.choice(starting_phrases)

            # Add the current sentiment prompt with the format, topic, and style
            prompt = f"Start your paragraph with '{start_phrase}'. Write a single paragraph of text. Format: {format_type}. Topic: {topic}. Vibe: {sentiment}. Style: {style}."

            response = chatbot(prompt, max_new_tokens=100)  # Adjusted max_new_tokens for longer responses

            # Debug print to check response format
            print(f"Full model response: {response}")

            # Extract the generated text from the response structure
            generated_text = response[0]['generated_text']

            # Remove any part of the prompt from the generated text if it
            # exists, then keep only the first line of what remains.
            clean_text = generated_text.replace(prompt, "").strip().split('\n')[0]

            # An empty generation carries no training signal; do not store it.
            if not clean_text:
                print(f"Empty response for sentiment '{sentiment}' skipped.")
                continue

            # Write the raw text: csv.writer adds and escapes quotes itself,
            # so wrapping the text in quotes beforehand would store literal
            # quote characters as part of every sentence.
            writer.writerow([clean_text, idx])
            file.flush()

            row_count += 1
            print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")

            if row_count >= target_rows:
                break

print("All responses saved. Total rows:", row_count)