File size: 5,542 Bytes
eb404b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import csv
import torch
from transformers import pipeline

# Build the text-generation pipeline once, at module import time.
# Weights are requested in float16, and the pipeline is placed on device
# index 0 (a GPU is assumed to be present).
chatbot = pipeline(
    task="text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    device=0,
    torch_dtype=torch.float16,
)

# Sentiment classes requested in the prompt. The generation loop enumerates
# this list and writes each entry's index as the CSV label, so
# 0 = Positive, 1 = Neutral, 2 = Negative.
sentiments = ["Positive", "Neutral", "Negative"]

# Content formats inserted into the prompt as "Content type: ...".
# The generation loop walks this list with a running index taken modulo
# len(formats), so the formats are reused cyclically in the order given here.
formats = [
    "Feature Stories", "Instructional Manuals", "FAQs", "Policy Documents", "Live Stream Descriptions", 
    "Editorial Content", "Research Papers", "User Manuals", "Commentaries", "Opinion Pieces", 
    "Newsletters", "Online Courses", "Photo Essays", "Annual Reports", "User-Generated Content", 
    "Testimonials", "DIY Content", "How-To Videos", "Campaign Reports", "Legal Briefs",
    "Blog Posts", "Case Studies", "Tutorials", "Interviews", "Press Releases", 
    "eBooks", "Infographics", "Webinars", "Podcast Descriptions", "Video Scripts",
    "Advertisements", "Forum Discussions", "Whitepapers", "Surveys", "Product Reviews",
    "Event Summaries", "Opinion Editorials", "Letters to the Editor", "Round-Up Posts", 
    "Buying Guides", "Checklists", "Cheat Sheets", "Recipes", "Travel Guides",
    "Profiles", "Lists", "Q&A Sessions", "Debates", "Polls"
]

# Topics inserted into the prompt as "Topic: ...".
# The generation loop walks this list with a running index taken modulo
# len(topics), so the topics are reused cyclically in the order given here.
topics = [
    "Family", "Travel", "Politics", "Science", "Health", "Technology", "Sports", 
    "Education", "Environment", "Economics", "Culture", "History", "Music", 
    "Literature", "Food", "Art", "Fashion", "Entertainment", "Business", 
    "Relationships", "Fitness", "Automotive", "Finance", "Real Estate", "Law", 
    "Psychology", "Philosophy", "Religion", "Gardening", "DIY", "Hobbies", 
    "Pets", "Career", "Marketing", "Customer Service", "Networking", "Innovation", 
    "Artificial Intelligence", "Sustainability", "Social Issues", "Digital Media", 
    "Programming", "Cybersecurity", "Astronomy", "Geography", "Travel Tips", 
    "Cooking", "Parenting", "Productivity", "Mindfulness", "Mental Health", 
    "Self-Improvement", "Leadership", "Teamwork", "Volunteering", "Nonprofits", 
    "Gaming", "E-commerce", "Photography", "Videography", "Film", "Television", 
    "Streaming Services", "Podcasts", "Public Speaking", "Event Planning", 
    "Interior Design", "Architecture", "Urban Development", "Agriculture", 
    "Climate Change", "Renewable Energy", "Space Exploration", "Biotechnology", 
    "Cryptocurrency", "Blockchain", "Robotics", "Automated Systems", "Genetics", 
    "Medicine", "Pharmacy", "Veterinary Science", "Marine Biology", "Ecology", 
    "Conservation", "Wildlife", "Botany", "Zoology", "Geology", "Meteorology", 
    "Aviation", "Maritime", "Logistics", "Supply Chain", "Human Resources", 
    "Diversity and Inclusion", "Ethics", "Corporate Governance", "Public Relations", 
    "Journalism", "Advertising", "Sales", "Customer Experience", "Retail", 
    "Hospitality", "Tourism", "Luxury Goods", "Consumer Electronics", "Fashion Design", 
    "Textiles", "Jewelry", "Cosmetics", "Skincare", "Perfume", "Toys", "Gadgets", 
    "Home Appliances", "Furniture", "Home Improvement", "Landscaping", "Real Estate Investment"
]

# Start a fresh UTF-8 output file holding only the header row.
# Write mode truncates any file of the same name left by an earlier run.
csv_file = "sentences.csv"
with open(csv_file, 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_MINIMAL, quotechar='"', delimiter=',')
    writer.writerows([["text", "label"]])

# Function to ensure correct quoting
def ensure_correct_quoting(text: str) -> str:
    """Return *text* wrapped in exactly one pair of double quotes.

    Text that already starts and ends with a double quote is returned
    unchanged; anything else gets a leading and a trailing ``"`` added.
    A string shorter than two characters cannot hold both an opening and a
    closing quote, so a lone ``"`` is wrapped rather than treated as
    already quoted.

    NOTE: ``csv.writer`` applies its own quoting. If the result of this
    function is passed to ``csv.writer``, the quotes added here are treated
    as data and escaped (doubled) in the output file.

    Args:
        text: The string to wrap.

    Returns:
        The input, guaranteed to begin and end with ``"``.
    """
    already_wrapped = len(text) >= 2 and text.startswith('"') and text.endswith('"')
    if already_wrapped:
        return text
    return f'"{text}"'  # Add quotes if not already present

# Collect and save responses until TARGET_ROWS rows have been written.
#
# Each row comes from one model call: the prompt asks for a single Croatian
# sentence with a given content format, topic and sentiment, and the first
# line of the model's answer is stored together with the sentiment's index
# in `sentiments` as the label.
TARGET_ROWS = 100000
row_count = 0
format_index = 0
topic_index = 0

# The output file is opened once in append mode (the header row was written
# when the file was created) instead of being reopened for every row. Each
# row is flushed immediately so an interrupted run keeps what it has produced.
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    while row_count < TARGET_ROWS:
        for idx, sentiment in enumerate(sentiments):
            # NOTE: the format and topic indices advance in lockstep with the
            # sentiment, so the (sentiment, format, topic) combinations repeat
            # with period lcm(len(sentiments), len(formats), len(topics)).
            format_type = formats[format_index % len(formats)]
            format_index += 1
            topic = topics[topic_index % len(topics)]
            topic_index += 1

            # Add the current sentiment prompt with the format and topic
            prompt = f"Write a single sentence of web content in Croatian. Content type: {format_type}. Topic: {topic}. Sentiment: {sentiment}."

            response = chatbot(prompt, max_new_tokens=100)  # Adjusted max_new_tokens for longer responses

            # Debug print to check response format
            print(f"Full model response: {response}")

            # Extract the generated text from the response structure
            generated_text = response[0]['generated_text']

            # The output may echo the prompt; drop it, then keep only the
            # first line of what remains.
            clean_text = generated_text.replace(prompt, "").strip().split('\n')[0]

            # Write the text as-is: csv.writer adds and escapes quotes itself
            # when a field needs them. Wrapping the text in quotes beforehand
            # would make the writer escape those quotes, storing them as part
            # of the data ("""text""" in the file).
            writer.writerow([clean_text, idx])
            file.flush()

            row_count += 1
            print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")

            # Stop mid-cycle so the file never exceeds the target row count.
            if row_count >= TARGET_ROWS:
                break

print("All responses saved. Total rows:", row_count)