File size: 4,572 Bytes
8eab63f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
from datasets import Dataset
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Read key material from a local .env file into the process environment,
# then capture the Gemini API key (None if it is not set anywhere).
load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")


def load_articles():
    """Return the list of source articles to be summarized.

    Placeholder implementation: yields two hard-coded sample articles.
    Swap this out for real data loading (files, DB, API) in production.
    """
    return [
        "Scientists have discovered a new species of deep-sea fish that can withstand extreme pressure. The fish, found at depths of over 8,000 meters, has unique adaptations including specialized cell membranes and pressure-resistant proteins. This discovery may lead to new applications in biotechnology and materials science.",
        "The city council voted yesterday to approve the new urban development plan. The plan includes affordable housing initiatives, expanded public transportation, and investments in green spaces. Critics argue that the plan doesn't address existing infrastructure problems, while supporters praise its forward-thinking approach to urban growth.",
    ]


def generate_styled_summary(text: str, style: str, model) -> str:
    """Summarize ``text`` in the requested ``style`` using a Gemini model.

    Args:
        text: Source text to summarize; an empty string short-circuits.
        style: Desired writing style, e.g. "formal" or "informal".
        model: An initialized Gemini generative model.

    Returns:
        The generated summary, or "" on empty input, an empty model
        response, or any generation error (errors are printed, not raised).
    """
    # Guard clause: nothing to summarize.
    if not text:
        return ""

    request = f"Summarize the following text in a {style} style:\n\n{text}"

    try:
        # Config construction, the API call, and .text access all stay inside
        # the try: any of them may fail and should degrade to "".
        config = genai.types.GenerationConfig(
            max_output_tokens=150,
            temperature=0.7,  # some variability between calls
            top_p=0.95,
            top_k=40,
        )
        response = model.generate_content(request, generation_config=config)
        if response.text:
            return response.text
        # Empty (e.g. filtered) response: warn with a short text preview.
        print(
            f"Warning: Empty response for style '{style}' and text: {text[:50]}..."
        )
        return ""
    except Exception as e:
        print(f"Error generating summary (style: {style}): {e}")
        return ""


def prepare_stylized_dataset():
    """Build a DataFrame of articles and their summaries in several styles.

    Each row holds the original text plus one column per style:
    ``summary_formal``, ``summary_informal``, ``summary_humorous``,
    ``summary_poetic``.

    Raises:
        ValueError: if no Gemini API key was found in the environment.
    """
    # Fail fast when the API key is absent — every row needs the model.
    if not GEMINI_API_KEY:
        raise ValueError(
            "GEMINI_API_KEY not found in environment variables or .env file"
        )

    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel("gemini-2.0-flash")

    rows = []
    for article in load_articles():
        row = {"text": article}
        for style in ("formal", "informal", "humorous", "poetic"):
            row[f"summary_{style}"] = generate_styled_summary(
                article, style, model
            )
        rows.append(row)

    return pd.DataFrame(rows)


def prepare_dataset_for_training(df, tokenizer, style="formal"):
    """Convert a stylized-summary dataframe into tokenized train/val splits.

    Args:
        df: DataFrame with a "text" column and a "summary_<style>" column.
        tokenizer: Hugging Face tokenizer matching the target seq2seq model.
        style: Which summary style column to use as training labels.

    Returns:
        Dict with "train" and "validation" tokenized datasets. Label
        sequences have pad positions replaced with -100 so the loss
        function ignores them.
    """
    pad_id = tokenizer.pad_token_id

    def preprocess_function(examples):
        # Prepend a style control token so the model learns style conditioning.
        inputs = [f"[{style.upper()}] {text}" for text in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

        # Tokenize summaries in target mode (correct special tokens for labels).
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(
                examples[f"summary_{style}"],
                max_length=128,
                truncation=True,
                padding="max_length",
            )

        # Bug fix: labels are padded to max_length, so pad token ids must be
        # masked with -100 — otherwise cross-entropy loss is computed over
        # padding positions and the model is penalized for them.
        model_inputs["labels"] = [
            [(tok if tok != pad_id else -100) for tok in label_ids]
            for label_ids in labels["input_ids"]
        ]
        return model_inputs

    # Convert to a Hugging Face dataset and split 80/20 into train/validation.
    dataset = Dataset.from_pandas(df)
    dataset = dataset.train_test_split(test_size=0.2)
    dataset = {"train": dataset["train"], "validation": dataset["test"]}

    # Tokenize each split in batches.
    tokenized_dataset = {}
    for split in dataset:
        tokenized_dataset[split] = dataset[split].map(preprocess_function, batched=True)

    return tokenized_dataset