import os

import pandas as pd
from datasets import Dataset
import google.generativeai as genai
from dotenv import load_dotenv

# Load environment variables (including the Gemini API key) from a .env file
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")


def load_articles():
    """
    Load articles for summarization.

    Replace this with your actual data-loading logic.
    """
    # Placeholder data - replace with actual data loading
    sample_articles = [
        "Scientists have discovered a new species of deep-sea fish that can withstand extreme pressure. The fish, found at depths of over 8,000 meters, has unique adaptations including specialized cell membranes and pressure-resistant proteins. This discovery may lead to new applications in biotechnology and materials science.",
        "The city council voted yesterday to approve the new urban development plan. The plan includes affordable housing initiatives, expanded public transportation, and investments in green spaces. Critics argue that the plan doesn't address existing infrastructure problems, while supporters praise its forward-thinking approach to urban growth.",
    ]
    return sample_articles


def generate_styled_summary(text: str, style: str, model) -> str:
    """
    Generate a summary of the given text in the specified style using Gemini.

    Args:
        text: The text to summarize.
        style: The desired style (e.g., "formal", "informal").
        model: The initialized Gemini model.

    Returns:
        The generated summary, or an empty string if an error occurred.
    """
    if not text:
        return ""  # Handle empty input

    prompt = f"Summarize the following text in a {style} style:\n\n{text}"
    try:
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                max_output_tokens=150,
                temperature=0.7,  # Some variability between runs
                top_p=0.95,
                top_k=40,
            ),
        )
        if response.text:
            return response.text
        else:
            print(
                f"Warning: Empty response for style '{style}' and text: {text[:50]}..."
            )  # Show the first 50 characters of the input
            return ""
    except Exception as e:
        print(f"Error generating summary (style: {style}): {e}")
        return ""


def prepare_stylized_dataset():
    """
    Create a dataset pairing each article with summaries in several styles.

    Format: [{"text": original_text, "summary_formal": ..., "summary_informal": ..., ...}]
    """
    # Configure the Gemini API
    if not GEMINI_API_KEY:
        raise ValueError(
            "GEMINI_API_KEY not found in environment variables or .env file"
        )
    genai.configure(api_key=GEMINI_API_KEY)

    # Initialize the model
    model = genai.GenerativeModel("gemini-2.0-flash")

    # Generate one summary per style for each article
    data = []
    for article in load_articles():
        entry = {"text": article}
        for style in ["formal", "informal", "humorous", "poetic"]:
            summary = generate_styled_summary(article, style, model)
            entry[f"summary_{style}"] = summary
        data.append(entry)

    return pd.DataFrame(data)
def prepare_dataset_for_training(df, tokenizer, style="formal"): | |
"""Convert dataframe to format suitable for training""" | |
def preprocess_function(examples): | |
# Prepend style token to input | |
inputs = [f"[{style.upper()}] {text}" for text in examples["text"]] | |
model_inputs = tokenizer(inputs, max_length=1024, truncation=True) | |
# Tokenize summaries | |
with tokenizer.as_target_tokenizer(): | |
labels = tokenizer( | |
examples[f"summary_{style}"], | |
max_length=128, | |
truncation=True, | |
padding="max_length", | |
) | |
model_inputs["labels"] = labels["input_ids"] | |
return model_inputs | |
# Convert to Hugging Face dataset | |
dataset = Dataset.from_pandas(df) | |
# Split dataset into train and validation | |
dataset = dataset.train_test_split(test_size=0.2) | |
dataset = {"train": dataset["train"], "validation": dataset["test"]} | |
# Tokenize dataset | |
tokenized_dataset = {} | |
for split in dataset: | |
tokenized_dataset[split] = dataset[split].map(preprocess_function, batched=True) | |
return tokenized_dataset | |
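

# A minimal usage sketch, not part of the original pipeline: it assumes a
# seq2seq checkpoint (here "facebook/bart-large-cnn", chosen as an example)
# and simply wires the two helpers together. Swap in your own tokenizer and
# data loading as needed.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    # Build the stylized dataset with Gemini (requires GEMINI_API_KEY)
    df = prepare_stylized_dataset()

    # Tokenize it for fine-tuning on the "formal" style
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    tokenized = prepare_dataset_for_training(df, tokenizer, style="formal")
    print(tokenized["train"])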