# amine-araich's picture
# first commit
# 8eab63f verified
import pandas as pd
from datasets import Dataset
import google.generativeai as genai
from dotenv import load_dotenv
import os
# Load environment variables from .env file
load_dotenv()
# Gemini API key pulled from the environment; may be None here — presence is
# checked later in prepare_stylized_dataset(), so importing this module never fails.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
def load_articles():
    """
    Return the list of articles to be summarized.

    Placeholder implementation that yields two hard-coded sample articles;
    swap in real data-loading logic (file, database, API) for production use.
    """
    # NOTE: placeholder content — replace with actual data loading.
    return [
        "Scientists have discovered a new species of deep-sea fish that can withstand extreme pressure. The fish, found at depths of over 8,000 meters, has unique adaptations including specialized cell membranes and pressure-resistant proteins. This discovery may lead to new applications in biotechnology and materials science.",
        "The city council voted yesterday to approve the new urban development plan. The plan includes affordable housing initiatives, expanded public transportation, and investments in green spaces. Critics argue that the plan doesn't address existing infrastructure problems, while supporters praise its forward-thinking approach to urban growth.",
    ]
def generate_styled_summary(text: str, style: str, model) -> str:
    """
    Ask Gemini to summarize *text* in the requested *style*.

    Args:
        text: Source text to summarize.
        style: Desired tone, e.g. "formal" or "informal".
        model: An initialized Gemini model instance.

    Returns:
        The summary string, or "" on empty input, empty response, or any
        generation error (errors are printed, never raised).
    """
    # Guard clause: nothing to summarize.
    if not text:
        return ""

    prompt = f"Summarize the following text in a {style} style:\n\n{text}"
    try:
        # Keep the whole request/response access inside the try: the SDK can
        # raise both on the call and when reading `.text` (e.g. blocked content).
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                max_output_tokens=150,
                temperature=0.7,  # Add temperature for some variability
                top_p=0.95,  # Add top_p
                top_k=40,  # Add top_k
            ),
        )
        summary = response.text
    except Exception as e:
        print(f"Error generating summary (style: {style}): {e}")
        return ""

    if summary:
        return summary
    print(
        f"Warning: Empty response for style '{style}' and text: {text[:50]}..."
    )  # Show first 50 chars
    return ""
def prepare_stylized_dataset():
    """
    Build a DataFrame pairing each article with Gemini summaries in four styles.

    Each row has the shape:
    {"text": ..., "summary_formal": ..., "summary_informal": ...,
     "summary_humorous": ..., "summary_poetic": ...}

    Returns:
        pandas.DataFrame with one row per article.

    Raises:
        ValueError: if GEMINI_API_KEY is not set in the environment or .env file.
    """
    # Fail fast when the API key is missing.
    if not GEMINI_API_KEY:
        raise ValueError(
            "GEMINI_API_KEY not found in environment variables or .env file"
        )
    genai.configure(api_key=GEMINI_API_KEY)

    # Initialize the model once and reuse it for every article/style pair.
    model = genai.GenerativeModel("gemini-2.0-flash")

    styles = ("formal", "informal", "humorous", "poetic")
    rows = []
    for article in load_articles():
        row = {"text": article}
        for tone in styles:
            row[f"summary_{tone}"] = generate_styled_summary(article, tone, model)
        rows.append(row)
    return pd.DataFrame(rows)
def prepare_dataset_for_training(df, tokenizer, style="formal"):
    """
    Turn a stylized-summary dataframe into tokenized train/validation splits.

    Args:
        df: DataFrame with a "text" column and a "summary_<style>" column.
        tokenizer: A seq2seq tokenizer (must support as_target_tokenizer()).
        style: Which summary style column to train against.

    Returns:
        dict with "train" and "validation" keys, each a tokenized HF Dataset.
    """

    def _tokenize(examples):
        # Prefix a style token so one model can serve multiple styles.
        styled_inputs = [f"[{style.upper()}] {text}" for text in examples["text"]]
        features = tokenizer(styled_inputs, max_length=1024, truncation=True)
        # Tokenize the target summaries for the chosen style.
        with tokenizer.as_target_tokenizer():
            targets = tokenizer(
                examples[f"summary_{style}"],
                max_length=128,
                truncation=True,
                padding="max_length",
            )
        features["labels"] = targets["input_ids"]
        return features

    # Convert to a Hugging Face dataset and carve out a 20% validation split.
    hf_dataset = Dataset.from_pandas(df)
    split = hf_dataset.train_test_split(test_size=0.2)
    splits = {"train": split["train"], "validation": split["test"]}

    # Tokenize every split in batched mode.
    return {
        name: subset.map(_tokenize, batched=True) for name, subset in splits.items()
    }