Spaces:

satyaki-mitra
/

Text_Authenticator

Running

App Files Files Community

Text_Authenticator / evaluation /generate_ai_data.py

satyaki-mitra

Evaluation added

4466506 about 1 month ago

raw

history blame contribute delete

9.09 kB

	# DEPENDENCIES
	import json
	import requests
	from tqdm import tqdm
	from pathlib import Path
	from loguru import logger



	# Configuration
	# MODEL_NAME is sent as the "model" field of every Ollama request.
	MODEL_NAME = "mistral:7b"
	# Root folder scanned for per-domain sub-directories of human-written samples.
	HUMAN_DIR = Path("evaluation/human")
	# Root folder that receives the generated samples, one sub-directory per domain.
	AI_DIR = Path("evaluation/ai_generated")

	# Sampling parameters per domain.
	# Domains that are not listed here are served by the "default" entry.
	GENERATION_PARAMS = {
	    "academic": {"temperature": 0.5, "top_p": 0.85},
	    "creative": {"temperature": 0.9, "top_p": 0.95},
	    "legal": {"temperature": 0.3, "top_p": 0.80},
	    "social_media": {"temperature": 0.8, "top_p": 0.95},
	    "blog_personal": {"temperature": 0.8, "top_p": 0.95},
	    "marketing": {"temperature": 0.7, "top_p": 0.90},
	    "journalism": {"temperature": 0.6, "top_p": 0.85},
	    "default": {"temperature": 0.7, "top_p": 0.9},
	}

	# Generation prompt for each supported domain.
	# Keys are matched against the names of the sub-directories of HUMAN_DIR;
	# a domain directory without a key here is skipped by main().
	PROMPTS = {
	    "general": "Write a comprehensive 500-1000 word informative article explaining a common topic that would appear in an encyclopedia. Use clear, neutral language with proper structure.",
	    "academic": "Write a formal research abstract (500-1000 words) for a scientific study. Include: background context, research methodology, key findings, and implications. Use academic vocabulary and formal tone appropriate for peer-reviewed publication.",
	    "creative": "Write a creative narrative passage (500-1000 words) with vivid descriptions, engaging storytelling, and literary devices. Focus on character, setting, or emotion with rich sensory details.",
	    "ai_ml": "Write a technical explanation (500-1000 words) of a machine learning concept or recent AI advancement. Include mathematical intuition, practical applications, and current research directions. Use precise technical terminology.",
	    "software_dev": "Write developer documentation (500-1000 words) explaining the implementation of a software design pattern or architectural principle. Include code examples, use cases, trade-offs, and best practices for professional developers.",
	    "technical_doc": "Write comprehensive API documentation (500-1000 words) for a REST endpoint. Include: endpoint description, request/response parameters with data types, authentication requirements, example requests, error codes, and usage notes. Use Markdown formatting.",
	    "engineering": "Write an engineering technical report excerpt (500-1000 words) analyzing a system design or technical solution. Include specifications, performance analysis, design constraints, and recommendations.",
	    "science": "Write a scientific explanation (500-1000 words) of a natural phenomenon or research finding. Include underlying mechanisms, experimental evidence, and real-world implications. Use precise scientific terminology.",
	    "business": "Write a professional business analysis (500-1000 words) covering market trends, competitive landscape, or strategic insights. Use business terminology, data-driven arguments, and executive-level language.",
	    "legal": "Draft a formal legal document excerpt (500-1000 words) such as a contract clause, terms of service, or policy statement. Use precise legal terminology, proper structure, and formal language appropriate for legal documents.",
	    "medical": "Write a clinical case description or medical research abstract (500-1000 words) with appropriate medical terminology. Include patient presentation, diagnostic approach, treatment, and outcomes or research methodology and findings.",
	    "journalism": "Write a journalistic news article (500-1000 words) in neutral reporting style. Include a compelling lead, factual reporting, quotes from sources, and balanced coverage. Follow AP style conventions.",
	    "marketing": "Write persuasive marketing content (500-1000 words) for a technology product or service. Include compelling value propositions, benefit-focused copy, clear calls to action, and engaging language that converts readers.",
	    "social_media": "Write 5-7 engaging social media posts (500-1000 words total) discussing a technology trend. Use informal conversational tone, include relevant hashtags, emojis where appropriate, and encourage engagement. Mix different post types.",
	    "blog_personal": "Write a personal blog post (500-1000 words) sharing personal experiences, opinions, or reflections on a topic. Use first-person perspective, informal conversational tone, and authentic voice.",
	    "tutorial": "Write a comprehensive step-by-step tutorial (500-1000 words) teaching beginners how to accomplish a specific technical task. Use clear numbered steps, explanatory notes, common pitfalls, and helpful tips.",
	}


	def generate_with_ollama(prompt: str, domain: str, max_tokens: int = 600, timeout: float = 60) -> str:
	    """
	    Generate text using Ollama with domain-specific parameters

	    Sends one non-streaming request to the local Ollama `/api/generate` endpoint.
	    Every failure mode (transport error, non-200 status, unparsable body, missing
	    text) is logged as a warning and reported to the caller as an empty string.

	    Arguments:
	    ----------
	        prompt     { str }   : Generation prompt

	        domain     { str }   : Domain name for parameter lookup; domains without an
	                               entry in GENERATION_PARAMS use the "default" entry

	        max_tokens { int }   : Maximum tokens to generate (sent as the `num_predict` option)

	        timeout    { float } : Seconds to wait for the HTTP response before giving up

	    Returns:
	    --------
	        { str } : Generated text with surrounding whitespace stripped (empty string if failed)
	    """
	    url = "http://localhost:11434/api/generate"
	    params = GENERATION_PARAMS.get(domain, GENERATION_PARAMS["default"])

	    payload = {
	        "model": MODEL_NAME,
	        "prompt": prompt,
	        "stream": False,
	        "options": {
	            "temperature": params["temperature"],
	            "top_p": params["top_p"],
	            "num_predict": max_tokens,
	        },
	    }

	    # Only the network call is guarded by the transport handler, so that
	    # "Request failed" really means the request failed (connection, timeout, ...).
	    try:
	        response = requests.post(url=url, json=payload, timeout=timeout)
	    except requests.RequestException as e:
	        logger.warning(f"Request failed: {e}")
	        return ""

	    if response.status_code != 200:
	        logger.warning(f"Ollama error: {response.status_code}")
	        return ""

	    # requests raises a ValueError subclass when the body is not valid JSON.
	    try:
	        result = response.json()
	    except ValueError as e:
	        logger.warning(f"Ollama returned a non-JSON body: {e}")
	        return ""

	    # A body without a "response" key yields "" exactly as before; anything that
	    # is not a string (null, wrong JSON shape) is reported instead of crashing on .strip().
	    text = result.get("response", "") if isinstance(result, dict) else None
	    if not isinstance(text, str):
	        logger.warning("Ollama returned no usable 'response' text")
	        return ""

	    return text.strip()


	def validate_generated_text(text: str, min_words: int = 100) -> bool:
	    """
	    Check that a generated text is non-empty and long enough

	    Arguments:
	    ----------
	        text      { str } : Generated text

	        min_words { int } : Minimum number of whitespace-separated words required

	    Returns:
	    --------
	        { bool } : True when text is non-empty and contains at least min_words words
	    """
	    # Empty (falsy) text is rejected outright, whatever min_words is.
	    has_content = bool(text)

	    return has_content and len(text.split()) >= min_words


	def main():
	    """
	    Generate AI texts for all domains

	    For every sub-directory of HUMAN_DIR whose name has an entry in PROMPTS, one
	    AI text is requested per human `.txt` sample found there. The human files only
	    determine how many texts are generated; their contents are not read.
	    Texts that pass validation are written to
	    AI_DIR / <domain> / ai_<domain>_<index>.txt; the others are counted as failed.
	    Domains are processed in sorted order so that runs and logs are reproducible.

	    Raises:
	    -------
	        { FileNotFoundError } : If HUMAN_DIR does not exist
	    """
	    banner = "=" * 70

	    print(banner)
	    print("TEXT-AUTH: Generating AI Data")
	    print(banner)
	    print(f"\nModel: {MODEL_NAME}")
	    print("Generation strategy: Domain-specific prompts with adaptive parameters\n")

	    # HUMAN_DIR is a relative path, so the usual cause of a miss is the working
	    # directory; say so instead of surfacing a bare errno message from iterdir().
	    if not HUMAN_DIR.exists():
	        raise FileNotFoundError(
	            f"Human data directory not found: {HUMAN_DIR} (resolved from {Path.cwd()})"
	        )

	    total_generated = 0
	    total_failed = 0

	    # iterdir() yields entries in arbitrary order; sort for a deterministic run.
	    for domain_dir in sorted(HUMAN_DIR.iterdir()):
	        if not domain_dir.is_dir():
	            continue

	        domain = domain_dir.name

	        if domain not in PROMPTS:
	            logger.warning(f"Skipping {domain}: no prompt defined")
	            continue

	        output_dir = AI_DIR / domain
	        output_dir.mkdir(parents=True, exist_ok=True)

	        # Only the number of human samples matters: one AI text per human text.
	        sample_count = len(list(domain_dir.glob("*.txt")))
	        prompt = PROMPTS[domain]
	        domain_success = 0
	        domain_failed = 0

	        logger.info(f"\n→ Generating for {domain} ({sample_count} samples)...")

	        for i in tqdm(range(sample_count), desc=f" {domain}"):
	            ai_text = generate_with_ollama(prompt=prompt, domain=domain)

	            # Validate generated text
	            if not validate_generated_text(ai_text, min_words=100):
	                logger.warning(f" Failed to generate valid text for {domain}_{i} (skipping)")
	                domain_failed += 1
	                total_failed += 1
	                continue

	            output_path = output_dir / f"ai_{domain}_{i}.txt"
	            output_path.write_text(ai_text, encoding="utf-8")

	            domain_success += 1
	            total_generated += 1

	        logger.info(f" {domain}: {domain_success} generated, {domain_failed} failed")

	    # Summary
	    print("\n" + banner)
	    print("Generation Summary")
	    print(banner)
	    print(f"Total generated: {total_generated}")
	    print(f"Total failed: {total_failed}")
	    print(banner)


	# Script entry point: run the generation only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()