| | |
| | import json |
| | import requests |
| | from tqdm import tqdm |
| | from pathlib import Path |
| | from loguru import logger |
| |
|
| |
|
| |
|
| | |
# Ollama model tag sent in the "model" field of every generation request
MODEL_NAME = "mistral:7b"

# Input tree: one sub-directory per domain, scanned for *.txt samples
HUMAN_DIR = Path("evaluation", "human")

# Output tree: generated texts are written under <AI_DIR>/<domain>/
AI_DIR = Path("evaluation", "ai_generated")
| |
|
| | |
# Sampling options per domain. generate_with_ollama() reads "temperature" and
# "top_p" from the matching entry; domains without an entry use "default".
GENERATION_PARAMS = {
    "academic": {"temperature": 0.5, "top_p": 0.85},
    "creative": {"temperature": 0.9, "top_p": 0.95},
    "legal": {"temperature": 0.3, "top_p": 0.80},
    "social_media": {"temperature": 0.8, "top_p": 0.95},
    "blog_personal": {"temperature": 0.8, "top_p": 0.95},
    "marketing": {"temperature": 0.7, "top_p": 0.90},
    "journalism": {"temperature": 0.6, "top_p": 0.85},
    "default": {"temperature": 0.7, "top_p": 0.9},
}
| |
|
| | |
# One fixed generation prompt per domain. main() skips any domain directory
# whose name is not a key here.
PROMPTS = {
    "general": "Write a comprehensive 500-1000 word informative article explaining a common topic that would appear in an encyclopedia. Use clear, neutral language with proper structure.",
    "academic": "Write a formal research abstract (500-1000 words) for a scientific study. Include: background context, research methodology, key findings, and implications. Use academic vocabulary and formal tone appropriate for peer-reviewed publication.",
    "creative": "Write a creative narrative passage (500-1000 words) with vivid descriptions, engaging storytelling, and literary devices. Focus on character, setting, or emotion with rich sensory details.",
    "ai_ml": "Write a technical explanation (500-1000 words) of a machine learning concept or recent AI advancement. Include mathematical intuition, practical applications, and current research directions. Use precise technical terminology.",
    "software_dev": "Write developer documentation (500-1000 words) explaining the implementation of a software design pattern or architectural principle. Include code examples, use cases, trade-offs, and best practices for professional developers.",
    "technical_doc": "Write comprehensive API documentation (500-1000 words) for a REST endpoint. Include: endpoint description, request/response parameters with data types, authentication requirements, example requests, error codes, and usage notes. Use Markdown formatting.",
    "engineering": "Write an engineering technical report excerpt (500-1000 words) analyzing a system design or technical solution. Include specifications, performance analysis, design constraints, and recommendations.",
    "science": "Write a scientific explanation (500-1000 words) of a natural phenomenon or research finding. Include underlying mechanisms, experimental evidence, and real-world implications. Use precise scientific terminology.",
    "business": "Write a professional business analysis (500-1000 words) covering market trends, competitive landscape, or strategic insights. Use business terminology, data-driven arguments, and executive-level language.",
    "legal": "Draft a formal legal document excerpt (500-1000 words) such as a contract clause, terms of service, or policy statement. Use precise legal terminology, proper structure, and formal language appropriate for legal documents.",
    "medical": "Write a clinical case description or medical research abstract (500-1000 words) with appropriate medical terminology. Include patient presentation, diagnostic approach, treatment, and outcomes or research methodology and findings.",
    "journalism": "Write a journalistic news article (500-1000 words) in neutral reporting style. Include a compelling lead, factual reporting, quotes from sources, and balanced coverage. Follow AP style conventions.",
    "marketing": "Write persuasive marketing content (500-1000 words) for a technology product or service. Include compelling value propositions, benefit-focused copy, clear calls to action, and engaging language that converts readers.",
    "social_media": "Write 5-7 engaging social media posts (500-1000 words total) discussing a technology trend. Use informal conversational tone, include relevant hashtags, emojis where appropriate, and encourage engagement. Mix different post types.",
    "blog_personal": "Write a personal blog post (500-1000 words) sharing personal experiences, opinions, or reflections on a topic. Use first-person perspective, informal conversational tone, and authentic voice.",
    "tutorial": "Write a comprehensive step-by-step tutorial (500-1000 words) teaching beginners how to accomplish a specific technical task. Use clear numbered steps, explanatory notes, common pitfalls, and helpful tips.",
}
| |
|
| |
|
def generate_with_ollama(prompt: str, domain: str, max_tokens: int = 600) -> str:
    """
    Request one non-streamed completion from the local Ollama server

    Sampling settings (temperature / top_p) are taken from the GENERATION_PARAMS
    entry for `domain`; domains without an entry use the "default" entry.

    Arguments:
    ----------
        prompt     { str } : Text sent to the model as the generation prompt

        domain     { str } : Domain key used to select the sampling parameters

        max_tokens { int } : Value forwarded to Ollama as `num_predict`

    Returns:
    --------
             { str }       : Stripped model output; empty string on a non-200
                             status or when any exception is raised
    """
    sampling = GENERATION_PARAMS.get(domain, GENERATION_PARAMS["default"])

    options = {
        "temperature": sampling["temperature"],
        "top_p": sampling["top_p"],
        "num_predict": max_tokens,
    }
    request_body = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": options,
    }

    # Best-effort call: every failure is logged and reported as "" so the
    # caller can count the sample as failed and carry on.
    try:
        reply = requests.post(
            url="http://localhost:11434/api/generate",
            json=request_body,
            timeout=60,
        )

        if reply.status_code != 200:
            logger.warning(f"Ollama error: {reply.status_code}")
            return ""

        return reply.json().get("response", "").strip()

    except Exception as e:
        logger.warning(f"Request failed: {e}")
        return ""
| |
|
| |
|
def validate_generated_text(text: str, min_words: int = 100) -> bool:
    """
    Check that generated text is non-empty and long enough

    Arguments:
    ----------
        text      { str } : Generated text to check

        min_words { int } : Smallest acceptable number of whitespace-separated words

    Returns:
    --------
            { bool }      : True when `text` is truthy and has at least `min_words` words
    """
    # Falsy input (empty string, None) is rejected before any counting
    if text:
        words = text.split()
        return len(words) >= min_words

    return False
| |
|
| |
|
def main():
    """
    Generate AI texts for all domains

    For every sub-directory of HUMAN_DIR whose name has an entry in PROMPTS,
    one generation is attempted per `*.txt` file found there. The human files
    are only counted, never read: each attempt sends the same domain prompt.
    Outputs accepted by validate_generated_text (>= 100 words) are written to
    AI_DIR/<domain>/ai_<domain>_<i>.txt, where <i> is the attempt index, so
    failed attempts leave gaps in the numbering. Per-domain counts are logged
    and overall totals are printed at the end.

    If HUMAN_DIR does not exist, an error is logged and the function returns
    without generating anything.
    """
    banner = "=" * 70

    print(banner)
    print("TEXT-AUTH: Generating AI Data")
    print(banner)
    print(f"\nModel: {MODEL_NAME}")
    print("Generation strategy: Domain-specific prompts with adaptive parameters\n")

    # Without this guard iterdir() raises FileNotFoundError and the script
    # dies with a raw traceback.
    if not HUMAN_DIR.is_dir():
        logger.error(f"Human data directory not found: {HUMAN_DIR} (nothing to generate)")
        return

    total_generated = 0
    total_failed = 0

    # iterdir() order is platform-dependent; sorting keeps runs reproducible
    for domain_dir in sorted(HUMAN_DIR.iterdir()):
        if not domain_dir.is_dir():
            continue

        domain = domain_dir.name
        prompt = PROMPTS.get(domain)

        if prompt is None:
            logger.warning(f"Skipping {domain}: no prompt defined")
            continue

        output_dir = AI_DIR / domain
        output_dir.mkdir(parents=True, exist_ok=True)

        # Only the number of human samples matters: it sets how many AI
        # samples are attempted for this domain.
        sample_count = sum(1 for _ in domain_dir.glob("*.txt"))
        domain_success = 0
        domain_failed = 0

        logger.info(f"\n→ Generating for {domain} ({sample_count} samples)...")

        for i in tqdm(range(sample_count), desc=f" {domain}"):
            ai_text = generate_with_ollama(prompt=prompt, domain=domain)

            if not validate_generated_text(ai_text, min_words=100):
                logger.warning(f" Failed to generate valid text for {domain}_{i} (skipping)")
                domain_failed += 1
                total_failed += 1
                continue

            output_path = output_dir / f"ai_{domain}_{i}.txt"
            output_path.write_text(ai_text, encoding="utf-8")

            domain_success += 1
            total_generated += 1

        logger.info(f" {domain}: {domain_success} generated, {domain_failed} failed")

    print("\n" + banner)
    print("Generation Summary")
    print(banner)
    print(f"Total generated: {total_generated}")
    print(f"Total failed: {total_failed}")
    print(banner)
| |
|
| |
|
| | |
# Run the generation pipeline only when executed as a script, not on import
if __name__ == "__main__":
    main()