| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
# BASE_DIR is the parent of the directory that contains this file.
_THIS_FILE = Path(__file__).resolve()
BASE_DIR = _THIS_FILE.parent.parent

# Both outputs live under <BASE_DIR>/data: a directory holding one JSONL file
# per training split, and a single JSONL file for the benchmark rows.
_DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = _DATA_DIR / "decision_phase_difficulty"
BENCHMARK_PATH = _DATA_DIR / "decision_phase_benchmark.jsonl"
|
|
def _theme(
    *,
    topic: str,
    product: str,
    products: str,
    provider_a: str,
    provider_b: str,
    domain: str,
    goal: str,
    support_object: str,
    support_detail: str,
    asset: str,
) -> dict[str, str]:
    """Build one theme spec: the values substituted into template placeholders.

    All fields are keyword-only and required, so a theme that forgets a
    placeholder value fails at import time instead of while rendering.
    """
    return {
        "topic": topic,
        "product": product,
        "products": products,
        "provider_a": provider_a,
        "provider_b": provider_b,
        "domain": domain,
        "goal": goal,
        "support_object": support_object,
        "support_detail": support_detail,
        "asset": asset,
    }


# Themes used for the train/val/test splits: every template that contains a
# placeholder is rendered once per theme.
TRAIN_THEMES = (
    _theme(
        topic="crm software",
        product="CRM",
        products="CRM tools",
        provider_a="HubSpot",
        provider_b="Zoho",
        domain="a small sales team",
        goal="manage leads better",
        support_object="account",
        support_detail="password reset",
        asset="CRM onboarding guide",
    ),
    _theme(
        topic="analytics software",
        product="analytics platform",
        products="analytics platforms",
        provider_a="Mixpanel",
        provider_b="Amplitude",
        domain="a product team",
        goal="measure activation",
        support_object="dashboard",
        support_detail="data sync issue",
        asset="analytics setup guide",
    ),
    _theme(
        topic="laptops",
        product="laptop",
        products="laptops",
        provider_a="MacBook Air",
        provider_b="Dell XPS 13",
        domain="college work",
        goal="choose the right laptop",
        support_object="order",
        support_detail="delivery delay",
        asset="laptop buying checklist",
    ),
)


# Themes used for the benchmark file, keyed by difficulty: each difficulty
# level is rendered with exactly one theme, none of which appear in
# TRAIN_THEMES.
BENCHMARK_THEMES = {
    "easy": _theme(
        topic="help desk software",
        product="help desk platform",
        products="help desk tools",
        provider_a="Zendesk",
        provider_b="Freshdesk",
        domain="a support team",
        goal="handle tickets faster",
        support_object="billing portal",
        support_detail="invoice issue",
        asset="help desk buyer guide",
    ),
    "medium": _theme(
        topic="cars",
        product="car",
        products="cars",
        provider_a="Toyota Corolla",
        provider_b="Honda Civic",
        domain="daily commuting",
        goal="choose the right car",
        support_object="reservation",
        support_detail="test drive booking",
        asset="car buying worksheet",
    ),
    "hard": _theme(
        topic="hosting platforms",
        product="hosting platform",
        products="hosting providers",
        provider_a="Vercel",
        provider_b="Netlify",
        domain="a startup launch",
        goal="ship a new website",
        support_object="deployment",
        support_detail="domain setup problem",
        asset="hosting migration guide",
    ),
}
|
|
# Query templates, one private table per decision phase. Each table maps a
# difficulty level ("easy" / "medium" / "hard") to a tuple of str.format-style
# templates whose placeholders are the keys of a theme spec. Templates without
# any placeholder are emitted verbatim.

_AWARENESS_TEMPLATES = {
    "easy": (
        "What is {topic}?",
        "Explain {topic}.",
        "How does {product} work?",
        "What does {provider_a} do?",
        "Give me the basics of {topic}.",
    ),
    "medium": (
        "Help me understand what problem {provider_a} solves.",
        "What should a beginner know about {products}?",
        "Before I look at options, what is a {product}?",
        "What is the purpose of {topic} in {domain}?",
        "What does a {product} actually help with?",
    ),
    "hard": (
        "I am not shopping yet, I just want to understand what {topic} is.",
        "Before I evaluate anything, what role does {topic} play in {domain}?",
        "I keep hearing about {provider_a}; what is it actually for?",
        "Can you clarify what people mean by {topic} in practice?",
        "I only need an overview of {topic} right now.",
    ),
}

_RESEARCH_TEMPLATES = {
    "easy": (
        "What {products} should I explore for {domain}?",
        "Show me options to consider for {goal}.",
        "What tools should I look at for {domain}?",
        "Help me research {products}.",
        "Where should I start with {products}?",
    ),
    "medium": (
        "I am early in the process and want to explore {products}.",
        "Give me a shortlist of {products} worth researching.",
        "What directions should I investigate for {goal}?",
        "What categories should I look at before narrowing down?",
        "What are some promising {products} for {domain}?",
    ),
    "hard": (
        "I am not ready to compare vendors yet, just help me scope the market.",
        "What should I research first if I am only beginning to look at {products}?",
        "I need a landscape view before I make a shortlist.",
        "What are the main options in this space before I decide anything?",
        "Help me map the market for {products} without recommending one yet.",
    ),
}

_CONSIDERATION_TEMPLATES = {
    "easy": (
        "Best {product} for {domain}.",
        "{provider_a} vs {provider_b}.",
        "Compare {products} for {goal}.",
        "Which {product} looks best for {domain}?",
        "What are some {products} worth considering?",
    ),
    "medium": (
        "Compare {provider_a} and {provider_b} for {goal}.",
        "What are the pros and cons of {provider_a}?",
        "Help me evaluate the best {products} for {domain}.",
        "Which {product} seems worth considering right now?",
        "I am comparing options for {goal}; what should be on the shortlist?",
    ),
    "hard": (
        "I am past basic research and now weighing tradeoffs between {provider_a} and {provider_b}.",
        "I want to compare serious options before committing to one.",
        "Help me think through the tradeoffs in the current shortlist.",
        "What looks strongest if I am narrowing down to a few options?",
        "I have done research, now help me compare the finalists.",
    ),
}

_DECISION_TEMPLATES = {
    "easy": (
        "Which {product} should I choose?",
        "Should I pick {provider_a} or {provider_b}?",
        "Which option should I commit to?",
        "What is the best fit for me right now?",
        "Which plan should I choose today?",
    ),
    "medium": (
        "I am ready to decide between {provider_a} and {provider_b}.",
        "Help me pick the final option for {goal}.",
        "Which {product} should I commit to this week?",
        "I need to make the call now; which option fits best?",
        "What should I choose if I need to decide today?",
    ),
    "hard": (
        "I have a shortlist and need to commit to one vendor now.",
        "I am at the point of commitment and need a final recommendation.",
        "Which option should we sign off on before next week?",
        "I have enough information; tell me which one to go with.",
        "I need the final pick, not another round of comparison.",
    ),
}

_ACTION_TEMPLATES = {
    "easy": (
        "Start my free trial.",
        "Book a demo with {provider_a}.",
        "Create my account.",
        "Buy {provider_a} now.",
        "Download the {asset}.",
    ),
    "medium": (
        "Take me to checkout for {provider_a}.",
        "Get me signed up for {provider_a}.",
        "Reserve my spot with {provider_a}.",
        "I want to purchase {provider_a} today.",
        "Send me the download link for the {asset}.",
    ),
    "hard": (
        "I am ready to move forward now, where do I start the purchase?",
        "Help me complete the signup flow for {provider_a}.",
        "I want to act on this immediately and get access now.",
        "Can you help me finish the order for {provider_a}?",
        "I have decided, now let me complete the next step.",
    ),
}

_POST_PURCHASE_TEMPLATES = {
    "easy": (
        "How do I set up my new {product}?",
        "Show me how to import contacts into {provider_a}.",
        "How do I onboard my team after purchase?",
        "What should I enable first after signup?",
        "How do I configure my account now that I signed up?",
    ),
    "medium": (
        "We already subscribed; how do we get value quickly?",
        "What is the best way to roll this out after purchase?",
        "Help me configure {provider_a} now that we bought it.",
        "How do I invite teammates after signing up?",
        "What should I do first after we activate the plan?",
    ),
    "hard": (
        "We already made the purchase, now I need guidance on rollout and setup.",
        "This is not a buying decision anymore; I need post-purchase onboarding help.",
        "I need adoption guidance now that the contract is signed.",
        "What is the right onboarding sequence after we commit to {provider_a}?",
        "We are past checkout and need implementation help.",
    ),
}

_SUPPORT_TEMPLATES = {
    "easy": (
        "I cannot log into my {support_object}.",
        "How do I reset my password?",
        "My invoice is wrong.",
        "The integration keeps failing.",
        "Our dashboard is not loading.",
    ),
    "medium": (
        "Can you help me fix a {support_detail}?",
        "I am stuck because my {support_object} keeps breaking.",
        "My password reset link is not working.",
        "I need support with my {support_object}.",
        "Why is {provider_a} not syncing correctly?",
    ),
    "hard": (
        "I am not evaluating anything, I just need this issue fixed.",
        "This is a live support problem, not a buying question.",
        "Please help me resolve a problem with my existing account.",
        "I cannot continue because something is broken in my setup.",
        "I need troubleshooting help, not recommendations.",
    ),
}

# phase -> difficulty -> templates. Insertion order matters: the builders
# iterate this mapping directly, so it fixes the order of the generated rows.
PHASE_TEMPLATES = {
    "awareness": _AWARENESS_TEMPLATES,
    "research": _RESEARCH_TEMPLATES,
    "consideration": _CONSIDERATION_TEMPLATES,
    "decision": _DECISION_TEMPLATES,
    "action": _ACTION_TEMPLATES,
    "post_purchase": _POST_PURCHASE_TEMPLATES,
    "support": _SUPPORT_TEMPLATES,
}
|
|
|
|
def split_for_index(index: int) -> str:
    """Return the split name for a running example index.

    Indices are assigned in a repeating cycle of five: positions 0-2 map to
    ``"train"``, position 3 to ``"val"`` and position 4 to ``"test"``.
    """
    cycle = ("train", "train", "train", "val", "test")
    return cycle[index % len(cycle)]
|
|
|
|
def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as JSON Lines, replacing any existing file.

    Missing parent directories are created first. Each row becomes one line
    of JSON with its keys sorted, so the output is stable for a given input.
    The file is encoded as UTF-8.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = (json.dumps(row, sort_keys=True) + "\n" for row in rows)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(lines)
|
|
|
|
def render_text(template: str, spec: dict[str, str]) -> str:
    """Fill *template*'s ``{placeholder}`` fields from *spec* and trim the result.

    Raises:
        KeyError: if the template names a placeholder that *spec* lacks.
    """
    filled = template.format(**spec)
    return filled.strip()
|
|
|
|
def build_training_splits() -> dict[str, list[dict]]:
    """Render every template into labelled rows and deal them into splits.

    Templates containing a placeholder are rendered once per entry in
    ``TRAIN_THEMES``; placeholder-free templates are rendered once as-is.
    Rendered texts are de-duplicated case-insensitively across the whole
    run, so only the first occurrence of a text is kept. Within each
    (phase, difficulty) bucket, the kept rows are assigned to splits by
    ``split_for_index`` in the order they are produced.

    Returns:
        A mapping with keys ``"train"``, ``"val"`` and ``"test"``, each
        holding a list of row dicts with ``text``, ``decision_phase``,
        ``difficulty`` and ``source`` fields.
    """
    splits: dict[str, list[dict]] = {"train": [], "val": [], "test": []}
    emitted: set[str] = set()

    for phase, by_difficulty in PHASE_TEMPLATES.items():
        for difficulty, templates in by_difficulty.items():
            # Every (phase, difficulty) pair is visited exactly once, so a
            # counter local to this bucket is all the bookkeeping needed.
            position = 0
            for template in templates:
                # Without placeholders every theme would render the same
                # text, so a single empty spec is enough.
                specs = ({},) if "{" not in template else TRAIN_THEMES
                for spec in specs:
                    text = render_text(template, spec)
                    dedup_key = text.lower()
                    if dedup_key in emitted:
                        continue
                    emitted.add(dedup_key)

                    row = {
                        "text": text,
                        "decision_phase": phase,
                        "difficulty": difficulty,
                        "source": "synthetic_decision_phase_difficulty",
                    }
                    splits[split_for_index(position)].append(row)
                    position += 1

    return splits
|
|
|
|
def build_benchmark_rows() -> list[dict]:
    """Render every template with the benchmark theme for its difficulty.

    Each difficulty level uses the single theme stored under that key in
    ``BENCHMARK_THEMES`` (an empty spec if the key is absent). Rendered
    texts are de-duplicated case-insensitively; the first occurrence wins.

    NOTE(review): templates with no placeholder render to the same text here
    as in ``build_training_splits``, so those benchmark rows also appear in
    the training splits -- confirm that overlap is intended.

    Returns:
        Row dicts with ``text``, ``decision_phase``, ``difficulty`` and
        ``source`` fields, in template order.
    """
    rows: list[dict] = []
    emitted: set[str] = set()

    for phase, by_difficulty in PHASE_TEMPLATES.items():
        for difficulty, templates in by_difficulty.items():
            theme = BENCHMARK_THEMES.get(difficulty, {})
            for template in templates:
                text = render_text(template, theme)
                dedup_key = text.lower()
                if dedup_key not in emitted:
                    emitted.add(dedup_key)
                    rows.append(
                        {
                            "text": text,
                            "decision_phase": phase,
                            "difficulty": difficulty,
                            "source": "decision_phase_benchmark",
                        }
                    )

    return rows
|
|
|
|
def main() -> None:
    """Generate the split files and the benchmark file, reporting row counts.

    One ``<split>.jsonl`` file per training split is written under
    ``OUTPUT_DIR``, then the benchmark rows are written to
    ``BENCHMARK_PATH``. A ``<name>: <count> rows`` line is printed after
    each file is written.
    """
    for split_name, rows in build_training_splits().items():
        destination = OUTPUT_DIR / f"{split_name}.jsonl"
        write_jsonl(destination, rows)
        print(f"{split_name}: {len(rows)} rows")

    benchmark = build_benchmark_rows()
    write_jsonl(BENCHMARK_PATH, benchmark)
    print(f"benchmark: {len(benchmark)} rows")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|