| | import json |
| | import requests |
| | import os |
| | import FindEmailWorkFlowV2 |
| | import SendEmailWorkFlowV2 |
| | import setup |
| | from llm_client import get_client |
| | from pdf_utils import extract_text_from_pdf |
| |
|
| | |
# Hunter.io API key, read once at import time from the project's `setup`
# module; it is sent as the `api_key` query parameter of the domain-search
# request in enrichCompaniesWithHunter().
HUNTER_API_KEY = setup.HUNTER_API_KEY
| |
|
| |
|
def load_legacy_excel_emails(excel_path="Workflow Company Log.xlsx"):
    """
    Load previously used email addresses from the legacy Excel file.

    This function only reads the file; it never writes to it. Only the first
    column of the sheet is inspected and every non-empty cell is treated as a
    candidate address. A cell is kept when it contains a '.' and has both a
    non-empty local part and a non-empty domain around its last '@'.

    Any failure while reading (pandas or an Excel engine not installed,
    unreadable or malformed file) is reported on stdout and whatever was
    collected so far is returned, so a broken legacy file never aborts the
    caller.

    Args:
        excel_path: Path to the Excel file containing previously sent emails
            (default: "Workflow Company Log.xlsx")

    Returns:
        tuple: (set of email addresses, set of domains)
    """
    emails = set()
    domains = set()

    if not os.path.exists(excel_path):
        print(f"Legacy Excel file not found at '{excel_path}'. Skipping Excel import.")
        return emails, domains

    try:
        # Imported lazily so pandas is only needed when a legacy file exists.
        import pandas as pd

        # header=None: the first row is read as data, not as column names.
        df = pd.read_excel(excel_path, header=None)

        # A completely empty sheet has no column 0; treat it as "no emails"
        # instead of letting iloc raise and report a misleading read error.
        first_column = df.iloc[:, 0].dropna() if df.shape[1] else []

        for value in first_column:
            email = str(value).strip()

            # Split on the LAST '@' so the domain is always the trailing
            # part, and reject values such as "first.last@" that would
            # otherwise register an empty domain.
            local_part, at_sign, domain = email.rpartition('@')
            if not (at_sign and local_part and domain and '.' in email):
                continue

            emails.add(email)
            domains.add(domain)

        print(f"Loaded {len(emails)} emails from legacy Excel file ({len(domains)} unique domains)")

    except Exception as e:
        # Deliberately best-effort: legacy data is optional.
        print(f"Error reading Excel file '{excel_path}': {str(e)}")
        print("Continuing without legacy Excel data...")

    return emails, domains
| |
|
# Ordered (regex, example text) rules used to illustrate an industry in the
# prompt. The first rule whose pattern is found in the lower-cased industry
# wins. Short acronyms are matched as whole words so that e.g. "Retail" or
# "Sustainable Packaging" are not mistaken for AI/ML.
_INDUSTRY_EXAMPLE_RULES = (
    (r"clean tech|green",
     "(renewable energy, carbon capture, waste reduction, sustainable materials, etc.)"),
    (r"\b(?:ai|ml|genai)\b|artificial intelligence|machine learning",
     "(machine learning, artificial intelligence, natural language processing, computer vision, etc.)"),
    (r"fintech",
     "(payments, banking, investment platforms, cryptocurrency, financial software, etc.)"),
    (r"health",
     "(medical devices, health tech, biotech, telemedicine, health software, etc.)"),
    (r"saas",
     "(B2B software, enterprise tools, cloud platforms, productivity software, etc.)"),
)


def _industry_examples(industry):
    """Return the parenthesised example list that describes *industry* in the prompt."""
    import re

    lowered = industry.lower()
    for pattern, examples in _INDUSTRY_EXAMPLE_RULES:
        if re.search(pattern, lowered):
            return examples
    return f"({industry} related technologies and services)"


def askClaudeToFindCompanies(api_key=None, location="Atlanta", industry="Clean Tech", num_companies=5):
    """
    Ask the LLM client returned by get_client() for startup companies in a
    given location and industry.

    Only company names and domains are requested (no emails or contacts).
    The reply is expected to be a JSON array, optionally wrapped in a
    Markdown code fence, which is stripped before parsing.

    Args:
        api_key: Unused; accepted only so existing callers keep working
        location: City or region to search for companies (default: "Atlanta")
        industry: Industry type to target (default: "Clean Tech")
        num_companies: Number of companies to ask for (default: 5)

    Returns:
        list: List of dicts, each containing at least the keys
        company_name and domain

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the parsed reply is not a list of dicts that each
            contain 'company_name' and 'domain'.
    """
    client = get_client()

    industry_examples = _industry_examples(industry)

    prompt = (
        "Return only a valid JSON array of objects with exactly two fields: "
        "company_name, domain.\n\n"
        f"Find {num_companies} real, actively operating {industry} companies based in the {location} area. "
        "These should be companies you have high confidence actually exist.\n\n"

        "DOMAIN REQUIREMENTS:\n"
        "- Provide the company's primary website domain (e.g., 'acmesolar.com', NOT 'www.acmesolar.com' or 'https://acmesolar.com')\n"
        "- The domain should be the company's actual corporate domain\n"
        "- Do NOT include protocol (http/https) or subdomains (www)\n"
        "- ONLY include domains you are highly confident are correct\n"
        "- If you cannot find the correct domain for a company, SKIP IT entirely\n\n"

        "COMPANY REQUIREMENTS:\n"
        f"- Only include companies working in {industry} {industry_examples}\n"
        f"- Companies must be based in or have significant presence in {location}\n"
        "- **CRITICAL: ONLY include startups and early-stage companies (NOT established enterprises)**\n"
        "- Startups typically have more open internship opportunities and are more responsive\n"
        "- Focus on companies with 10-200 employees (smaller is better)\n"
        "- Prefer recently founded companies (last 10 years) that are actively growing\n"
        "- Only include companies you have high confidence are real and currently operating\n\n"

        "QUALITY OVER QUANTITY:\n"
        f"- It is better to return fewer than {num_companies} companies with REAL domains\n"
        f"- than to return {num_companies} companies with guessed or uncertain domains\n"
        "- Each entry should represent a real company you can verify exists\n\n"

        "OUTPUT FORMAT:\n"
        "- Output only valid JSON with no markdown, explanations, or commentary\n"
        "- Example: [{\"company_name\": \"Acme Solar\", \"domain\": \"acmesolar.com\"}]\n"
        "- Example: [{\"company_name\": \"Green Energy Solutions\", \"domain\": \"greenenergysolutions.com\"}]"
    )

    response_text = client.create_message(prompt, max_tokens=4096)

    # The model sometimes wraps its answer in a ```json ... ``` fence even
    # though the prompt forbids markdown; peel that off before parsing.
    cleaned_text = response_text.strip()
    if cleaned_text.startswith("```json"):
        cleaned_text = cleaned_text[7:]
    elif cleaned_text.startswith("```"):
        cleaned_text = cleaned_text[3:]
    if cleaned_text.endswith("```"):
        cleaned_text = cleaned_text[:-3]
    cleaned_text = cleaned_text.strip()

    try:
        companies = json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        # Log the offending text for debugging, then let the caller decide.
        print(f"Failed to parse JSON. Error: {e}")
        print(f"Cleaned text that failed to parse:\n{cleaned_text}")
        raise

    if not isinstance(companies, list):
        raise ValueError(f"Expected list of companies, but got {type(companies).__name__}")

    for i, item in enumerate(companies):
        if not isinstance(item, dict):
            raise ValueError(f"Expected dict at index {i}, but got {type(item).__name__}: {item}")

        if 'company_name' not in item:
            raise ValueError(f"Company at index {i} missing 'company_name' field: {item}")
        if 'domain' not in item:
            raise ValueError(f"Company at index {i} missing 'domain' field: {item}")

    return companies
| |
|
| |
|
_HUNTER_DOMAIN_SEARCH_URL = "https://api.hunter.io/v2/domain-search"

# Upper bound (seconds) for one Hunter request so a stalled connection
# cannot hang the whole workflow.
_HUNTER_TIMEOUT_SECONDS = 15

# Mailbox prefixes that are meant to receive applications.
_RESUME_SPECIFIC_ADDRESSES = (
    'resume', 'resumes', 'careers', 'jobs', 'hiring', 'intern', 'internships', 'talent',
)

# Role keywords looked for in the contact's position and mailbox name.
_ROLE_PRIORITY = ('hr', 'recruiting', 'talent', 'careers', 'people')


def _has_word_starting_with(text, prefix):
    """Return True if any alphanumeric word in *text* starts with *prefix*."""
    import re

    return any(word.startswith(prefix) for word in re.findall(r'[a-z0-9]+', text))


def _score_hunter_email(email_info):
    """Score one Hunter email record; a higher score is a better internship contact."""
    # Fields may be present with a JSON null, so `.get(key, default)` is not
    # enough: fall back explicitly when the value is None.
    position = (email_info.get('position') or '').lower()
    email = (email_info.get('value') or '').lower()
    local_part = email.split('@')[0]

    score = 0

    # +20 once for application mailboxes such as careers@ or jobs@.
    if local_part.startswith(_RESUME_SPECIFIC_ADDRESSES):
        score += 20

    # +10 per matching role keyword. Matching is done on word starts so that
    # 'hr' matches "HR Manager" or "hr-team@" but not "chris@".
    for role in _ROLE_PRIORITY:
        if _has_word_starting_with(position, role) or _has_word_starting_with(local_part, role):
            score += 10

    # +5 for a named person rather than a generic mailbox.
    if email_info.get('first_name') and email_info.get('last_name'):
        score += 5

    # +3 when Hunter reports the address as verified.
    if (email_info.get('verification') or {}).get('status') == 'valid':
        score += 3

    return score


def enrichCompaniesWithHunter(companies):
    """
    Look up one email contact per company through the Hunter.io
    domain-search endpoint.

    For every company the endpoint is queried for up to 10 addresses; the
    records are scored (application mailboxes, HR/recruiting roles, named
    people and verified addresses rank higher) and only the best one is kept.
    A company is skipped, with a message on stdout, when the request fails,
    when no usable address is returned, or when any other error occurs while
    processing it.

    Args:
        companies: List of dicts with keys: company_name, domain

    Returns:
        list: List of dicts with keys: company_name, contact_name, email_address
        (contact_name is None for generic mailboxes). At most one contact
        per company; companies without a usable address are omitted.
    """
    contacts = []

    for company in companies:
        company_name = company['company_name']
        domain = company['domain']

        print(f"Searching for contacts at {company_name} ({domain})...")

        try:
            params = {
                'domain': domain,
                'api_key': HUNTER_API_KEY,
                'limit': 10
            }

            response = requests.get(
                _HUNTER_DOMAIN_SEARCH_URL,
                params=params,
                timeout=_HUNTER_TIMEOUT_SECONDS,
            )
            response.raise_for_status()

            data = response.json()

            if 'data' not in data:
                print(f" No data returned for {company_name}")
                continue

            # Ignore records without an address: they cannot be contacted
            # and would otherwise yield a contact whose email is None.
            emails_data = [
                record
                for record in ((data['data'] or {}).get('emails') or [])
                if record.get('value')
            ]

            if not emails_data:
                print(f" No emails found for {company_name}")
                continue

            # max() returns the first record with the highest score, i.e.
            # ties keep Hunter's original ordering.
            email_info = max(emails_data, key=_score_hunter_email)

            email_address = email_info.get('value')
            first_name = email_info.get('first_name')
            last_name = email_info.get('last_name')

            if first_name and last_name:
                contact_name = f"{first_name} {last_name}"
            elif first_name:
                contact_name = first_name
            else:
                contact_name = None

            contacts.append({
                'company_name': company_name,
                'contact_name': contact_name,
                'email_address': email_address
            })
            print(f" Found: {contact_name or 'Generic email'} - {email_address}")

        except requests.exceptions.RequestException as e:
            # The exception text can include the request URL and therefore
            # the api_key query parameter; mask it before printing.
            message = str(e)
            if HUNTER_API_KEY:
                message = message.replace(str(HUNTER_API_KEY), '***')
            print(f" Error fetching data for {company_name}: {message}")
            continue
        except Exception as e:
            # Best-effort per company: one malformed response must not stop
            # the remaining lookups.
            print(f" Unexpected error for {company_name}: {e}")
            continue

    print(f"\nTotal contacts found: {len(contacts)}")
    return contacts
| |
|
| |
|
def createEmailsUsingClaude(contacts, resume_path, api_key=None, industry="Clean Tech", custom_message=""):
    """
    Generate a personalized email body for each contact with the LLM client
    returned by get_client().

    The resume text and the contact list are embedded in the prompt and the
    model is asked to return the contacts as JSON with an added 'email_body'.
    Only the generated body is taken from the reply: it is matched back to
    the input contacts by email address (case-insensitively), so an address
    altered or invented by the model is never passed on for sending.

    Args:
        contacts: List of contact dicts (company_name, contact_name,
            email_address) as produced by enrichCompaniesWithHunter
        resume_path: Path to PDF resume file
        api_key: Unused; accepted only so existing callers keep working
        industry: Industry type to tailor email content (default: "Clean Tech")
        custom_message: Optional custom message to incorporate into emails (default: "")

    Returns:
        list: The input contacts, in input order, each copied with an added
        'email_body' key. Contacts for which the model returned no body are
        left out. An empty input returns [] without calling the model.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If no text can be extracted from the resume, if the
            reply is not a list of dicts containing 'email_address' and
            'email_body', or if no returned body matches any input contact.
    """
    # Nothing to write: skip the resume read and the model call entirely.
    if not contacts:
        return []

    client = get_client()

    resume_text = extract_text_from_pdf(resume_path)
    if not resume_text:
        raise ValueError(f"Could not extract text from resume at {resume_path}")

    contact_text = json.dumps(contacts, indent=2)

    # Optional eighth rule; an empty string keeps the prompt layout unchanged.
    custom_rule = (
        f"8. Incorporates this specific message/requirement: {custom_message}"
        if custom_message else ""
    )

    prompt = (
        f"You are helping draft personalized internship outreach emails for companies in the {industry} industry. "
        f"For each company listed below, create a tailored email that:\n\n"
        f"1. References specific work or projects the company is doing in {industry}\n"
        f"2. Connects the applicant's background (found in the resume) to the company's mission\n"
        f"3. Sounds authentic, human, and genuinely interested (NOT AI-generated)\n"
        f"4. Is professional but warm and conversational\n"
        f"5. Asks for internship opportunities without being pushy\n\n"
        f"6. Keeps the email concise (150-200 words)\n\n"
        f"7. Does not fabricate any information about the company or the applicant\n\n"
        f"{custom_rule}\n\n"
        f"RESUME CONTENT:\n{resume_text}\n\n"
        f"Example email structure (adapt this based on the resume and each company):\n\n"
        f"Hi [Company Name Team],\n\n"
        f"I hope you're well. My name is [Name from resume], and I'm a [major/background from resume] student at [university from resume]. "
        f"I recently came across [Company Name]'s work on [specific project/technology in {industry}] and was fascinated by [specific technical aspect]. "
        f"I've spent time working on [relevant experience from resume], and I'd love to see how these skills might apply in a real-world, high-impact setting like yours. "
        f"My interest is to learn from experienced teams and contribute in any way I can, however small. "
        f"If there is a way for me to get involved with the technical side at [Company Name], I'd be grateful for the chance to discuss.\n\n"
        f"I've attached my resume for reference. Thank you very much for considering this note, and I appreciate any time or advice you can offer.\n\n"
        f"Best,\n[Name from resume]\n\n"
        f"IMPORTANT:\n"
        f"- Research each company and reference their actual work in {industry}\n"
        f"- Extract the applicant's name, university, and major from the resume\n"
        f"- Match skills from the resume to each company's focus area\n"
        f"- Make each email unique - no copy-paste language between companies\n"
        f"- Keep emails concise (150-200 words)\n\n"
        f"Company contacts:\n{contact_text}\n\n"
        f"Return a JSON array with the same contacts but add an 'email_body' field containing the tailored email body. "
        f"Do not include subject line or attachment information. Return only valid JSON with no additional text."
    )

    response_text = client.create_message(prompt, max_tokens=8000)

    # Strip an optional ```json ... ``` fence around the reply.
    cleaned_text = response_text.strip()
    if cleaned_text.startswith("```json"):
        cleaned_text = cleaned_text[7:]
    elif cleaned_text.startswith("```"):
        cleaned_text = cleaned_text[3:]
    if cleaned_text.endswith("```"):
        cleaned_text = cleaned_text[:-3]
    cleaned_text = cleaned_text.strip()

    try:
        generated = json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON. Error: {e}")
        print(f"Cleaned text that failed to parse:\n{cleaned_text}")
        raise

    if not isinstance(generated, list):
        raise ValueError(f"Expected list of contacts, but got {type(generated).__name__}")

    # Index the generated bodies by normalized address; the first body wins
    # if the model repeats an address.
    bodies_by_address = {}
    for i, item in enumerate(generated):
        if not isinstance(item, dict):
            raise ValueError(f"Expected dict at index {i}, but got {type(item).__name__}: {item}")

        if 'email_address' not in item:
            raise ValueError(f"Contact at index {i} missing 'email_address' field: {item}")
        if 'email_body' not in item:
            raise ValueError(f"Contact at index {i} missing 'email_body' field: {item}")

        address_key = str(item['email_address']).strip().lower()
        bodies_by_address.setdefault(address_key, item['email_body'])

    # Rebuild the result from the trusted input contacts so recipient data
    # (name, company, address) is never taken from the model's echo.
    emails_with_bodies = []
    requested_keys = set()
    for contact in contacts:
        address_key = str(contact.get('email_address', '')).strip().lower()
        requested_keys.add(address_key)
        body = bodies_by_address.get(address_key)
        if not body:
            print(f"No email body was generated for {contact.get('email_address')}; skipping this contact.")
            continue
        emails_with_bodies.append({**contact, 'email_body': body})

    unexpected = sorted(set(bodies_by_address) - requested_keys)
    if unexpected:
        print(f"Ignoring {len(unexpected)} generated email(s) for addresses that were not requested: {unexpected}")

    if not emails_with_bodies:
        raise ValueError("None of the generated emails matched the requested contacts")

    return emails_with_bodies
| |
|
| |
|
def main(
    sender_email,
    sender_password,
    user_id=None,
    user_emails_sent=None,
    user_domains_contacted=None,
    resume_path="Sumedh_Kothari_Resume.pdf",
    location="Atlanta",
    industry="Clean Tech",
    num_emails=5,
    custom_message="",
    progress_callback=None,
    max_attempts=10
):
    """
    Execute the complete workflow from contact discovery to email sending.

    Steps: merge the caller's history with the legacy Excel log, then loop
    (company search -> Hunter lookup -> de-duplication) until `num_emails`
    new contacts are collected or `max_attempts` is reached, generate one
    email per contact and hand them to SendEmailWorkFlowV2.main.

    Args:
        sender_email: User's email address, passed to the sending workflow
        sender_password: User's email password or app-specific password
        user_id: Not used by this function; accepted for caller compatibility
        user_emails_sent: Iterable of emails already sent by this user (optional)
        user_domains_contacted: Iterable of domains already contacted by this user (optional)
        resume_path: Path to resume PDF file (default: "Sumedh_Kothari_Resume.pdf")
        location: City or region to search for companies (default: "Atlanta")
        industry: Industry type to target (default: "Clean Tech")
        num_emails: Number of unique emails to send (default: 5)
        custom_message: Optional custom message to include in emails (default: "")
        progress_callback: Optional callback function for progress updates (default: None)
            Signature: callback(message, msg_type='in-progress', count=None)
        max_attempts: Maximum number of search attempts (default: 10)

    Returns:
        dict: Keys 'successful', 'failed', 'total' (taken from the sending
        workflow's 'success' / 'failure' / 'total' entries), 'emails_sent'
        (addresses handed to the sender) and 'contacts_data' (the generated
        email records). All counts are 0 and both lists are empty when no
        new contact could be found.
    """
    # Copy into fresh sets: accepts any iterable (list, tuple, set) and
    # guarantees the caller's collections are never modified.
    user_emails_sent = set(user_emails_sent or ())
    user_domains_contacted = set(user_domains_contacted or ())

    # Legacy addresses from the Excel log count as "already contacted".
    excel_emails, excel_domains = load_legacy_excel_emails()

    # Remember the size before merging so the log stays correct even when
    # the two sources overlap.
    database_email_count = len(user_emails_sent)
    user_emails_sent |= excel_emails
    user_domains_contacted |= excel_domains

    if excel_emails:
        print(f"Merged {len(excel_emails)} legacy emails from Excel with {database_email_count} database emails")
        print(f"Total emails to avoid: {len(user_emails_sent)}")
        print(f"Total domains to avoid: {len(user_domains_contacted)}")

    def progress(msg, msg_type='in-progress', count=None):
        """Print a status line and forward it to the optional callback."""
        print(msg)
        if progress_callback:
            progress_callback(msg, msg_type, count)

    all_unique_contacts = []
    session_emails = set()     # addresses accepted during this run
    session_domains = set()    # domains accepted during this run
    attempt = 0
    # Always ask for at least 5 companies: many are lost to missing contacts
    # or de-duplication.
    batch_size = max(5, num_emails)

    progress(f"Starting search for {num_emails} unique {industry} contacts in {location}...", 'in-progress')

    while len(all_unique_contacts) < num_emails and attempt < max_attempts:
        attempt += 1
        progress(f"Search attempt {attempt}/{max_attempts} (found {len(all_unique_contacts)}/{num_emails} unique contacts so far)...", 'in-progress')

        progress(f"Searching for {batch_size} {industry} companies...", 'in-progress')
        try:
            companies = askClaudeToFindCompanies(location=location, industry=industry, num_companies=batch_size)
        except ValueError as e:
            # Covers json.JSONDecodeError too (a ValueError subclass): one
            # malformed model reply should cost an attempt, not the run.
            progress(f"Could not read the company list for this attempt: {e}", 'error')
            continue
        progress(f"Found {len(companies)} companies", 'success')

        if len(companies) == 0:
            progress("No companies found in this batch", 'error')
            break

        progress(f"Finding email contacts for {len(companies)} companies...", 'in-progress')
        contacts = enrichCompaniesWithHunter(companies)
        progress(f"Found {len(contacts)} email contacts", 'success')

        if len(contacts) == 0:
            progress("No email contacts found in this batch", 'error')
            continue

        progress("Removing duplicates and previously contacted companies...", 'in-progress')

        # Exclude both the historical addresses and those already accepted
        # earlier in this run.
        combined_emails = user_emails_sent | session_emails
        combined_domains = user_domains_contacted | session_domains

        cleaned_contacts = FindEmailWorkFlowV2.main(contacts, combined_emails, combined_domains)
        progress(f"After deduplication: {len(cleaned_contacts)} new contacts in this batch", 'success')

        if len(cleaned_contacts) == 0:
            progress("No new unique contacts in this batch (all were duplicates)", 'in-progress')
            continue

        for contact in cleaned_contacts:
            if len(all_unique_contacts) >= num_emails:
                break

            # A contact without a well-formed address cannot be emailed.
            address = contact.get('email_address') or ''
            domain = address.rpartition('@')[2] if '@' in address else ''
            if not domain:
                continue

            all_unique_contacts.append(contact)
            session_emails.add(address)
            session_domains.add(domain)

        progress(f"Total unique contacts collected: {len(all_unique_contacts)}/{num_emails}", 'in-progress')

        if len(all_unique_contacts) >= num_emails:
            progress(f"Successfully found {len(all_unique_contacts)} unique contacts!", 'success')
            break

    if len(all_unique_contacts) == 0:
        progress("Unable to find any new unique contacts. All available contacts have already been contacted or no contacts were found.", 'error')
        # Same keys as the normal return so callers can read either result.
        return {"successful": 0, "failed": 0, "total": 0, "emails_sent": [], "contacts_data": []}

    if len(all_unique_contacts) < num_emails:
        progress(f"Could only find {len(all_unique_contacts)} unique contacts after {attempt} attempts. All other available contacts in {location} {industry} have already been contacted or could not be found.", 'error')

    final_contacts = all_unique_contacts[:num_emails]

    progress(f"Generating {len(final_contacts)} personalized emails using AI...", 'in-progress')
    emails_with_bodies = createEmailsUsingClaude(final_contacts, resume_path, industry=industry, custom_message=custom_message)
    progress(f"Created {len(emails_with_bodies)} personalized emails", 'success')

    progress("Sending emails (this may take a few minutes)...", 'in-progress')
    results = SendEmailWorkFlowV2.main(
        emails_with_bodies,
        resume_path,
        sender_email,
        sender_password
    )

    # Report the addresses that were actually handed to the sender, which
    # can differ from the planned list if email generation dropped a contact.
    # NOTE(review): this still includes addresses whose send failed; the
    # per-address outcome is not visible here - confirm against
    # SendEmailWorkFlowV2.main's return value.
    emails_sent = [item['email_address'] for item in emails_with_bodies]

    final_results = {
        "successful": results.get("success", 0),
        "failed": results.get("failure", 0),
        "total": results.get("total", 0),
        "emails_sent": emails_sent,
        "contacts_data": emails_with_bodies
    }

    progress(f"Emails sent successfully: {final_results['successful']}/{final_results['total']}", 'success', final_results['successful'])
    return final_results
| |
|
| |
|
if __name__ == "__main__":
    # Running the module directly does no work: main() needs credentials and
    # other arguments, so just print how it is meant to be used.
    for usage_line in (
        "This script should be run through the web application.",
        "To test standalone, call main() with required parameters.",
    ):
        print(usage_line)
| |
|