| import puppeteer, { ElementHandle } from "puppeteer"; |
| import { Business } from "@/types"; |
| import { db } from "@/db"; |
| import { businesses, scrapingJobs } from "@/db/schema"; |
| import { eq } from "drizzle-orm"; |
|
|
/**
 * Input accepted by the Google Maps scraper.
 */
interface ScrapingOptions {
  /** Search terms; each one is searched separately and combined with `location`. */
  keywords: string[];

  /** Free-text place appended to every keyword. The scraper falls back to "United States" when omitted. */
  location?: string;

  /** Upper bound on the number of businesses returned. The scraper falls back to 20 when omitted. */
  limit?: number;
}
|
|
| |
| |
| |
| |
/**
 * Scrapes Google Maps search results and turns them into business leads.
 *
 * For every keyword the function opens `https://www.google.com/maps/search/<keyword location>`
 * in a headless browser, scrolls the result feed, reads the visible result cards and then
 * opens each business to pick up its website and phone number. The email is taken from the
 * business website when there is one, otherwise it is guessed from the business name.
 *
 * Failure handling:
 * - an error while processing a single business is logged and a reduced record (list data plus
 *   a guessed email) is stored instead;
 * - an error while processing a keyword is logged and the next keyword is tried;
 * - an error while launching or preparing the browser is logged and rethrown.
 *
 * @param options Keywords to search, plus optional location (default "United States") and
 *   result limit (default 20). The limit applies to the total across all keywords.
 * @param userId Owner id copied onto every returned record.
 * @returns At most `limit` records; empty when there are no keywords or `limit <= 0`.
 * @throws Whatever the browser launch / page setup throws.
 */
export async function scrapeGoogleMapsReal(
  options: ScrapingOptions,
  userId: string
): Promise<Partial<Business>[]> {
  const { keywords, location = "United States", limit = 20 } = options;
  const results: Partial<Business>[] = [];

  // Nothing can be collected, so do not pay for a browser launch.
  if (keywords.length === 0 || limit <= 0) {
    return results;
  }

  const pause = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });

    const page = await browser.newPage();

    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    );

    for (const keyword of keywords) {
      try {
        const searchUrl = `https://www.google.com/maps/search/${encodeURIComponent(
          keyword + " " + location
        )}`;

        await page.goto(searchUrl, { waitUntil: "networkidle2", timeout: 30000 });
        await page.waitForSelector('div[role="feed"]', { timeout: 10000 });

        // Scroll the feed so more result cards are loaded before reading them.
        await autoScroll(page);

        // Runs inside the page: must not reference anything from the Node side.
        const businessData = await page.evaluate(() => {
          const items = Array.from(
            document.querySelectorAll('div[role="feed"] > div > div[role="article"]')
          );

          return items.map((item) => {
            const titleEl = item.querySelector('div.fontHeadlineSmall');
            const ratingEl = item.querySelector('span[role="img"]');
            const reviewsEl = item.querySelector('span[role="img"] + span');
            const addressEl = item.querySelector('div.fontBodyMedium > div:nth-child(2) > div:nth-child(2)');
            const descriptionEl = item.querySelector('div.fontBodyMedium span');
            const imageEl = item.querySelector('img');

            const name = titleEl?.textContent?.trim() || "";
            const ratingText = ratingEl?.getAttribute("aria-label") || "";
            const rating = parseFloat(ratingText.match(/[\d.]+/)?.[0] || "0");
            const reviewsText = reviewsEl?.textContent?.trim() || "0";
            const reviewCount = parseInt(reviewsText.replace(/[^\d]/g, "") || "0", 10);
            const address = addressEl?.textContent?.trim() || "";
            const description = descriptionEl?.textContent?.trim() || "";
            const imageUrl = imageEl?.getAttribute("src") || "";

            return { name, rating, reviewCount, address, description, imageUrl };
          });
        });

        for (const business of businessData.slice(0, limit)) {
          if (!business.name) continue;

          const businessName = business.name;

          // Everything known from the result card alone. Used by both the full record and
          // the fallback record so the two cannot drift apart.
          const listing: Partial<Business> = {
            userId,
            name: businessName,
            address: business.address,
            rating: business.rating,
            totalReviews: business.reviewCount,
            category: keyword,
            source: "google_maps",
            emailStatus: "pending",
            description: business.description || null,
            logo: business.imageUrl || null,
          };

          // Set once the click succeeded, i.e. once the page may have left the result list.
          let openedDetails = false;

          try {
            const handle = await page.evaluateHandle((name) => {
              const links = Array.from(document.querySelectorAll("a"));
              return links.find((el) => el.textContent?.includes(name)) || null;
            }, businessName);

            try {
              const businessLink = handle.asElement() as ElementHandle<Element> | null;

              if (!businessLink) {
                // Same outcome as before (the business is skipped), but no longer silent.
                console.warn(`No result link found for business ${businessName}; skipping`);
              } else {
                await businessLink.click();
                openedDetails = true;
                // Give the detail panel time to render.
                await pause(2000);

                const details = await page.evaluate(() => {
                  const websiteEl =
                    document.querySelector('a[data-item-id="authority"]') ||
                    document.querySelector('a[data-tooltip="Open website"]') ||
                    document.querySelector('a[aria-label*="Website"]');

                  const phoneEl =
                    document.querySelector('button[data-item-id^="phone"]') ||
                    document.querySelector('button[data-tooltip="Copy phone number"]') ||
                    document.querySelector('button[aria-label*="Phone"]');

                  return {
                    website: websiteEl?.getAttribute("href") || null,
                    phone: phoneEl?.getAttribute("aria-label")?.replace("Phone: ", "") || null,
                  };
                });

                // With a website the email is whatever the site exposes (possibly null);
                // without one it is guessed from the name.
                const email = details.website
                  ? await extractEmailFromWebsite(details.website)
                  : generateEmail(businessName);

                results.push({
                  ...listing,
                  email,
                  phone: details.phone || null,
                  website: details.website || null,
                  lastContactedAt: null,
                });
              }
            } finally {
              // Release the remote object; the handle may already be stale after navigation.
              await handle.dispose().catch(() => undefined);
            }
          } catch (error) {
            console.error(`Error processing business ${business.name}:`, error);

            results.push({
              ...listing,
              email: generateEmail(businessName),
            });
          } finally {
            // Return to the result list on success AND on failure, otherwise the next
            // business cannot be found. A failure here must not add a second record for
            // the same business, so it is only logged.
            if (openedDetails) {
              try {
                await page.goBack();
                await pause(1000);
              } catch (error) {
                console.error(`Error returning to results after ${business.name}:`, error);
              }
            }
          }

          if (results.length >= limit) break;
        }

        if (results.length >= limit) break;
      } catch (error) {
        console.error(`Error scraping keyword "${keyword}":`, error);
      }
    }

    return results;
  } catch (error) {
    console.error("Error in Google Maps scraping:", error);
    throw error;
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
|
|
| |
| |
| |
/**
 * Scrolls the Google Maps result feed (`div[role="feed"]`) to its bottom several times so
 * that additional results get a chance to load.
 *
 * Does nothing when the feed element is not present on the page.
 *
 * @param page Page whose feed should be scrolled.
 * @param scrollCount How many times to jump to the bottom of the feed (default 3).
 * @param delayMs Pause after each jump, in milliseconds (default 2000).
 */
async function autoScroll(
  page: import("puppeteer").Page,
  scrollCount = 3,
  delayMs = 2000
): Promise<void> {
  // The callback executes in the browser, so the settings are passed as arguments
  // instead of being captured from this scope.
  await page.evaluate(
    async (count: number, delay: number) => {
      const feed = document.querySelector('div[role="feed"]');
      if (!feed) return;

      for (let i = 0; i < count; i++) {
        feed.scrollTop = feed.scrollHeight;
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    },
    scrollCount,
    delayMs
  );
}
|
|
| |
| |
| |
/**
 * Downloads a website's landing page and returns the first plausible email address in it.
 *
 * This is best-effort: any problem (invalid or non-http(s) URL, network error, timeout,
 * non-text response, no address found) yields `null` rather than an exception.
 *
 * Addresses containing "example.com", "domain.com" or "wixpress.com" are ignored, as are
 * matches that are really asset file names such as `logo@2x.png`.
 *
 * @param website Absolute URL of the site to inspect.
 * @returns The first acceptable address, or `null`.
 */
async function extractEmailFromWebsite(website: string): Promise<string | null> {
  // The URL comes from scraped page content, so only plain web URLs are fetched.
  let target: URL;
  try {
    target = new URL(website);
  } catch {
    return null;
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    return null;
  }

  try {
    const axios = (await import("axios")).default;
    const response = await axios.get(target.href, { timeout: 5000 });
    const html: unknown = response.data;

    // axios parses JSON bodies into objects; only text can be searched.
    if (typeof html !== "string") {
      return null;
    }

    // Local part may contain "+" tags; the domain must end in an alphabetic TLD.
    const emailRegex = /[\w.+-]+@[\w-]+(?:\.[\w-]+)*\.[a-z]{2,}/gi;
    const candidates = html.match(emailRegex) ?? [];

    const ignoredFragments = ["example.com", "domain.com", "wixpress.com"];
    // Retina/asset file names ("icon@2x.png") look like emails to the regex above.
    const assetSuffix = /\.(?:png|jpe?g|gif|svg|webp|ico|css|js)$/i;

    const found = candidates.find(
      (candidate) =>
        !assetSuffix.test(candidate) &&
        !ignoredFragments.some((fragment) => candidate.includes(fragment))
    );

    return found ?? null;
  } catch {
    // Deliberate best-effort: a site that cannot be fetched simply has no email.
    return null;
  }
}
|
|
| |
| |
| |
/**
 * Guesses a contact address of the form `info@<name>.com` from a business name.
 *
 * The name is lower-cased and reduced to ASCII letters and digits. Accented letters are
 * folded to their base letter first ("Café" -> "cafe") instead of being dropped.
 *
 * The result is only a guess and is not checked for deliverability. If the name contains
 * no ASCII letters or digits the result is `info@.com`, which is not a valid address, so
 * callers that need a usable address should validate it.
 *
 * @param businessName Display name of the business.
 * @returns The guessed address.
 */
function generateEmail(businessName: string): string {
  const cleaned = businessName
    // Split accented characters into base letter + combining mark, then drop the marks.
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase()
    // Whitespace and punctuation are both removed, so one pass is enough.
    .replace(/[^a-z0-9]/g, "");

  return `info@${cleaned}.com`;
}
|
|
| |
| |
| |
/**
 * Background worker that polls the `scrapingJobs` table for pending jobs and runs them.
 *
 * Each iteration takes at most one job with status "pending", marks it "running", scrapes
 * Google Maps for the job's keywords, inserts the found businesses, and marks the job
 * "completed" (with `businessesFound` and `completedAt`) or "failed".
 *
 * Only one polling loop runs per instance, and `stop()` interrupts the idle wait between
 * polls. A job that is already being scraped is not cancelled by `stop()`; the loop ends
 * once that job has finished.
 */
export class RealScrapingService {
  private isRunning = false;
  /** Promise of the active polling loop, or null when no loop is running. */
  private loop: Promise<void> | null = null;
  /** Cuts the current idle wait short; null while the loop is not waiting. */
  private wake: (() => void) | null = null;

  private readonly pollIntervalMs: number;
  private readonly errorBackoffMs: number;
  private readonly resultLimit: number;

  /**
   * @param options.pollIntervalMs Wait between polls, in milliseconds (default 10000).
   * @param options.errorBackoffMs Wait after an unexpected polling error (default 5000).
   * @param options.resultLimit Maximum businesses scraped per job (default 50).
   */
  constructor(
    options: { pollIntervalMs?: number; errorBackoffMs?: number; resultLimit?: number } = {}
  ) {
    this.pollIntervalMs = options.pollIntervalMs ?? 10000;
    this.errorBackoffMs = options.errorBackoffMs ?? 5000;
    this.resultLimit = options.resultLimit ?? 50;
  }

  /**
   * Starts polling. Calling it again while a loop is active does not start a second loop;
   * it returns the promise of the existing one.
   *
   * @returns A promise that settles when the polling loop has ended (after `stop()`).
   */
  start(): Promise<void> {
    // Set unconditionally so that stop() immediately followed by start() keeps the
    // still-alive loop going instead of letting it exit.
    this.isRunning = true;

    if (this.loop === null) {
      console.log("🚀 Real scraping service started");
      this.loop = this.run().finally(() => {
        this.loop = null;
      });
    }

    return this.loop;
  }

  /** Requests the polling loop to end and interrupts its idle wait, if any. */
  stop() {
    this.isRunning = false;
    this.wake?.();
    console.log("🛑 Scraping service stopped");
  }

  /** Polls for pending jobs until the service is stopped. */
  private async run(): Promise<void> {
    while (this.isRunning) {
      try {
        const pendingJobs = await db.query.scrapingJobs.findMany({
          where: eq(scrapingJobs.status, "pending"),
          limit: 1,
        });

        for (const job of pendingJobs) {
          try {
            await db
              .update(scrapingJobs)
              .set({ status: "running" })
              .where(eq(scrapingJobs.id, job.id));

            const results = await scrapeGoogleMapsReal(
              { keywords: job.keywords as string[], limit: this.resultLimit },
              job.userId
            );

            if (results.length > 0) {
              console.log("🚀 Saving results to database");

              // The scraper returns Partial<Business>, which the insert type does not
              // accept as-is; the cast is kept from the original code.
              // eslint-disable-next-line @typescript-eslint/no-explicit-any
              await db.insert(businesses).values(results as any);
            }

            await db
              .update(scrapingJobs)
              .set({
                status: "completed",
                businessesFound: results.length,
                completedAt: new Date(),
              })
              .where(eq(scrapingJobs.id, job.id));

            console.log(
              `✅ Scraping job ${job.id} completed: ${results.length} businesses found`
            );
          } catch (error) {
            console.error(`❌ Scraping job ${job.id} failed:`, error);
            await db
              .update(scrapingJobs)
              .set({ status: "failed" })
              .where(eq(scrapingJobs.id, job.id));
          }
        }

        await this.idle(this.pollIntervalMs);
      } catch (error) {
        console.error("Error in scraping service:", error);
        await this.idle(this.errorBackoffMs);
      }
    }
  }

  /** Waits for `ms` milliseconds, or less if `stop()` is called in the meantime. */
  private idle(ms: number): Promise<void> {
    // stop() may have been called while a job was running; do not start a wait then.
    if (!this.isRunning) {
      return Promise.resolve();
    }

    return new Promise<void>((resolve) => {
      const finish = () => {
        clearTimeout(timer);
        this.wake = null;
        resolve();
      };
      const timer = setTimeout(finish, ms);
      this.wake = finish;
    });
  }
}
|
|