# app/scraper.py (Enhanced version)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
from typing import List, Dict, Optional
from pathlib import Path
import logging

# Selenium imports are kept for pages that may need JavaScript rendering;
# the basic flow below uses requests only.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class JupiterFAQScraper:
    """Enhanced scraper for the Jupiter Money website"""

    def __init__(self):
        self.base_url = "https://jupiter.money"
        self.target_urls = [
            "https://jupiter.money/savings-account/",
            "https://jupiter.money/pro-salary-account/",
            # ... other URLs
        ]
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def test_scraping(self) -> Dict[str, bool]:
        """Test if scraping actually works"""
        results = {}
        response = None

        # Test 1: Can we access the website?
        try:
            response = self.session.get(self.base_url, timeout=10)
            results['website_accessible'] = response.status_code == 200
            logger.info(f"Website accessible: {results['website_accessible']}")
        except requests.RequestException:
            results['website_accessible'] = False
            logger.error("Cannot access Jupiter website")

        # Test 2: Can we find FAQ content?
        if results['website_accessible'] and response is not None:
            try:
                soup = BeautifulSoup(response.content, 'html.parser')
                # Look for common FAQ indicators
                faq_indicators = ['faq', 'help', 'support', 'question', 'answer']
                content = soup.get_text().lower()
                results['faq_content_found'] = any(indicator in content for indicator in faq_indicators)
                logger.info(f"FAQ content found: {results['faq_content_found']}")
            except Exception:
                results['faq_content_found'] = False

        return results

    def scrape_with_fallback(self) -> pd.DataFrame:
        """Try multiple scraping methods with fallback"""
        all_faqs = []

        # Method 1: Try actual scraping
        logger.info("Attempting to scrape Jupiter website...")
        test_results = self.test_scraping()

        if test_results.get('website_accessible'):
            # Try basic scraping
            for url in self.target_urls[:3]:  # Test with the first 3 URLs
                try:
                    faqs = self.scrape_page_safe(url)
                    all_faqs.extend(faqs)
                    if faqs:
                        logger.info(f"Successfully scraped {len(faqs)} FAQs from {url}")
                except Exception as e:
                    logger.warning(f"Failed to scrape {url}: {e}")

        # Method 2: If scraping fails or yields too little data, use the fallback
        if len(all_faqs) < 10:
            logger.warning("Actual scraping yielded insufficient data. Using fallback FAQ data...")
            all_faqs = self.get_fallback_faqs()

        # Create DataFrame
        df = pd.DataFrame(all_faqs)
        if not df.empty:
            df = df.drop_duplicates(subset=['question'])
        return df

    def scrape_page_safe(self, url: str) -> List[Dict]:
        """Safely scrape a page with error handling"""
        faqs = []
        try:
            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                logger.warning(f"Got status code {response.status_code} for {url}")
                return faqs

            soup = BeautifulSoup(response.content, 'html.parser')

            # Strategy 1: Look for structured data (JSON-LD)
            scripts = soup.find_all('script', type='application/ld+json')
            for script in scripts:
                try:
                    data = json.loads(script.string)
                    if '@type' in data and 'FAQ' in str(data.get('@type')):
                        # Extract FAQ structured data
                        faqs.extend(self.extract_structured_faqs(data))
                except (json.JSONDecodeError, TypeError, AttributeError):
                    continue

            # Strategy 2: Look for FAQ sections
            faq_sections = soup.find_all(['div', 'section'],
                                         class_=re.compile(r'faq|question|help|support', re.I))
            for section in faq_sections[:5]:  # Limit to avoid processing too many sections
                faqs.extend(self.extract_section_faqs(section, url))

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")

        return faqs

    def extract_structured_faqs(self, data: dict) -> List[Dict]:
        """Extract FAQs from structured data"""
        faqs = []
        if isinstance(data, dict):
            if data.get('@type') == 'FAQPage':
                for item in data.get('mainEntity', []):
                    if item.get('@type') == 'Question':
                        faqs.append({
                            'question': self._clean_text(item.get('name', '')),
                            'answer': self._clean_text(
                                item.get('acceptedAnswer', {}).get('text', '')
                            ),
                            'category': 'General'
                        })
        return faqs
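
    # For reference, extract_structured_faqs expects schema.org FAQPage JSON-LD of
    # roughly this shape (illustrative example only, not taken from jupiter.money):
    #
    #   {
    #     "@context": "https://schema.org",
    #     "@type": "FAQPage",
    #     "mainEntity": [
    #       {
    #         "@type": "Question",
    #         "name": "How do I open an account?",
    #         "acceptedAnswer": {"@type": "Answer", "text": "Install the app and complete KYC."}
    #       }
    #     ]
    #   }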

    def extract_section_faqs(self, section, url: str) -> List[Dict]:
        """Extract FAQs from a page section"""
        faqs = []
        category = self._get_category_from_url(url)

        # Look for Q&A pairs
        questions = section.find_all(['h2', 'h3', 'h4', 'dt', 'div'],
                                     class_=re.compile(r'question|title|header', re.I))

        for q in questions[:10]:  # Limit to avoid collecting too many
            # Try to find the corresponding answer
            answer = None

            # Check the next sibling
            next_elem = q.find_next_sibling()
            if next_elem and next_elem.name in ['p', 'div', 'dd']:
                answer = next_elem

            # Check the parent's next sibling
            if not answer:
                parent = q.parent
                if parent:
                    next_elem = parent.find_next_sibling()
                    if next_elem:
                        answer = next_elem.find(['p', 'div'])

            if answer:
                faqs.append({
                    'question': self._clean_text(q.get_text()),
                    'answer': self._clean_text(answer.get_text()),
                    'category': category
                })

        return faqs

    def get_fallback_faqs(self) -> List[Dict]:
        """Return comprehensive fallback FAQs based on Jupiter's services"""
        # This is the fallback data that will be used if scraping fails
        # Based on the FAQs you provided earlier
        return [
            # Account
            {
                'question': 'What is the Jupiter All-in-1 Savings Account?',
                'answer': 'The All-in-1 Savings Account on Jupiter powered by Federal Bank helps you manage your money better with faster payments, smart saving tools, and investment insights—all in one place.',
                'category': 'Account'
            },
            {
                'question': 'How do I open a Jupiter Savings Account?',
                'answer': 'You can open your Jupiter digital account by following a few simple steps: 1. Install the Jupiter app 2. Tap "Open an all-in-1 Savings Account" while selecting a Jupiter experience 3. Complete your video KYC',
                'category': 'Account'
            },
            # Add more FAQs here from your data...
            # (Include all the FAQs you provided)
        ]

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace and strip any leftover HTML tags"""
        if not text:
            return ""
        text = ' '.join(text.split())
        text = re.sub(r'<[^>]+>', '', text)
        return text.strip()

    def _get_category_from_url(self, url: str) -> str:
        """Derive an FAQ category from the URL"""
        url_lower = url.lower()
        if 'account' in url_lower:
            return 'Account'
        elif 'payment' in url_lower or 'upi' in url_lower:
            return 'Payments'
        elif 'card' in url_lower:
            return 'Cards'
        elif 'loan' in url_lower:
            return 'Loans'
        elif 'invest' in url_lower or 'mutual' in url_lower:
            return 'Investments'
        return 'General'

    def run_complete_scraping(self) -> pd.DataFrame:
        """Main method to run scraping with all fallbacks"""
        logger.info("Starting Jupiter FAQ scraping process...")

        # Try scraping with fallback
        df = self.scrape_with_fallback()

        if df.empty:
            logger.error("No FAQ data could be obtained!")
        else:
            logger.info(f"Total FAQs collected: {len(df)}")
            # Save to CSV
            self.save_to_csv(df)

        return df

    def save_to_csv(self, df: pd.DataFrame, filename: str = "data/faqs.csv"):
        """Save FAQs to CSV"""
        import os
        # Guard against a bare filename with no directory component
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)

        if not df.empty:
            df = df[['question', 'answer', 'category']]
            df.to_csv(filename, index=False)
            logger.info(f"Saved {len(df)} FAQs to {filename}")


class FAQUpdater:
    """Manages FAQ updates with reliability checks"""

    def __init__(self):
        self.scraper = JupiterFAQScraper()
        self.faq_file = "data/faqs.csv"

    def check_and_update(self, force_update: bool = False) -> pd.DataFrame:
        """Check and update FAQs with verification"""
        import os
        from datetime import datetime, timedelta

        # First, check whether we already have recent FAQ data
        if os.path.exists(self.faq_file) and not force_update:
            # Load existing data
            existing_df = pd.read_csv(self.faq_file)

            # Check file age
            file_time = datetime.fromtimestamp(os.path.getmtime(self.faq_file))
            if datetime.now() - file_time < timedelta(days=7):
                logger.info(f"FAQ data is recent (updated {file_time.strftime('%Y-%m-%d')})")
                return existing_df

        # Try to update
        logger.info("Attempting to update FAQ data...")

        # Test whether scraping works at all
        test_results = self.scraper.test_scraping()
        if not test_results.get('website_accessible'):
            logger.warning("Cannot access Jupiter website. Using existing/fallback data.")
            if os.path.exists(self.faq_file):
                return pd.read_csv(self.faq_file)
            else:
                # Use fallback data
                fallback_faqs = self.scraper.get_fallback_faqs()
                df = pd.DataFrame(fallback_faqs)
                self.scraper.save_to_csv(df)
                return df

        # Back up the existing file *before* scraping, since
        # run_complete_scraping() overwrites data/faqs.csv on success
        existing_df = None
        if os.path.exists(self.faq_file):
            existing_df = pd.read_csv(self.faq_file)
            backup_name = f"data/faqs_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            os.rename(self.faq_file, backup_name)
            logger.info(f"Backed up old FAQs to {backup_name}")

        # Try scraping
        new_df = self.scraper.run_complete_scraping()

        # Verify the scraped data
        if self.verify_scraped_data(new_df):
            return new_df
        else:
            logger.warning("Scraped data failed verification. Using existing/fallback data.")
            if existing_df is not None:
                # Restore the previous data so data/faqs.csv stays consistent
                self.scraper.save_to_csv(existing_df)
                return existing_df
            else:
                # Use the comprehensive fallback
                return self.create_comprehensive_fallback()

    def verify_scraped_data(self, df: pd.DataFrame) -> bool:
        """Verify if scraped data is valid"""
        if df.empty:
            return False

        # Check minimum requirements
        if len(df) < 20:  # Expecting at least 20 FAQs
            logger.warning(f"Only {len(df)} FAQs scraped, seems too low")
            return False

        # Check if we have multiple categories
        if df['category'].nunique() < 3:
            logger.warning("Not enough FAQ categories")
            return False

        # Check answer quality
        avg_answer_length = df['answer'].str.len().mean()
        if avg_answer_length < 50:
            logger.warning("Answers seem too short")
            return False

        return True

    def create_comprehensive_fallback(self) -> pd.DataFrame:
        """Create comprehensive fallback FAQ data"""
        # This includes ALL the FAQs you provided
        fallback_data = [
            # Account FAQs
            {"question": "What is the Jupiter All-in-1 Savings Account?", "answer": "The All-in-1 Savings Account on Jupiter powered by Federal Bank helps you manage your money better with faster payments, smart saving tools, and investment insights—all in one place.", "category": "Account"},
            {"question": "How do I open a Jupiter Savings Account?", "answer": "You can open your Jupiter digital account by following a few simple steps: 1. Install the Jupiter app 2. Tap 'Open an all-in-1 Savings Account' while selecting a Jupiter experience 3. Complete your video KYC", "category": "Account"},
            {"question": "Do I earn Jewels for making payments?", "answer": "Yes! You earn up to 1% cashback as Jewels on: • UPI payments • Debit Card spends (online & offline) • Investments in Digital Gold", "category": "Rewards"},
            {"question": "Can I use my Jupiter Debit Card outside India?", "answer": "Absolutely. You can spend in over 120 countries with 0% forex fee on international transactions using your Jupiter Debit Card.", "category": "Card"},
            {"question": "Do I earn Jewels on International payments?", "answer": "Yes, you also earn up to 1% cashback on online and offline international spends.", "category": "Rewards"},
            {"question": "What payment modes are available with the Jupiter account?", "answer": "You can make superfast payments with UPI, IMPS, and debit card—whether it's for recharges, bills, or merchant transactions.", "category": "Payments"},
            {"question": "Can I invest using my Jupiter account?", "answer": "Yes! You can invest in Mutual Funds and Digital Gold with up to 1.5% extra returns on curated mutual fund plans.", "category": "Investments"},
{"question": "What additional benefits do I get with the Savings Account?", "answer": "You earn up to 1% cashback as Jewels on: • Free cheque book • Free IMPS transfers • ATM withdrawals", "category": "Account"}, | |
# Include ALL other FAQs from your data here... | |
] | |
df = pd.DataFrame(fallback_data) | |
self.scraper.save_to_csv(df) | |
return df | |

    def get_scraping_stats(self, df: pd.DataFrame) -> Dict:
        """Get statistics about FAQ data"""
        return {
            'total_faqs': len(df),
            'categories': df['category'].nunique(),
            'category_distribution': df['category'].value_counts().to_dict(),
            'avg_question_length': df['question'].str.len().mean(),
            'avg_answer_length': df['answer'].str.len().mean(),
            'data_source': 'scraped' if len(df) > 50 else 'fallback'
        }


# Create a simple test script
def test_scraper():
    """Test if the scraper can actually get data"""
    print("Testing Jupiter FAQ Scraper...")
    print("-" * 50)

    scraper = JupiterFAQScraper()

    # Test 1: Website accessibility
    test_results = scraper.test_scraping()
    print(f"Website accessible: {test_results.get('website_accessible', False)}")
    print(f"FAQ content found: {test_results.get('faq_content_found', False)}")

    # Test 2: Try scraping one page
    print("\nTesting page scraping...")
    test_url = "https://jupiter.money/savings-account/"
    faqs = scraper.scrape_page_safe(test_url)
    print(f"FAQs found on {test_url}: {len(faqs)}")

    if faqs:
        print("\nSample FAQ:")
        print(f"Q: {faqs[0]['question'][:100]}...")
        print(f"A: {faqs[0]['answer'][:100]}...")

    # Test 3: Full scraping
    print("\nRunning full scraping process...")
    df = scraper.run_complete_scraping()
    print(f"Total FAQs collected: {len(df)}")

    if not df.empty:
        print(f"Categories: {df['category'].unique()}")
        print("Data saved to: data/faqs.csv")

    return df


if __name__ == "__main__":
    # Run the test
    test_scraper()
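
# Optional usage sketch for the updater (assumes a writable data/ directory; this is
# not part of the original test flow, just an illustration of the API defined above):
#
#     updater = FAQUpdater()
#     faq_df = updater.check_and_update(force_update=False)  # refreshes at most once every 7 days
#     print(updater.get_scraping_stats(faq_df))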