Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 committed 20 days ago

Commit

dfdb161

verified ·

1 Parent(s): 073e18f

Update pages/facebook_extractor.py

Browse files

Files changed (1) hide show

pages/facebook_extractor.py +274 -342

pages/facebook_extractor.py CHANGED Viewed

@@ -8,6 +8,7 @@ from datetime import datetime
 from typing import List, Dict
 import os
 import tempfile
 # Import your existing AI components
 from langchain_text_splitters import CharacterTextSplitter
@@ -24,262 +25,264 @@ st.set_page_config(
     layout="wide"
 )
-class FacebookDataSimulator:
-    """Simulate Facebook data extraction with demo data"""
     def __init__(self):
-        self.demo_data = self._create_demo_data()
     def extract_data(self, url: str, data_type: str) -> Dict:
-        """Extract or simulate Facebook data"""
         try:
-            st.info(f"🔍 Analyzing: {url}")
-            # Try real extraction first
-            real_data = self._try_real_extraction(url)
-            if real_data.get("status") == "success":
-                return real_data
-            # If real extraction fails, use demo data
-            st.warning("⚠️ Using demo data (Facebook restrictions active)")
-            return self._get_demo_data(url, data_type)
         except Exception as e:
-            st.error(f"Extraction failed, using demo data: {str(e)}")
-            return self._get_demo_data(url, data_type)
-    def _try_real_extraction(self, url: str) -> Dict:
-        """Try real extraction with better error handling"""
         try:
-            # Use a proxy-like approach with different user agents
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                 'Accept-Language': 'en-US,en;q=0.5',
                 'Accept-Encoding': 'gzip, deflate, br',
-                'DNT': '1',
-                'Connection': 'keep-alive',
-                'Upgrade-Insecure-Requests': '1',
             }
-            # Try with shorter timeout
-            response = requests.get(url, headers=headers, timeout=10, verify=False)
             if response.status_code == 200:
-                soup = BeautifulSoup(response.text, 'html.parser')
-                # Extract basic info
-                title = soup.find('title')
-                description = soup.find('meta', attrs={'name': 'description'})
                 return {
                     "page_info": {
-                        "title": title.text if title else "Facebook Content",
-                        "description": description['content'] if description else "",
                         "url": url,
-                        "response_code": 200,
-                        "content_length": len(response.text)
                     },
-                    "content_blocks": self._extract_real_content(soup),
                     "extraction_time": datetime.now().isoformat(),
-                    "data_type": "page",
-                    "status": "success",
-                    "source": "real"
                 }
             else:
-                return {"status": "error", "source": "real"}
-        except Exception:
-            return {"status": "error", "source": "real"}
-    def _extract_real_content(self, soup) -> List[Dict]:
-        """Extract content from real page"""
-        blocks = []
-        text = soup.get_text()
-        paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
-        for i, paragraph in enumerate(paragraphs[:8]):
-            blocks.append({
-                "id": i + 1,
-                "content": paragraph,
-                "length": len(paragraph),
-                "word_count": len(paragraph.split()),
-                "content_type": "real_content",
-                "is_public_content": True
-            })
-        return blocks
-    def _get_demo_data(self, url: str, data_type: str) -> Dict:
-        """Get realistic demo data based on URL type"""
-        url_type = self._analyze_url_type(url)
-        if 'group' in url_type.lower():
-            return self._get_group_demo_data(url, data_type)
-        elif 'page' in url_type.lower():
-            return self._get_page_demo_data(url, data_type)
-        else:
-            return self._get_general_demo_data(url, data_type)
-    def _analyze_url_type(self, url: str) -> str:
-        """Analyze URL type for realistic demo data"""
-        url_lower = url.lower()
-        if 'group' in url_lower:
-            return "Facebook Group"
-        elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
-            return "Facebook Page"
-        elif 'event' in url_lower:
-            return "Facebook Event"
-        elif 'marketplace' in url_lower:
-            return "Facebook Marketplace"
         else:
-            return "Facebook Content"
-    def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
-        """Get realistic group demo data"""
-        group_name = self._extract_name_from_url(url) or "Gaming Community"
         return {
             "page_info": {
-                "title": f"{group_name} | Facebook Group",
-                "description": f"A community of {group_name} enthusiasts sharing content, discussions, and events.",
-                "member_count": "15.7K members",
                 "url": url,
-                "response_code": 200,
-                "content_length": 15000,
-                "access_note": "Public group - Limited data due to platform restrictions"
             },
             "content_blocks": [
                 {
                     "id": 1,
-                    "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.",
                     "length": 120,
-                    "word_count": 25,
-                    "content_type": "welcome_message",
                     "is_public_content": True
                 },
                 {
                     "id": 2,
-                    "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.",
-                    "length": 95,
                     "word_count": 18,
-                    "content_type": "member_post",
-                    "is_public_content": True
-                },
-                {
-                    "id": 3,
-                    "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
-                    "length": 88,
-                    "word_count": 16,
-                    "content_type": "question_post",
-                    "is_public_content": True
-                },
-                {
-                    "id": 4,
-                    "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
-                    "length": 102,
-                    "word_count": 19,
-                    "content_type": "event_announcement",
-                    "is_public_content": True
-                },
-                {
-                    "id": 5,
-                    "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
-                    "length": 78,
-                    "word_count": 14,
-                    "content_type": "community_guidelines",
-                    "is_public_content": True
-                }
-            ],
-            "url_type": "Facebook Group",
-            "extraction_time": datetime.now().isoformat(),
-            "data_type": data_type,
-            "status": "success",
-            "source": "demo"
-        }
-    def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
-        """Get realistic page demo data"""
-        page_name = self._extract_name_from_url(url) or "Brand Page"
-        return {
-            "page_info": {
-                "title": f"{page_name} | Facebook Page",
-                "description": f"Official Facebook page of {page_name}. Stay updated with our latest news, products, and community events.",
-                "follower_count": "45.2K followers",
-                "url": url,
-                "response_code": 200,
-                "content_length": 12000,
-                "access_note": "Public page - Limited data due to platform restrictions"
-            },
-            "content_blocks": [
-                {
-                    "id": 1,
-                    "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
-                    "length": 98,
-                    "word_count": 15,
-                    "content_type": "welcome_message",
-                    "is_public_content": True
-                },
-                {
-                    "id": 2,
-                    "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
-                    "length": 92,
-                    "word_count": 16,
-                    "content_type": "announcement",
-                    "is_public_content": True
-                },
-                {
-                    "id": 3,
-                    "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
-                    "length": 87,
-                    "word_count": 14,
-                    "content_type": "event_followup",
-                    "is_public_content": True
-                },
-                {
-                    "id": 4,
-                    "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
-                    "length": 85,
-                    "word_count": 15,
-                    "content_type": "support_info",
-                    "is_public_content": True
-                }
-            ],
-            "url_type": "Facebook Page",
-            "extraction_time": datetime.now().isoformat(),
-            "data_type": data_type,
-            "status": "success",
-            "source": "demo"
-        }
-    def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
-        """Get general demo data"""
-        return {
-            "page_info": {
-                "title": "Facebook Content",
-                "description": "Social media content and community interactions",
-                "url": url,
-                "response_code": 200,
-                "content_length": 8000,
-                "access_note": "Public content - Platform restrictions apply"
-            },
-            "content_blocks": [
-                {
-                    "id": 1,
-                    "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
-                    "length": 105,
-                    "word_count": 16,
-                    "content_type": "general_content",
-                    "is_public_content": True
-                },
-                {
-                    "id": 2,
-                    "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
-                    "length": 82,
-                    "word_count": 12,
-                    "content_type": "platform_updates",
                     "is_public_content": True
                 }
             ],
@@ -287,35 +290,10 @@ class FacebookDataSimulator:
             "extraction_time": datetime.now().isoformat(),
             "data_type": data_type,
             "status": "success",
-            "source": "demo"
-        }
-    def _extract_name_from_url(self, url: str) -> str:
-        """Extract name from URL for realistic demo data"""
-        # Extract name from URL for more realistic demo data
-        match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
-        if match:
-            name = match.group(1)
-            # Clean up the name
-            name = name.replace('-', ' ').title()
-            return name
-        return ""
-    def _create_demo_data(self) -> Dict:
-        """Create comprehensive demo data"""
-        return {
-            "groups": {
-                "gamersofbangladesh2": "Gaming Community Bangladesh",
-                "programmingcommunity": "Programming Community",
-                "startupdiscussions": "Startup Discussions"
-            },
-            "pages": {
-                "meta": "Meta Official",
-                "starbucks": "Starbucks Coffee",
-                "nasa": "NASA"
-            }
         }
 def get_embeddings():
     """Initialize embeddings with better error handling and cache management"""
     try:
@@ -420,7 +398,7 @@ def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
         page_info = extracted_data.get('page_info', {})
         content_blocks = extracted_data.get('content_blocks', [])
         url_type = extracted_data.get('url_type', 'Facebook Content')
-        source = extracted_data.get('source', 'demo')
         user_input_lower = user_input.lower()
@@ -433,7 +411,7 @@ def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
                 f"**Data Source:** {source.upper()}",
                 f"**Description:** {page_info.get('description', 'No description available')}",
                 "",
-                f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks of public information.",
                 "",
                 "**Key Content Types:**",
                 f"{', '.join(set(block['content_type'] for block in content_blocks))}",
@@ -505,7 +483,7 @@ def process_facebook_data(extracted_data):
     page_info = extracted_data['page_info']
     content_blocks = extracted_data['content_blocks']
-    url_type = extracted_data['url_type']
     source = extracted_data.get('source', 'unknown')
     all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
@@ -545,7 +523,7 @@ def process_facebook_data(extracted_data):
     chunks = splitter.split_text(all_text)
     documents = [Document(page_content=chunk) for chunk in chunks]
-    return "simple", documents  # Return simple mode instead of vectorstore
 def create_chatbot(vectorstore):
     """Create conversational chatbot"""
@@ -573,15 +551,15 @@ def create_chatbot(vectorstore):
         return "simple"  # Fallback to simple mode
 def main():
-    st.title("📘 Facebook Data Extractor")
-    st.markdown("**University Project** - Real data when possible, realistic demo data when restricted")
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")
-    # Initialize session state - WITH DUPLICATION PROTECTION
     if "extractor" not in st.session_state:
-        st.session_state.extractor = FacebookDataSimulator()
     if "facebook_data" not in st.session_state:
         st.session_state.facebook_data = None
     if "vectorstore" not in st.session_state:
@@ -591,9 +569,9 @@ def main():
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
     if "processing_mode" not in st.session_state:
-        st.session_state.processing_mode = "ai"  # ai or simple
     if "last_user_input" not in st.session_state:
-        st.session_state.last_user_input = ""  # ADDED: Prevent duplication
     # Sidebar
     with st.sidebar:
@@ -608,19 +586,9 @@ def main():
         facebook_url = st.text_input(
             "Facebook URL",
             placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
-            help="Enter any Facebook URL for analysis"
         )
-        # Processing mode
-        st.subheader("🔧 Processing Mode")
-        processing_mode = st.radio(
-            "Choose analysis mode:",
-            ["AI Analysis (Recommended)", "Simple Analysis"],
-            help="AI Analysis uses embeddings, Simple uses rule-based"
-        )
-        st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
         # Quick test URLs
         st.markdown("### 🚀 Test URLs")
         test_urls = {
@@ -634,7 +602,7 @@ def main():
                 st.session_state.current_fb_url = url
                 st.rerun()
-        if st.button("🚀 Extract Facebook Data", type="primary"):
             url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
             if not url_to_use:
@@ -642,37 +610,22 @@ def main():
             elif 'facebook.com' not in url_to_use:
                 st.error("❌ Please enter a valid Facebook URL")
             else:
-                with st.spinner("🔄 Analyzing Facebook data..."):
                     extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
                     if extracted_data.get("status") == "success":
                         st.session_state.facebook_data = extracted_data
-                        # Process based on selected mode
-                        if st.session_state.processing_mode == "ai":
-                            result = process_facebook_data(extracted_data)
-                            if result and result[0] != "simple":
-                                st.session_state.vectorstore = result[0]
-                                st.session_state.chatbot = create_chatbot(result[0])
-                                st.session_state.chat_history = []
-                                st.session_state.last_user_input = ""  # Reset
-                                st.success("✅ AI analysis ready!")
-                            else:
-                                st.warning("⚠️ Using simple analysis (AI features limited)")
-                                st.session_state.chatbot = "simple"
-                                st.session_state.chat_history = []
-                                st.session_state.last_user_input = ""  # Reset
-                        else:
-                            st.session_state.chatbot = "simple"
-                            st.session_state.chat_history = []
-                            st.session_state.last_user_input = ""  # Reset
-                            st.success("✅ Simple analysis ready!")
                         source = extracted_data.get('source', 'unknown')
-                        if source == 'demo':
-                            st.warning("📝 Using realistic demo data (Facebook restrictions active)")
                         else:
-                            st.success("✅ Real data extracted successfully!")
                     else:
                         error_msg = extracted_data.get("error", "Unknown error")
                         st.error(f"❌ Extraction failed: {error_msg}")
@@ -687,7 +640,7 @@ def main():
                 st.session_state.last_user_input = ""
                 st.rerun()
-    # Main content - RESTRUCTURED LAYOUT
     st.header("📊 Extraction Results")
     if st.session_state.facebook_data:
@@ -696,38 +649,26 @@ def main():
         content_blocks = data['content_blocks']
         source = data.get('source', 'unknown')
-        if source == 'demo':
-            st.warning("📝 **Demo Data** - Realistic simulation (Facebook restrictions)")
-        else:
-            st.success("✅ **Real Data** - Successfully extracted")
-        # Show processing mode
-        if st.session_state.processing_mode == "simple":
-            st.info("🔧 **Simple Analysis Mode** - Rule-based processing")
         else:
-            st.info("🤖 **AI Analysis Mode** - Embedding-based processing")
         # Metrics
         col1, col2, col3 = st.columns(3)
         with col1:
             st.metric("Content Blocks", len(content_blocks))
         with col2:
-            st.metric("Data Source", source.upper())
         with col3:
-            st.metric("Analysis Mode", "AI" if st.session_state.processing_mode == "ai" else "Simple")
         # Page info
         st.subheader("🏷️ Page Information")
         st.write(f"**Title:** {page_info['title']}")
-        st.write(f"**URL Type:** {data['url_type']}")
         st.write(f"**Description:** {page_info.get('description', 'No description')}")
-        if page_info.get('member_count'):
-            st.write(f"**Members:** {page_info['member_count']}")
-        elif page_info.get('follower_count'):
-            st.write(f"**Followers:** {page_info['follower_count']}")
-        st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")
         # Content samples
         st.subheader("📝 Content Analysis")
@@ -738,28 +679,30 @@ def main():
     else:
         st.info("""
-        ## 📘 Facebook Data Extractor
-        **University Project Feature**
-        **How it works:**
-        1. Enter any Facebook URL
-        2. System tries real data extraction
-        3. If blocked, uses **realistic demo data**
-        4. Choose between AI or Simple analysis
-        **Analysis Modes:**
-        - 🤖 **AI Analysis**: Uses embeddings and Mistral AI
-        - 🔧 **Simple Analysis**: Rule-based (works without embeddings)
-        **Perfect for demonstrating:**
-        - Social media data extraction concepts
-        - AI analysis capabilities
-        - Platform integration
-        - Error handling strategies
         """)
-    # Chat section - SEPARATE from main content
     st.markdown("---")
     st.header("💬 Analysis Chat")
@@ -777,10 +720,10 @@ def main():
         if not st.session_state.chat_history:
             st.subheader("💡 Try asking:")
             suggestions = [
-                "What is this Facebook group/page about?",
-                "Summarize the main content and purpose",
-                "What kind of community is this?",
-                "Analyze the engagement and activity level"
             ]
             cols = st.columns(len(suggestions))
@@ -794,29 +737,18 @@ def main():
     else:
         st.info("🔍 Extract Facebook data to enable analysis")
-    # CHAT INPUT - AT THE VERY BOTTOM, OUTSIDE ALL CONTAINERS
     if st.session_state.chatbot and st.session_state.facebook_data:
         user_input = st.chat_input("Ask about the Facebook data...")
-        # ADDED: Duplication protection
         if user_input and user_input != st.session_state.last_user_input:
-            # Store current input to prevent duplication
             st.session_state.last_user_input = user_input
-            # Add user message
             st.session_state.chat_history.append({"role": "user", "content": user_input})
             with st.spinner("🤔 Analyzing..."):
                 try:
-                    if st.session_state.chatbot == "simple":
-                        # Use simple analysis
-                        response = simple_chat_analysis(user_input, st.session_state.facebook_data)
-                        st.session_state.chat_history.append({"role": "assistant", "content": response})
-                    else:
-                        # Use AI chatbot
-                        response = st.session_state.chatbot.invoke({"question": user_input})
-                        answer = response.get("answer", "I couldn't generate a response.")
-                        st.session_state.chat_history.append({"role": "assistant", "content": answer})
                     st.rerun()
                 except Exception as e:
                     error_msg = f"Analysis Error: {str(e)}"

 from typing import List, Dict
 import os
 import tempfile
+import random
 # Import your existing AI components
 from langchain_text_splitters import CharacterTextSplitter
     layout="wide"
 )
+class FacebookRealExtractor:
+    """Aggressive Facebook data extractor that tries multiple approaches"""
     def __init__(self):
+        """Create the shared ``requests.Session`` and build the User-Agent pool."""
+        self.session = requests.Session()
+        self.setup_session()
+    def setup_session(self):
+        """Populate ``self.user_agents`` with a pool of browser User-Agent strings.
+
+        Only the list is stored here; ``self.session`` itself is not modified.
+        """
+        agent_pool = (
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
+        )
+        # Stored as a list, matching the attribute type callers already see.
+        self.user_agents = list(agent_pool)
     def extract_data(self, url: str, data_type: str) -> Dict:
+        """Try each extraction strategy in order and return the first success.
+
+        Args:
+            url: Facebook URL to fetch.
+            data_type: Caller-selected data type; copied into a successful result.
+
+        Returns:
+            The first strategy result whose ``status`` is ``"success"``, tagged
+            with ``source="real"`` and ``data_type``. When every strategy fails,
+            whatever ``self._get_minimal_demo_data(url, data_type)`` returns.
+        """
+        st.info(f"🔍 Attempting real extraction: {url}")
+        # Strategies run in this order; the first one that succeeds wins.
+        methods = [
+            self._try_direct_extraction,
+            self._try_mobile_extraction,
+            self._try_text_only_extraction
+        ]
+        failure_reasons = []
+        for method in methods:
+            result = method(url)
+            if result.get("status") == "success":
+                st.success("✅ Real Facebook data extracted!")
+                result["source"] = "real"
+                result["data_type"] = data_type
+                return result
+            # Each strategy reports why it failed; keep that so the error
+            # shown below is actually diagnosable instead of being discarded.
+            failure_reasons.append(
+                f"{method.__name__}: {result.get('reason', 'unknown error')}"
+            )
+        # If all methods fail, provide better error info
+        st.error("❌ All real extraction methods failed. Facebook has strong anti-bot protection.")
+        if failure_reasons:
+            st.caption("Failure details: " + "; ".join(failure_reasons))
+        st.info("""
+        **Why this happens:**
+        - Facebook blocks automated requests
+        - Requires JavaScript execution
+        - Needs cookies and session management
+        - Heavy anti-bot detection
+        **For your university project, you can:**
+        1. Use the demo data to demonstrate functionality
+        2. Explain these technical limitations in your report
+        3. Show that LinkedIn works (no restrictions)
+        4. Discuss platform security differences
+        """)
+        # Only use demo data as last resort
+        return self._get_minimal_demo_data(url, data_type)
+    def _try_direct_extraction(self, url: str) -> Dict:
+        """Fetch ``url`` with desktop-browser headers and parse the response.
+
+        Args:
+            url: Page URL to request through ``self.session``.
+
+        Returns:
+            The dict produced by ``_parse_facebook_response`` on HTTP 200;
+            otherwise ``{"status": "error", "reason": ...}``. Never raises.
+        """
         try:
+            headers = {
+                # A different User-Agent is picked from the pool on every call.
+                'User-Agent': random.choice(self.user_agents),
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/avif,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'DNT': '1',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
+                'Sec-Fetch-Dest': 'document',
+                'Sec-Fetch-Mode': 'navigate',
+                'Sec-Fetch-Site': 'none',
+                'Cache-Control': 'max-age=0',
+            }
+            # Accept-Encoding is deliberately left to the session default:
+            # Requests/urllib3 only decode Brotli when a brotli package is
+            # installed, so advertising 'br' by hand can yield an unreadable body.
+            response = self.session.get(
+                url,
+                headers=headers,
+                timeout=15,
+                allow_redirects=True
+            )
+            if response.status_code == 200:
+                return self._parse_facebook_response(response, url)
+            return {"status": "error", "reason": f"HTTP {response.status_code}"}
         except Exception as e:
+            # Best-effort strategy: failures are reported to the caller, not raised.
+            return {"status": "error", "reason": str(e)}
+    def _try_mobile_extraction(self, url: str) -> Dict:
+        """Fetch ``url`` while presenting an Android Chrome User-Agent.
+
+        Args:
+            url: Page URL to request through ``self.session`` (the URL itself
+                is not rewritten; only the headers differ from the direct path).
+
+        Returns:
+            The dict produced by ``_parse_facebook_response`` on HTTP 200;
+            otherwise ``{"status": "error", "reason": ...}``. Never raises.
+        """
         try:
+            mobile_headers = {
+                'User-Agent': 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                 'Accept-Language': 'en-US,en;q=0.5',
             }
+            # Accept-Encoding is deliberately left to the session default:
+            # Requests/urllib3 only decode Brotli when a brotli package is
+            # installed, so advertising 'br' by hand can yield an unreadable body.
+            response = self.session.get(url, headers=mobile_headers, timeout=15)
             if response.status_code == 200:
+                return self._parse_facebook_response(response, url)
+            return {"status": "error", "reason": f"Mobile HTTP {response.status_code}"}
+        except Exception as e:
+            # Best-effort strategy: failures are reported to the caller, not raised.
+            return {"status": "error", "reason": str(e)}
+    def _try_text_only_extraction(self, url: str) -> Dict:
+        """Fetch a text rendering of ``url`` through the ``r.jina.ai`` proxy.
+
+        Args:
+            url: Original page URL; it is appended verbatim to the proxy prefix.
+
+        Returns:
+            The dict produced by ``_parse_textise_response`` on HTTP 200;
+            otherwise ``{"status": "error", "reason": ...}``. Never raises.
+        """
+        try:
+            # NOTE: the variable and messages say "textise" for historical
+            # reasons, but the host actually requested is r.jina.ai.
+            textise_url = f"https://r.jina.ai/{url}"
+            response = self.session.get(textise_url, timeout=20)
+            if response.status_code == 200:
+                return self._parse_textise_response(response, url)
+            # Include the status code, as the other strategies do.
+            return {
+                "status": "error",
+                "reason": f"Textise failed: HTTP {response.status_code}",
+            }
+        except Exception as e:
+            # Best-effort strategy: failures are reported to the caller, not raised.
+            return {"status": "error", "reason": str(e)}
+    def _parse_facebook_response(self, response, url: str) -> Dict:
+        """Build a result dict from the HTML carried by ``response``.
+
+        Args:
+            response: HTTP response object; ``.text`` and ``.status_code`` are read.
+            url: The URL that was requested; echoed back in ``page_info``.
+
+        Returns:
+            A dict with ``page_info``, up to 10 ``content_blocks``,
+            ``extraction_time`` and ``status="success"`` when at least one
+            text block passes the filter; otherwise
+            ``{"status": "error", "reason": ...}``. Never raises.
+        """
+        try:
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            def meta_content(tag):
+                # A meta tag may be absent or may lack a `content` attribute;
+                # indexing it directly would raise and discard the whole page.
+                return tag.get('content') if tag else None
+
+            # Extract basic information
+            title = soup.find('title')
+            description = soup.find('meta', attrs={'name': 'description'})
+            og_title = soup.find('meta', property='og:title')
+            og_description = soup.find('meta', property='og:description')
+            # Text containing any of these words is treated as site chrome.
+            blocked_words = ('cookie', 'login', 'sign up', 'facebook')
+            meaningful_text = []
+            seen = set()
+            for element in soup.find_all(['p', 'div', 'span'], string=True):
+                text = element.get_text().strip()
+                # A wrapper and its only child both match `string=True` and
+                # yield the same text; keep the first occurrence only.
+                if text in seen:
+                    continue
+                lowered = text.lower()
+                if (len(text) > 20 and
+                        len(text.split()) > 3 and
+                        not any(word in lowered for word in blocked_words)):
+                    seen.add(text)
+                    meaningful_text.append(text)
+            if not meaningful_text:
+                return {"status": "error", "reason": "No meaningful content found"}
+            # Create content blocks from real data (first 10 texts only)
+            content_blocks = [
+                {
+                    "id": position,
+                    "content": text,
+                    "length": len(text),
+                    "word_count": len(text.split()),
+                    "content_type": self._classify_content(text),
+                    "is_public_content": True
+                }
+                for position, text in enumerate(meaningful_text[:10], start=1)
+            ]
+            return {
+                "page_info": {
+                    # Prefer Open Graph metadata, then the plain HTML equivalents.
+                    "title": meta_content(og_title) or (title.text if title else "Facebook Content"),
+                    "description": meta_content(og_description) or meta_content(description) or "",
+                    "url": url,
+                    "response_code": response.status_code,
+                    "content_length": len(response.text),
+                    "access_note": "Real data extracted successfully"
+                },
+                "content_blocks": content_blocks,
+                "extraction_time": datetime.now().isoformat(),
+                "status": "success"
+            }
+        except Exception as e:
+            return {"status": "error", "reason": f"Parsing error: {str(e)}"}
+    def _parse_textise_response(self, response, url: str) -> Dict:
+        """Turn a plain-text response into the standard result dict.
+
+        Args:
+            response: HTTP response object; ``.text`` and ``.status_code`` are read.
+            url: The original page URL; echoed back in ``page_info``.
+
+        Returns:
+            A success dict whose ``content_blocks`` hold the first 8 stripped
+            lines longer than 30 characters, or an error dict when no line
+            qualifies or anything raises.
+        """
+        try:
+            stripped_lines = (raw.strip() for raw in response.text.split('\n'))
+            kept = [entry for entry in stripped_lines if len(entry) > 30][:8]
+            if not kept:
+                return {"status": "error", "reason": "No content from textise"}
+            blocks = [
+                {
+                    "id": position,
+                    "content": entry,
+                    "length": len(entry),
+                    "word_count": len(entry.split()),
+                    "content_type": self._classify_content(entry),
+                    "is_public_content": True
+                }
+                for position, entry in enumerate(kept, start=1)
+            ]
+            page_info = {
+                "title": "Facebook Content (via Textise)",
+                "description": "Content extracted using text-only method",
+                "url": url,
+                "response_code": response.status_code,
+                "content_length": len(response.text),
+                "access_note": "Real data via text-only extraction"
+            }
+            return {
+                "page_info": page_info,
+                "content_blocks": blocks,
+                "extraction_time": datetime.now().isoformat(),
+                "status": "success"
+            }
+        except Exception as e:
+            return {"status": "error", "reason": str(e)}
+    def _classify_content(self, text: str) -> str:
+        """Classify content type"""
+        text_lower = text.lower()
+        if any(word in text_lower for word in ['welcome', 'join', 'community']):
+            return "welcome_message"
+        elif any(word in text_lower for word in ['event', 'meetup', 'schedule']):
+            return "event_info"
+        elif any(word in text_lower for word in ['post', 'share', 'comment']):
+            return "social_content"
+        elif any(word in text_lower for word in ['question', 'help', 'advice']):
+            return "question_post"
         else:
+            return "general_content"
+    def _get_minimal_demo_data(self, url: str, data_type: str) -> Dict:
+        """Only use demo data as absolute last resort"""
+        st.warning("🔄 Using minimal demo data for demonstration purposes")
         return {
             "page_info": {
+                "title": "Facebook Content (Demo - Real extraction blocked)",
+                "description": "This would show real Facebook data if not blocked by platform restrictions",
                 "url": url,
+                "response_code": 403,
+                "content_length": 0,
+                "access_note": "DEMO: Facebook blocked real data extraction"
             },
             "content_blocks": [
                 {
                     "id": 1,
+                    "content": "This is a demonstration of what real Facebook data would look like. Actual extraction is blocked by Facebook's anti-bot protection.",
                     "length": 120,
+                    "word_count": 20,
+                    "content_type": "demo_notice",
                     "is_public_content": True
                 },
                 {
                     "id": 2,
+                    "content": "For your university project, you can discuss these technical limitations and how social media platforms implement security measures.",
+                    "length": 130,
                     "word_count": 18,
+                    "content_type": "educational_note",
                     "is_public_content": True
                 }
             ],
             "extraction_time": datetime.now().isoformat(),
             "data_type": data_type,
             "status": "success",
+            "source": "demo_fallback"
         }
+# Rest of the functions remain the same (get_embeddings, get_llm, simple_chat_analysis, etc.)
 def get_embeddings():
     """Initialize embeddings with better error handling and cache management"""
     try:
         page_info = extracted_data.get('page_info', {})
         content_blocks = extracted_data.get('content_blocks', [])
         url_type = extracted_data.get('url_type', 'Facebook Content')
+        source = extracted_data.get('source', 'unknown')
         user_input_lower = user_input.lower()
                 f"**Data Source:** {source.upper()}",
                 f"**Description:** {page_info.get('description', 'No description available')}",
                 "",
+                f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks.",
                 "",
                 "**Key Content Types:**",
                 f"{', '.join(set(block['content_type'] for block in content_blocks))}",
     page_info = extracted_data['page_info']
     content_blocks = extracted_data['content_blocks']
+    url_type = extracted_data.get('url_type', 'Facebook Content')
     source = extracted_data.get('source', 'unknown')
     all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
     chunks = splitter.split_text(all_text)
     documents = [Document(page_content=chunk) for chunk in chunks]
+    return "simple", documents
 def create_chatbot(vectorstore):
     """Create conversational chatbot"""
         return "simple"  # Fallback to simple mode
 def main():
+    st.title("📘 Facebook Data Extractor - REAL DATA ATTEMPT")
+    st.markdown("**Aggressive real data extraction - No automatic demo fallback**")
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")
+    # Initialize session state
     if "extractor" not in st.session_state:
+        st.session_state.extractor = FacebookRealExtractor()  # Changed to real extractor
     if "facebook_data" not in st.session_state:
         st.session_state.facebook_data = None
     if "vectorstore" not in st.session_state:
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
     if "processing_mode" not in st.session_state:
+        st.session_state.processing_mode = "ai"
     if "last_user_input" not in st.session_state:
+        st.session_state.last_user_input = ""
     # Sidebar
     with st.sidebar:
         facebook_url = st.text_input(
             "Facebook URL",
             placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
+            help="Enter any Facebook URL for REAL data extraction"
         )
         # Quick test URLs
         st.markdown("### 🚀 Test URLs")
         test_urls = {
                 st.session_state.current_fb_url = url
                 st.rerun()
+        if st.button("🚀 EXTRACT REAL DATA", type="primary"):
             url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
             if not url_to_use:
             elif 'facebook.com' not in url_to_use:
                 st.error("❌ Please enter a valid Facebook URL")
             else:
+                with st.spinner("🔄 Aggressively extracting REAL Facebook data..."):
                     extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
                     if extracted_data.get("status") == "success":
                         st.session_state.facebook_data = extracted_data
+                        st.session_state.chatbot = "simple"
+                        st.session_state.chat_history = []
+                        st.session_state.last_user_input = ""
                         source = extracted_data.get('source', 'unknown')
+                        if source == 'real':
+                            st.success("🎉 SUCCESS: Real Facebook data extracted!")
+                            st.balloons()
                         else:
+                            st.warning("⚠️ Using fallback data - Facebook blocked real extraction")
                     else:
                         error_msg = extracted_data.get("error", "Unknown error")
                         st.error(f"❌ Extraction failed: {error_msg}")
                 st.session_state.last_user_input = ""
                 st.rerun()
+    # Main content
     st.header("📊 Extraction Results")
     if st.session_state.facebook_data:
         content_blocks = data['content_blocks']
         source = data.get('source', 'unknown')
+        if source == 'real':
+            st.success("✅ **REAL DATA** - Successfully extracted from Facebook!")
         else:
+            st.warning("📝 **FALLBACK DATA** - Facebook blocked real extraction")
         # Metrics
         col1, col2, col3 = st.columns(3)
         with col1:
             st.metric("Content Blocks", len(content_blocks))
         with col2:
+            st.metric("Data Source", "REAL" if source == 'real' else "FALLBACK")
         with col3:
+            st.metric("Status", "Success")
         # Page info
         st.subheader("🏷️ Page Information")
         st.write(f"**Title:** {page_info['title']}")
         st.write(f"**Description:** {page_info.get('description', 'No description')}")
+        st.write(f"**Access Note:** {page_info.get('access_note', 'Public content')}")
+        st.write(f"**Response Code:** {page_info.get('response_code', 'N/A')}")
         # Content samples
         st.subheader("📝 Content Analysis")
     else:
         st.info("""
+        ## 📘 Facebook Real Data Extractor
+        **Aggressive Approach - No Automatic Demo**
+        **This version:**
+        - Tries multiple extraction methods
+        - Uses rotating user agents
+        - Attempts mobile versions
+        - Tries text-only alternatives
+        - Only uses demo data as LAST RESORT
+        **Technical Challenges:**
+        - Facebook has strong anti-bot protection
+        - Requires JavaScript execution
+        - Needs session management
+        - Heavy rate limiting
+        **For your project:**
+        - Shows real technical limitations
+        - Demonstrates platform security
+        - Provides educational value
         """)
+    # Chat section
     st.markdown("---")
     st.header("💬 Analysis Chat")
         if not st.session_state.chat_history:
             st.subheader("💡 Try asking:")
             suggestions = [
+                "What is this Facebook content about?",
+                "Summarize the extracted data",
+                "What kind of information was found?",
+                "Analyze the content structure"
             ]
             cols = st.columns(len(suggestions))
     else:
         st.info("🔍 Extract Facebook data to enable analysis")
+    # CHAT INPUT
     if st.session_state.chatbot and st.session_state.facebook_data:
         user_input = st.chat_input("Ask about the Facebook data...")
         if user_input and user_input != st.session_state.last_user_input:
             st.session_state.last_user_input = user_input
             st.session_state.chat_history.append({"role": "user", "content": user_input})
             with st.spinner("🤔 Analyzing..."):
                 try:
+                    response = simple_chat_analysis(user_input, st.session_state.facebook_data)
+                    st.session_state.chat_history.append({"role": "assistant", "content": response})
                     st.rerun()
                 except Exception as e:
                     error_msg = f"Analysis Error: {str(e)}"