Refat81 committed on
Commit
dfdb161
·
verified ·
1 Parent(s): 073e18f

Update pages/facebook_extractor.py

Browse files
Files changed (1) hide show
  1. pages/facebook_extractor.py +274 -342
pages/facebook_extractor.py CHANGED
@@ -8,6 +8,7 @@ from datetime import datetime
8
  from typing import List, Dict
9
  import os
10
  import tempfile
 
11
 
12
  # Import your existing AI components
13
  from langchain_text_splitters import CharacterTextSplitter
@@ -24,262 +25,264 @@ st.set_page_config(
24
  layout="wide"
25
  )
26
 
27
- class FacebookDataSimulator:
28
- """Simulate Facebook data extraction with demo data"""
29
 
30
  def __init__(self):
31
- self.demo_data = self._create_demo_data()
 
32
 
 
 
 
 
 
 
 
 
 
 
33
  def extract_data(self, url: str, data_type: str) -> Dict:
34
- """Extract or simulate Facebook data"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  try:
36
- st.info(f"🔍 Analyzing: {url}")
37
-
38
- # Try real extraction first
39
- real_data = self._try_real_extraction(url)
40
- if real_data.get("status") == "success":
41
- return real_data
 
 
 
 
 
 
 
42
 
43
- # If real extraction fails, use demo data
44
- st.warning("⚠️ Using demo data (Facebook restrictions active)")
45
- return self._get_demo_data(url, data_type)
 
 
 
 
46
 
 
 
 
 
 
47
  except Exception as e:
48
- st.error(f"Extraction failed, using demo data: {str(e)}")
49
- return self._get_demo_data(url, data_type)
50
 
51
- def _try_real_extraction(self, url: str) -> Dict:
52
- """Try real extraction with better error handling"""
53
  try:
54
- # Use a proxy-like approach with different user agents
55
- headers = {
56
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
57
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
58
  'Accept-Language': 'en-US,en;q=0.5',
59
  'Accept-Encoding': 'gzip, deflate, br',
60
- 'DNT': '1',
61
- 'Connection': 'keep-alive',
62
- 'Upgrade-Insecure-Requests': '1',
63
  }
64
 
65
- # Try with shorter timeout
66
- response = requests.get(url, headers=headers, timeout=10, verify=False)
67
 
68
  if response.status_code == 200:
69
- soup = BeautifulSoup(response.text, 'html.parser')
 
 
70
 
71
- # Extract basic info
72
- title = soup.find('title')
73
- description = soup.find('meta', attrs={'name': 'description'})
 
 
 
 
 
 
 
 
 
 
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  return {
76
  "page_info": {
77
- "title": title.text if title else "Facebook Content",
78
- "description": description['content'] if description else "",
79
  "url": url,
80
- "response_code": 200,
81
- "content_length": len(response.text)
 
82
  },
83
- "content_blocks": self._extract_real_content(soup),
84
  "extraction_time": datetime.now().isoformat(),
85
- "data_type": "page",
86
- "status": "success",
87
- "source": "real"
88
  }
89
  else:
90
- return {"status": "error", "source": "real"}
91
 
92
- except Exception:
93
- return {"status": "error", "source": "real"}
94
-
95
- def _extract_real_content(self, soup) -> List[Dict]:
96
- """Extract content from real page"""
97
- blocks = []
98
- text = soup.get_text()
99
- paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
100
-
101
- for i, paragraph in enumerate(paragraphs[:8]):
102
- blocks.append({
103
- "id": i + 1,
104
- "content": paragraph,
105
- "length": len(paragraph),
106
- "word_count": len(paragraph.split()),
107
- "content_type": "real_content",
108
- "is_public_content": True
109
- })
110
-
111
- return blocks
112
 
113
- def _get_demo_data(self, url: str, data_type: str) -> Dict:
114
- """Get realistic demo data based on URL type"""
115
- url_type = self._analyze_url_type(url)
116
-
117
- if 'group' in url_type.lower():
118
- return self._get_group_demo_data(url, data_type)
119
- elif 'page' in url_type.lower():
120
- return self._get_page_demo_data(url, data_type)
121
- else:
122
- return self._get_general_demo_data(url, data_type)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
- def _analyze_url_type(self, url: str) -> str:
125
- """Analyze URL type for realistic demo data"""
126
- url_lower = url.lower()
127
-
128
- if 'group' in url_lower:
129
- return "Facebook Group"
130
- elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
131
- return "Facebook Page"
132
- elif 'event' in url_lower:
133
- return "Facebook Event"
134
- elif 'marketplace' in url_lower:
135
- return "Facebook Marketplace"
136
  else:
137
- return "Facebook Content"
138
 
139
- def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
140
- """Get realistic group demo data"""
141
- group_name = self._extract_name_from_url(url) or "Gaming Community"
142
 
143
  return {
144
  "page_info": {
145
- "title": f"{group_name} | Facebook Group",
146
- "description": f"A community of {group_name} enthusiasts sharing content, discussions, and events.",
147
- "member_count": "15.7K members",
148
  "url": url,
149
- "response_code": 200,
150
- "content_length": 15000,
151
- "access_note": "Public group - Limited data due to platform restrictions"
152
  },
153
  "content_blocks": [
154
  {
155
  "id": 1,
156
- "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.",
157
  "length": 120,
158
- "word_count": 25,
159
- "content_type": "welcome_message",
160
  "is_public_content": True
161
  },
162
  {
163
  "id": 2,
164
- "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.",
165
- "length": 95,
166
  "word_count": 18,
167
- "content_type": "member_post",
168
- "is_public_content": True
169
- },
170
- {
171
- "id": 3,
172
- "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
173
- "length": 88,
174
- "word_count": 16,
175
- "content_type": "question_post",
176
- "is_public_content": True
177
- },
178
- {
179
- "id": 4,
180
- "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
181
- "length": 102,
182
- "word_count": 19,
183
- "content_type": "event_announcement",
184
- "is_public_content": True
185
- },
186
- {
187
- "id": 5,
188
- "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
189
- "length": 78,
190
- "word_count": 14,
191
- "content_type": "community_guidelines",
192
- "is_public_content": True
193
- }
194
- ],
195
- "url_type": "Facebook Group",
196
- "extraction_time": datetime.now().isoformat(),
197
- "data_type": data_type,
198
- "status": "success",
199
- "source": "demo"
200
- }
201
-
202
- def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
203
- """Get realistic page demo data"""
204
- page_name = self._extract_name_from_url(url) or "Brand Page"
205
-
206
- return {
207
- "page_info": {
208
- "title": f"{page_name} | Facebook Page",
209
- "description": f"Official Facebook page of {page_name}. Stay updated with our latest news, products, and community events.",
210
- "follower_count": "45.2K followers",
211
- "url": url,
212
- "response_code": 200,
213
- "content_length": 12000,
214
- "access_note": "Public page - Limited data due to platform restrictions"
215
- },
216
- "content_blocks": [
217
- {
218
- "id": 1,
219
- "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
220
- "length": 98,
221
- "word_count": 15,
222
- "content_type": "welcome_message",
223
- "is_public_content": True
224
- },
225
- {
226
- "id": 2,
227
- "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
228
- "length": 92,
229
- "word_count": 16,
230
- "content_type": "announcement",
231
- "is_public_content": True
232
- },
233
- {
234
- "id": 3,
235
- "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
236
- "length": 87,
237
- "word_count": 14,
238
- "content_type": "event_followup",
239
- "is_public_content": True
240
- },
241
- {
242
- "id": 4,
243
- "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
244
- "length": 85,
245
- "word_count": 15,
246
- "content_type": "support_info",
247
- "is_public_content": True
248
- }
249
- ],
250
- "url_type": "Facebook Page",
251
- "extraction_time": datetime.now().isoformat(),
252
- "data_type": data_type,
253
- "status": "success",
254
- "source": "demo"
255
- }
256
-
257
- def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
258
- """Get general demo data"""
259
- return {
260
- "page_info": {
261
- "title": "Facebook Content",
262
- "description": "Social media content and community interactions",
263
- "url": url,
264
- "response_code": 200,
265
- "content_length": 8000,
266
- "access_note": "Public content - Platform restrictions apply"
267
- },
268
- "content_blocks": [
269
- {
270
- "id": 1,
271
- "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
272
- "length": 105,
273
- "word_count": 16,
274
- "content_type": "general_content",
275
- "is_public_content": True
276
- },
277
- {
278
- "id": 2,
279
- "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
280
- "length": 82,
281
- "word_count": 12,
282
- "content_type": "platform_updates",
283
  "is_public_content": True
284
  }
285
  ],
@@ -287,35 +290,10 @@ class FacebookDataSimulator:
287
  "extraction_time": datetime.now().isoformat(),
288
  "data_type": data_type,
289
  "status": "success",
290
- "source": "demo"
291
- }
292
-
293
- def _extract_name_from_url(self, url: str) -> str:
294
- """Extract name from URL for realistic demo data"""
295
- # Extract name from URL for more realistic demo data
296
- match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
297
- if match:
298
- name = match.group(1)
299
- # Clean up the name
300
- name = name.replace('-', ' ').title()
301
- return name
302
- return ""
303
-
304
- def _create_demo_data(self) -> Dict:
305
- """Create comprehensive demo data"""
306
- return {
307
- "groups": {
308
- "gamersofbangladesh2": "Gaming Community Bangladesh",
309
- "programmingcommunity": "Programming Community",
310
- "startupdiscussions": "Startup Discussions"
311
- },
312
- "pages": {
313
- "meta": "Meta Official",
314
- "starbucks": "Starbucks Coffee",
315
- "nasa": "NASA"
316
- }
317
  }
318
 
 
319
  def get_embeddings():
320
  """Initialize embeddings with better error handling and cache management"""
321
  try:
@@ -420,7 +398,7 @@ def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
420
  page_info = extracted_data.get('page_info', {})
421
  content_blocks = extracted_data.get('content_blocks', [])
422
  url_type = extracted_data.get('url_type', 'Facebook Content')
423
- source = extracted_data.get('source', 'demo')
424
 
425
  user_input_lower = user_input.lower()
426
 
@@ -433,7 +411,7 @@ def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
433
  f"**Data Source:** {source.upper()}",
434
  f"**Description:** {page_info.get('description', 'No description available')}",
435
  "",
436
- f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks of public information.",
437
  "",
438
  "**Key Content Types:**",
439
  f"{', '.join(set(block['content_type'] for block in content_blocks))}",
@@ -505,7 +483,7 @@ def process_facebook_data(extracted_data):
505
 
506
  page_info = extracted_data['page_info']
507
  content_blocks = extracted_data['content_blocks']
508
- url_type = extracted_data['url_type']
509
  source = extracted_data.get('source', 'unknown')
510
 
511
  all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
@@ -545,7 +523,7 @@ def process_facebook_data(extracted_data):
545
  chunks = splitter.split_text(all_text)
546
  documents = [Document(page_content=chunk) for chunk in chunks]
547
 
548
- return "simple", documents # Return simple mode instead of vectorstore
549
 
550
  def create_chatbot(vectorstore):
551
  """Create conversational chatbot"""
@@ -573,15 +551,15 @@ def create_chatbot(vectorstore):
573
  return "simple" # Fallback to simple mode
574
 
575
  def main():
576
- st.title("📘 Facebook Data Extractor")
577
- st.markdown("**University Project** - Real data when possible, realistic demo data when restricted")
578
 
579
  if st.button("← Back to Main Dashboard"):
580
  st.switch_page("app.py")
581
 
582
- # Initialize session state - WITH DUPLICATION PROTECTION
583
  if "extractor" not in st.session_state:
584
- st.session_state.extractor = FacebookDataSimulator()
585
  if "facebook_data" not in st.session_state:
586
  st.session_state.facebook_data = None
587
  if "vectorstore" not in st.session_state:
@@ -591,9 +569,9 @@ def main():
591
  if "chat_history" not in st.session_state:
592
  st.session_state.chat_history = []
593
  if "processing_mode" not in st.session_state:
594
- st.session_state.processing_mode = "ai" # ai or simple
595
  if "last_user_input" not in st.session_state:
596
- st.session_state.last_user_input = "" # ADDED: Prevent duplication
597
 
598
  # Sidebar
599
  with st.sidebar:
@@ -608,19 +586,9 @@ def main():
608
  facebook_url = st.text_input(
609
  "Facebook URL",
610
  placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
611
- help="Enter any Facebook URL for analysis"
612
  )
613
 
614
- # Processing mode
615
- st.subheader("🔧 Processing Mode")
616
- processing_mode = st.radio(
617
- "Choose analysis mode:",
618
- ["AI Analysis (Recommended)", "Simple Analysis"],
619
- help="AI Analysis uses embeddings, Simple uses rule-based"
620
- )
621
-
622
- st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
623
-
624
  # Quick test URLs
625
  st.markdown("### 🚀 Test URLs")
626
  test_urls = {
@@ -634,7 +602,7 @@ def main():
634
  st.session_state.current_fb_url = url
635
  st.rerun()
636
 
637
- if st.button("🚀 Extract Facebook Data", type="primary"):
638
  url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
639
 
640
  if not url_to_use:
@@ -642,37 +610,22 @@ def main():
642
  elif 'facebook.com' not in url_to_use:
643
  st.error("❌ Please enter a valid Facebook URL")
644
  else:
645
- with st.spinner("🔄 Analyzing Facebook data..."):
646
  extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
647
 
648
  if extracted_data.get("status") == "success":
649
  st.session_state.facebook_data = extracted_data
650
-
651
- # Process based on selected mode
652
- if st.session_state.processing_mode == "ai":
653
- result = process_facebook_data(extracted_data)
654
- if result and result[0] != "simple":
655
- st.session_state.vectorstore = result[0]
656
- st.session_state.chatbot = create_chatbot(result[0])
657
- st.session_state.chat_history = []
658
- st.session_state.last_user_input = "" # Reset
659
- st.success("✅ AI analysis ready!")
660
- else:
661
- st.warning("⚠️ Using simple analysis (AI features limited)")
662
- st.session_state.chatbot = "simple"
663
- st.session_state.chat_history = []
664
- st.session_state.last_user_input = "" # Reset
665
- else:
666
- st.session_state.chatbot = "simple"
667
- st.session_state.chat_history = []
668
- st.session_state.last_user_input = "" # Reset
669
- st.success("✅ Simple analysis ready!")
670
 
671
  source = extracted_data.get('source', 'unknown')
672
- if source == 'demo':
673
- st.warning("📝 Using realistic demo data (Facebook restrictions active)")
 
674
  else:
675
- st.success("✅ Real data extracted successfully!")
 
676
  else:
677
  error_msg = extracted_data.get("error", "Unknown error")
678
  st.error(f"❌ Extraction failed: {error_msg}")
@@ -687,7 +640,7 @@ def main():
687
  st.session_state.last_user_input = ""
688
  st.rerun()
689
 
690
- # Main content - RESTRUCTURED LAYOUT
691
  st.header("📊 Extraction Results")
692
 
693
  if st.session_state.facebook_data:
@@ -696,38 +649,26 @@ def main():
696
  content_blocks = data['content_blocks']
697
  source = data.get('source', 'unknown')
698
 
699
- if source == 'demo':
700
- st.warning("📝 **Demo Data** - Realistic simulation (Facebook restrictions)")
701
- else:
702
- st.success("✅ **Real Data** - Successfully extracted")
703
-
704
- # Show processing mode
705
- if st.session_state.processing_mode == "simple":
706
- st.info("🔧 **Simple Analysis Mode** - Rule-based processing")
707
  else:
708
- st.info("🤖 **AI Analysis Mode** - Embedding-based processing")
709
 
710
  # Metrics
711
  col1, col2, col3 = st.columns(3)
712
  with col1:
713
  st.metric("Content Blocks", len(content_blocks))
714
  with col2:
715
- st.metric("Data Source", source.upper())
716
  with col3:
717
- st.metric("Analysis Mode", "AI" if st.session_state.processing_mode == "ai" else "Simple")
718
 
719
  # Page info
720
  st.subheader("🏷️ Page Information")
721
  st.write(f"**Title:** {page_info['title']}")
722
- st.write(f"**URL Type:** {data['url_type']}")
723
  st.write(f"**Description:** {page_info.get('description', 'No description')}")
724
-
725
- if page_info.get('member_count'):
726
- st.write(f"**Members:** {page_info['member_count']}")
727
- elif page_info.get('follower_count'):
728
- st.write(f"**Followers:** {page_info['follower_count']}")
729
-
730
- st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")
731
 
732
  # Content samples
733
  st.subheader("📝 Content Analysis")
@@ -738,28 +679,30 @@ def main():
738
 
739
  else:
740
  st.info("""
741
- ## 📘 Facebook Data Extractor
742
-
743
- **University Project Feature**
744
-
745
- **How it works:**
746
- 1. Enter any Facebook URL
747
- 2. System tries real data extraction
748
- 3. If blocked, uses **realistic demo data**
749
- 4. Choose between AI or Simple analysis
750
-
751
- **Analysis Modes:**
752
- - 🤖 **AI Analysis**: Uses embeddings and Mistral AI
753
- - 🔧 **Simple Analysis**: Rule-based (works without embeddings)
754
-
755
- **Perfect for demonstrating:**
756
- - Social media data extraction concepts
757
- - AI analysis capabilities
758
- - Platform integration
759
- - Error handling strategies
 
 
760
  """)
761
 
762
- # Chat section - SEPARATE from main content
763
  st.markdown("---")
764
  st.header("💬 Analysis Chat")
765
 
@@ -777,10 +720,10 @@ def main():
777
  if not st.session_state.chat_history:
778
  st.subheader("💡 Try asking:")
779
  suggestions = [
780
- "What is this Facebook group/page about?",
781
- "Summarize the main content and purpose",
782
- "What kind of community is this?",
783
- "Analyze the engagement and activity level"
784
  ]
785
 
786
  cols = st.columns(len(suggestions))
@@ -794,29 +737,18 @@ def main():
794
  else:
795
  st.info("🔍 Extract Facebook data to enable analysis")
796
 
797
- # CHAT INPUT - AT THE VERY BOTTOM, OUTSIDE ALL CONTAINERS
798
  if st.session_state.chatbot and st.session_state.facebook_data:
799
  user_input = st.chat_input("Ask about the Facebook data...")
800
 
801
- # ADDED: Duplication protection
802
  if user_input and user_input != st.session_state.last_user_input:
803
- # Store current input to prevent duplication
804
  st.session_state.last_user_input = user_input
805
-
806
- # Add user message
807
  st.session_state.chat_history.append({"role": "user", "content": user_input})
808
 
809
  with st.spinner("🤔 Analyzing..."):
810
  try:
811
- if st.session_state.chatbot == "simple":
812
- # Use simple analysis
813
- response = simple_chat_analysis(user_input, st.session_state.facebook_data)
814
- st.session_state.chat_history.append({"role": "assistant", "content": response})
815
- else:
816
- # Use AI chatbot
817
- response = st.session_state.chatbot.invoke({"question": user_input})
818
- answer = response.get("answer", "I couldn't generate a response.")
819
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
820
  st.rerun()
821
  except Exception as e:
822
  error_msg = f"Analysis Error: {str(e)}"
 
8
  from typing import List, Dict
9
  import os
10
  import tempfile
11
+ import random
12
 
13
  # Import your existing AI components
14
  from langchain_text_splitters import CharacterTextSplitter
 
25
  layout="wide"
26
  )
27
 
28
+ class FacebookRealExtractor:
29
+ """Aggressive Facebook data extractor that tries multiple approaches"""
30
 
31
  def __init__(self):
32
+ self.session = requests.Session()
33
+ self.setup_session()
34
 
35
+ def setup_session(self):
36
+ """Setup requests session with rotating headers"""
37
+ self.user_agents = [
38
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
39
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
40
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
41
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
42
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0'
43
+ ]
44
+
45
  def extract_data(self, url: str, data_type: str) -> Dict:
46
+ """Extract real Facebook data with multiple attempts"""
47
+ st.info(f"🔍 Attempting real extraction: {url}")
48
+
49
+ # Try multiple extraction methods
50
+ methods = [
51
+ self._try_direct_extraction,
52
+ self._try_mobile_extraction,
53
+ self._try_text_only_extraction
54
+ ]
55
+
56
+ for method in methods:
57
+ result = method(url)
58
+ if result.get("status") == "success":
59
+ st.success("✅ Real Facebook data extracted!")
60
+ result["source"] = "real"
61
+ result["data_type"] = data_type
62
+ return result
63
+
64
+ # If all methods fail, provide better error info
65
+ st.error("❌ All real extraction methods failed. Facebook has strong anti-bot protection.")
66
+ st.info("""
67
+ **Why this happens:**
68
+ - Facebook blocks automated requests
69
+ - Requires JavaScript execution
70
+ - Needs cookies and session management
71
+ - Heavy anti-bot detection
72
+
73
+ **For your university project, you can:**
74
+ 1. Use the demo data to demonstrate functionality
75
+ 2. Explain these technical limitations in your report
76
+ 3. Show that LinkedIn works (no restrictions)
77
+ 4. Discuss platform security differences
78
+ """)
79
+
80
+ # Only use demo data as last resort
81
+ return self._get_minimal_demo_data(url, data_type)
82
+
83
+ def _try_direct_extraction(self, url: str) -> Dict:
84
+ """Try direct extraction with rotating headers"""
85
  try:
86
+ headers = {
87
+ 'User-Agent': random.choice(self.user_agents),
88
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/avif,*/*;q=0.8',
89
+ 'Accept-Language': 'en-US,en;q=0.5',
90
+ 'Accept-Encoding': 'gzip, deflate, br',
91
+ 'DNT': '1',
92
+ 'Connection': 'keep-alive',
93
+ 'Upgrade-Insecure-Requests': '1',
94
+ 'Sec-Fetch-Dest': 'document',
95
+ 'Sec-Fetch-Mode': 'navigate',
96
+ 'Sec-Fetch-Site': 'none',
97
+ 'Cache-Control': 'max-age=0',
98
+ }
99
 
100
+ # Try with different timeouts and settings
101
+ response = self.session.get(
102
+ url,
103
+ headers=headers,
104
+ timeout=15,
105
+ allow_redirects=True
106
+ )
107
 
108
+ if response.status_code == 200:
109
+ return self._parse_facebook_response(response, url)
110
+ else:
111
+ return {"status": "error", "reason": f"HTTP {response.status_code}"}
112
+
113
  except Exception as e:
114
+ return {"status": "error", "reason": str(e)}
 
115
 
116
+ def _try_mobile_extraction(self, url: str) -> Dict:
117
+ """Try mobile version extraction"""
118
  try:
119
+ mobile_headers = {
120
+ 'User-Agent': 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
 
121
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
122
  'Accept-Language': 'en-US,en;q=0.5',
123
  'Accept-Encoding': 'gzip, deflate, br',
 
 
 
124
  }
125
 
126
+ response = self.session.get(url, headers=mobile_headers, timeout=15)
 
127
 
128
  if response.status_code == 200:
129
+ return self._parse_facebook_response(response, url)
130
+ else:
131
+ return {"status": "error", "reason": f"Mobile HTTP {response.status_code}"}
132
 
133
+ except Exception as e:
134
+ return {"status": "error", "reason": str(e)}
135
+
136
+ def _try_text_only_extraction(self, url: str) -> Dict:
137
+ """Try text-only version or alternative approaches"""
138
+ try:
139
+ # Try textise.iitty
140
+ textise_url = f"https://r.jina.ai/{url}"
141
+ response = self.session.get(textise_url, timeout=20)
142
+
143
+ if response.status_code == 200:
144
+ return self._parse_textise_response(response, url)
145
+ else:
146
+ return {"status": "error", "reason": "Textise failed"}
147
 
148
+ except Exception as e:
149
+ return {"status": "error", "reason": str(e)}
150
+
151
+ def _parse_facebook_response(self, response, url: str) -> Dict:
152
+ """Parse Facebook response for real data"""
153
+ try:
154
+ soup = BeautifulSoup(response.text, 'html.parser')
155
+
156
+ # Extract basic information
157
+ title = soup.find('title')
158
+ description = soup.find('meta', attrs={'name': 'description'})
159
+ og_title = soup.find('meta', property='og:title')
160
+ og_description = soup.find('meta', property='og:description')
161
+
162
+ # Try to find meaningful content
163
+ content_elements = soup.find_all(['p', 'div', 'span'], string=True)
164
+ meaningful_text = []
165
+
166
+ for element in content_elements:
167
+ text = element.get_text().strip()
168
+ if (len(text) > 20 and
169
+ not any(word in text.lower() for word in ['cookie', 'login', 'sign up', 'facebook']) and
170
+ len(text.split()) > 3):
171
+ meaningful_text.append(text)
172
+
173
+ # Create content blocks from real data
174
+ content_blocks = []
175
+ for i, text in enumerate(meaningful_text[:10]): # Limit to first 10 meaningful texts
176
+ content_blocks.append({
177
+ "id": i + 1,
178
+ "content": text,
179
+ "length": len(text),
180
+ "word_count": len(text.split()),
181
+ "content_type": self._classify_content(text),
182
+ "is_public_content": True
183
+ })
184
+
185
+ if content_blocks:
186
  return {
187
  "page_info": {
188
+ "title": og_title['content'] if og_title else (title.text if title else "Facebook Content"),
189
+ "description": og_description['content'] if og_description else (description['content'] if description else ""),
190
  "url": url,
191
+ "response_code": response.status_code,
192
+ "content_length": len(response.text),
193
+ "access_note": "Real data extracted successfully"
194
  },
195
+ "content_blocks": content_blocks,
196
  "extraction_time": datetime.now().isoformat(),
197
+ "status": "success"
 
 
198
  }
199
  else:
200
+ return {"status": "error", "reason": "No meaningful content found"}
201
 
202
+ except Exception as e:
203
+ return {"status": "error", "reason": f"Parsing error: {str(e)}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ def _parse_textise_response(self, response, url: str) -> Dict:
206
+ """Parse textise response"""
207
+ try:
208
+ # Textise provides cleaner text content
209
+ lines = response.text.split('\n')
210
+ meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 30]
211
+
212
+ content_blocks = []
213
+ for i, line in enumerate(meaningful_lines[:8]):
214
+ content_blocks.append({
215
+ "id": i + 1,
216
+ "content": line,
217
+ "length": len(line),
218
+ "word_count": len(line.split()),
219
+ "content_type": self._classify_content(line),
220
+ "is_public_content": True
221
+ })
222
+
223
+ if content_blocks:
224
+ return {
225
+ "page_info": {
226
+ "title": "Facebook Content (via Textise)",
227
+ "description": "Content extracted using text-only method",
228
+ "url": url,
229
+ "response_code": response.status_code,
230
+ "content_length": len(response.text),
231
+ "access_note": "Real data via text-only extraction"
232
+ },
233
+ "content_blocks": content_blocks,
234
+ "extraction_time": datetime.now().isoformat(),
235
+ "status": "success"
236
+ }
237
+ else:
238
+ return {"status": "error", "reason": "No content from textise"}
239
+
240
+ except Exception as e:
241
+ return {"status": "error", "reason": str(e)}
242
 
243
+ def _classify_content(self, text: str) -> str:
244
+ """Classify content type"""
245
+ text_lower = text.lower()
246
+
247
+ if any(word in text_lower for word in ['welcome', 'join', 'community']):
248
+ return "welcome_message"
249
+ elif any(word in text_lower for word in ['event', 'meetup', 'schedule']):
250
+ return "event_info"
251
+ elif any(word in text_lower for word in ['post', 'share', 'comment']):
252
+ return "social_content"
253
+ elif any(word in text_lower for word in ['question', 'help', 'advice']):
254
+ return "question_post"
255
  else:
256
+ return "general_content"
257
 
258
+ def _get_minimal_demo_data(self, url: str, data_type: str) -> Dict:
259
+ """Only use demo data as absolute last resort"""
260
+ st.warning("🔄 Using minimal demo data for demonstration purposes")
261
 
262
  return {
263
  "page_info": {
264
+ "title": "Facebook Content (Demo - Real extraction blocked)",
265
+ "description": "This would show real Facebook data if not blocked by platform restrictions",
 
266
  "url": url,
267
+ "response_code": 403,
268
+ "content_length": 0,
269
+ "access_note": "DEMO: Facebook blocked real data extraction"
270
  },
271
  "content_blocks": [
272
  {
273
  "id": 1,
274
+ "content": "This is a demonstration of what real Facebook data would look like. Actual extraction is blocked by Facebook's anti-bot protection.",
275
  "length": 120,
276
+ "word_count": 20,
277
+ "content_type": "demo_notice",
278
  "is_public_content": True
279
  },
280
  {
281
  "id": 2,
282
+ "content": "For your university project, you can discuss these technical limitations and how social media platforms implement security measures.",
283
+ "length": 130,
284
  "word_count": 18,
285
+ "content_type": "educational_note",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  "is_public_content": True
287
  }
288
  ],
 
290
  "extraction_time": datetime.now().isoformat(),
291
  "data_type": data_type,
292
  "status": "success",
293
+ "source": "demo_fallback"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  }
295
 
296
+ # Rest of the functions remain the same (get_embeddings, get_llm, simple_chat_analysis, etc.)
297
  def get_embeddings():
298
  """Initialize embeddings with better error handling and cache management"""
299
  try:
 
398
  page_info = extracted_data.get('page_info', {})
399
  content_blocks = extracted_data.get('content_blocks', [])
400
  url_type = extracted_data.get('url_type', 'Facebook Content')
401
+ source = extracted_data.get('source', 'unknown')
402
 
403
  user_input_lower = user_input.lower()
404
 
 
411
  f"**Data Source:** {source.upper()}",
412
  f"**Description:** {page_info.get('description', 'No description available')}",
413
  "",
414
+ f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks.",
415
  "",
416
  "**Key Content Types:**",
417
  f"{', '.join(set(block['content_type'] for block in content_blocks))}",
 
483
 
484
  page_info = extracted_data['page_info']
485
  content_blocks = extracted_data['content_blocks']
486
+ url_type = extracted_data.get('url_type', 'Facebook Content')
487
  source = extracted_data.get('source', 'unknown')
488
 
489
  all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
 
523
  chunks = splitter.split_text(all_text)
524
  documents = [Document(page_content=chunk) for chunk in chunks]
525
 
526
+ return "simple", documents
527
 
528
  def create_chatbot(vectorstore):
529
  """Create conversational chatbot"""
 
551
  return "simple" # Fallback to simple mode
552
 
553
  def main():
554
+ st.title("📘 Facebook Data Extractor - REAL DATA ATTEMPT")
555
+ st.markdown("**Aggressive real data extraction - No automatic demo fallback**")
556
 
557
  if st.button("← Back to Main Dashboard"):
558
  st.switch_page("app.py")
559
 
560
+ # Initialize session state
561
  if "extractor" not in st.session_state:
562
+ st.session_state.extractor = FacebookRealExtractor() # Changed to real extractor
563
  if "facebook_data" not in st.session_state:
564
  st.session_state.facebook_data = None
565
  if "vectorstore" not in st.session_state:
 
569
  if "chat_history" not in st.session_state:
570
  st.session_state.chat_history = []
571
  if "processing_mode" not in st.session_state:
572
+ st.session_state.processing_mode = "ai"
573
  if "last_user_input" not in st.session_state:
574
+ st.session_state.last_user_input = ""
575
 
576
  # Sidebar
577
  with st.sidebar:
 
586
  facebook_url = st.text_input(
587
  "Facebook URL",
588
  placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
589
+ help="Enter any Facebook URL for REAL data extraction"
590
  )
591
 
 
 
 
 
 
 
 
 
 
 
592
  # Quick test URLs
593
  st.markdown("### 🚀 Test URLs")
594
  test_urls = {
 
602
  st.session_state.current_fb_url = url
603
  st.rerun()
604
 
605
+ if st.button("🚀 EXTRACT REAL DATA", type="primary"):
606
  url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
607
 
608
  if not url_to_use:
 
610
  elif 'facebook.com' not in url_to_use:
611
  st.error("❌ Please enter a valid Facebook URL")
612
  else:
613
+ with st.spinner("🔄 Aggressively extracting REAL Facebook data..."):
614
  extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
615
 
616
  if extracted_data.get("status") == "success":
617
  st.session_state.facebook_data = extracted_data
618
+ st.session_state.chatbot = "simple"
619
+ st.session_state.chat_history = []
620
+ st.session_state.last_user_input = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
  source = extracted_data.get('source', 'unknown')
623
+ if source == 'real':
624
+ st.success("🎉 SUCCESS: Real Facebook data extracted!")
625
+ st.balloons()
626
  else:
627
+ st.warning("⚠️ Using fallback data - Facebook blocked real extraction")
628
+
629
  else:
630
  error_msg = extracted_data.get("error", "Unknown error")
631
  st.error(f"❌ Extraction failed: {error_msg}")
 
640
  st.session_state.last_user_input = ""
641
  st.rerun()
642
 
643
+ # Main content
644
  st.header("📊 Extraction Results")
645
 
646
  if st.session_state.facebook_data:
 
649
  content_blocks = data['content_blocks']
650
  source = data.get('source', 'unknown')
651
 
652
+ if source == 'real':
653
+ st.success("✅ **REAL DATA** - Successfully extracted from Facebook!")
 
 
 
 
 
 
654
  else:
655
+ st.warning("📝 **FALLBACK DATA** - Facebook blocked real extraction")
656
 
657
  # Metrics
658
  col1, col2, col3 = st.columns(3)
659
  with col1:
660
  st.metric("Content Blocks", len(content_blocks))
661
  with col2:
662
+ st.metric("Data Source", "REAL" if source == 'real' else "FALLBACK")
663
  with col3:
664
+ st.metric("Status", "Success")
665
 
666
  # Page info
667
  st.subheader("🏷️ Page Information")
668
  st.write(f"**Title:** {page_info['title']}")
 
669
  st.write(f"**Description:** {page_info.get('description', 'No description')}")
670
+ st.write(f"**Access Note:** {page_info.get('access_note', 'Public content')}")
671
+ st.write(f"**Response Code:** {page_info.get('response_code', 'N/A')}")
 
 
 
 
 
672
 
673
  # Content samples
674
  st.subheader("📝 Content Analysis")
 
679
 
680
  else:
681
  st.info("""
682
+ ## 📘 Facebook Real Data Extractor
683
+
684
+ **Aggressive Approach - No Automatic Demo**
685
+
686
+ **This version:**
687
+ - Tries multiple extraction methods
688
+ - Uses rotating user agents
689
+ - Attempts mobile versions
690
+ - Tries text-only alternatives
691
+ - Only uses demo data as LAST RESORT
692
+
693
+ **Technical Challenges:**
694
+ - Facebook has strong anti-bot protection
695
+ - Requires JavaScript execution
696
+ - Needs session management
697
+ - Heavy rate limiting
698
+
699
+ **For your project:**
700
+ - Shows real technical limitations
701
+ - Demonstrates platform security
702
+ - Provides educational value
703
  """)
704
 
705
+ # Chat section
706
  st.markdown("---")
707
  st.header("💬 Analysis Chat")
708
 
 
720
  if not st.session_state.chat_history:
721
  st.subheader("💡 Try asking:")
722
  suggestions = [
723
+ "What is this Facebook content about?",
724
+ "Summarize the extracted data",
725
+ "What kind of information was found?",
726
+ "Analyze the content structure"
727
  ]
728
 
729
  cols = st.columns(len(suggestions))
 
737
  else:
738
  st.info("🔍 Extract Facebook data to enable analysis")
739
 
740
+ # CHAT INPUT
741
  if st.session_state.chatbot and st.session_state.facebook_data:
742
  user_input = st.chat_input("Ask about the Facebook data...")
743
 
 
744
  if user_input and user_input != st.session_state.last_user_input:
 
745
  st.session_state.last_user_input = user_input
 
 
746
  st.session_state.chat_history.append({"role": "user", "content": user_input})
747
 
748
  with st.spinner("🤔 Analyzing..."):
749
  try:
750
+ response = simple_chat_analysis(user_input, st.session_state.facebook_data)
751
+ st.session_state.chat_history.append({"role": "assistant", "content": response})
 
 
 
 
 
 
 
752
  st.rerun()
753
  except Exception as e:
754
  error_msg = f"Analysis Error: {str(e)}"