Update pages/facebook_extractor.py
Browse files- pages/facebook_extractor.py +274 -342
pages/facebook_extractor.py
CHANGED
|
@@ -8,6 +8,7 @@ from datetime import datetime
|
|
| 8 |
from typing import List, Dict
|
| 9 |
import os
|
| 10 |
import tempfile
|
|
|
|
| 11 |
|
| 12 |
# Import your existing AI components
|
| 13 |
from langchain_text_splitters import CharacterTextSplitter
|
|
@@ -24,262 +25,264 @@ st.set_page_config(
|
|
| 24 |
layout="wide"
|
| 25 |
)
|
| 26 |
|
| 27 |
-
class
|
| 28 |
-
"""
|
| 29 |
|
| 30 |
def __init__(self):
|
| 31 |
-
self.
|
|
|
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def extract_data(self, url: str, data_type: str) -> Dict:
|
| 34 |
-
"""Extract
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
try:
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
except Exception as e:
|
| 48 |
-
|
| 49 |
-
return self._get_demo_data(url, data_type)
|
| 50 |
|
| 51 |
-
def
|
| 52 |
-
"""Try
|
| 53 |
try:
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 57 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 58 |
'Accept-Language': 'en-US,en;q=0.5',
|
| 59 |
'Accept-Encoding': 'gzip, deflate, br',
|
| 60 |
-
'DNT': '1',
|
| 61 |
-
'Connection': 'keep-alive',
|
| 62 |
-
'Upgrade-Insecure-Requests': '1',
|
| 63 |
}
|
| 64 |
|
| 65 |
-
|
| 66 |
-
response = requests.get(url, headers=headers, timeout=10, verify=False)
|
| 67 |
|
| 68 |
if response.status_code == 200:
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
return {
|
| 76 |
"page_info": {
|
| 77 |
-
"title": title.text if title else "Facebook Content",
|
| 78 |
-
"description": description['content'] if description else "",
|
| 79 |
"url": url,
|
| 80 |
-
"response_code":
|
| 81 |
-
"content_length": len(response.text)
|
|
|
|
| 82 |
},
|
| 83 |
-
"content_blocks":
|
| 84 |
"extraction_time": datetime.now().isoformat(),
|
| 85 |
-
"
|
| 86 |
-
"status": "success",
|
| 87 |
-
"source": "real"
|
| 88 |
}
|
| 89 |
else:
|
| 90 |
-
return {"status": "error", "
|
| 91 |
|
| 92 |
-
except Exception:
|
| 93 |
-
return {"status": "error", "
|
| 94 |
-
|
| 95 |
-
def _extract_real_content(self, soup) -> List[Dict]:
|
| 96 |
-
"""Extract content from real page"""
|
| 97 |
-
blocks = []
|
| 98 |
-
text = soup.get_text()
|
| 99 |
-
paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
|
| 100 |
-
|
| 101 |
-
for i, paragraph in enumerate(paragraphs[:8]):
|
| 102 |
-
blocks.append({
|
| 103 |
-
"id": i + 1,
|
| 104 |
-
"content": paragraph,
|
| 105 |
-
"length": len(paragraph),
|
| 106 |
-
"word_count": len(paragraph.split()),
|
| 107 |
-
"content_type": "real_content",
|
| 108 |
-
"is_public_content": True
|
| 109 |
-
})
|
| 110 |
-
|
| 111 |
-
return blocks
|
| 112 |
|
| 113 |
-
def
|
| 114 |
-
"""
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
def
|
| 125 |
-
"""
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
if '
|
| 129 |
-
return "
|
| 130 |
-
elif
|
| 131 |
-
return "
|
| 132 |
-
elif '
|
| 133 |
-
return "
|
| 134 |
-
elif '
|
| 135 |
-
return "
|
| 136 |
else:
|
| 137 |
-
return "
|
| 138 |
|
| 139 |
-
def
|
| 140 |
-
"""
|
| 141 |
-
|
| 142 |
|
| 143 |
return {
|
| 144 |
"page_info": {
|
| 145 |
-
"title":
|
| 146 |
-
"description":
|
| 147 |
-
"member_count": "15.7K members",
|
| 148 |
"url": url,
|
| 149 |
-
"response_code":
|
| 150 |
-
"content_length":
|
| 151 |
-
"access_note": "
|
| 152 |
},
|
| 153 |
"content_blocks": [
|
| 154 |
{
|
| 155 |
"id": 1,
|
| 156 |
-
"content":
|
| 157 |
"length": 120,
|
| 158 |
-
"word_count":
|
| 159 |
-
"content_type": "
|
| 160 |
"is_public_content": True
|
| 161 |
},
|
| 162 |
{
|
| 163 |
"id": 2,
|
| 164 |
-
"content": "
|
| 165 |
-
"length":
|
| 166 |
"word_count": 18,
|
| 167 |
-
"content_type": "
|
| 168 |
-
"is_public_content": True
|
| 169 |
-
},
|
| 170 |
-
{
|
| 171 |
-
"id": 3,
|
| 172 |
-
"content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
|
| 173 |
-
"length": 88,
|
| 174 |
-
"word_count": 16,
|
| 175 |
-
"content_type": "question_post",
|
| 176 |
-
"is_public_content": True
|
| 177 |
-
},
|
| 178 |
-
{
|
| 179 |
-
"id": 4,
|
| 180 |
-
"content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
|
| 181 |
-
"length": 102,
|
| 182 |
-
"word_count": 19,
|
| 183 |
-
"content_type": "event_announcement",
|
| 184 |
-
"is_public_content": True
|
| 185 |
-
},
|
| 186 |
-
{
|
| 187 |
-
"id": 5,
|
| 188 |
-
"content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
|
| 189 |
-
"length": 78,
|
| 190 |
-
"word_count": 14,
|
| 191 |
-
"content_type": "community_guidelines",
|
| 192 |
-
"is_public_content": True
|
| 193 |
-
}
|
| 194 |
-
],
|
| 195 |
-
"url_type": "Facebook Group",
|
| 196 |
-
"extraction_time": datetime.now().isoformat(),
|
| 197 |
-
"data_type": data_type,
|
| 198 |
-
"status": "success",
|
| 199 |
-
"source": "demo"
|
| 200 |
-
}
|
| 201 |
-
|
| 202 |
-
def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
|
| 203 |
-
"""Get realistic page demo data"""
|
| 204 |
-
page_name = self._extract_name_from_url(url) or "Brand Page"
|
| 205 |
-
|
| 206 |
-
return {
|
| 207 |
-
"page_info": {
|
| 208 |
-
"title": f"{page_name} | Facebook Page",
|
| 209 |
-
"description": f"Official Facebook page of {page_name}. Stay updated with our latest news, products, and community events.",
|
| 210 |
-
"follower_count": "45.2K followers",
|
| 211 |
-
"url": url,
|
| 212 |
-
"response_code": 200,
|
| 213 |
-
"content_length": 12000,
|
| 214 |
-
"access_note": "Public page - Limited data due to platform restrictions"
|
| 215 |
-
},
|
| 216 |
-
"content_blocks": [
|
| 217 |
-
{
|
| 218 |
-
"id": 1,
|
| 219 |
-
"content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
|
| 220 |
-
"length": 98,
|
| 221 |
-
"word_count": 15,
|
| 222 |
-
"content_type": "welcome_message",
|
| 223 |
-
"is_public_content": True
|
| 224 |
-
},
|
| 225 |
-
{
|
| 226 |
-
"id": 2,
|
| 227 |
-
"content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
|
| 228 |
-
"length": 92,
|
| 229 |
-
"word_count": 16,
|
| 230 |
-
"content_type": "announcement",
|
| 231 |
-
"is_public_content": True
|
| 232 |
-
},
|
| 233 |
-
{
|
| 234 |
-
"id": 3,
|
| 235 |
-
"content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
|
| 236 |
-
"length": 87,
|
| 237 |
-
"word_count": 14,
|
| 238 |
-
"content_type": "event_followup",
|
| 239 |
-
"is_public_content": True
|
| 240 |
-
},
|
| 241 |
-
{
|
| 242 |
-
"id": 4,
|
| 243 |
-
"content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
|
| 244 |
-
"length": 85,
|
| 245 |
-
"word_count": 15,
|
| 246 |
-
"content_type": "support_info",
|
| 247 |
-
"is_public_content": True
|
| 248 |
-
}
|
| 249 |
-
],
|
| 250 |
-
"url_type": "Facebook Page",
|
| 251 |
-
"extraction_time": datetime.now().isoformat(),
|
| 252 |
-
"data_type": data_type,
|
| 253 |
-
"status": "success",
|
| 254 |
-
"source": "demo"
|
| 255 |
-
}
|
| 256 |
-
|
| 257 |
-
def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
|
| 258 |
-
"""Get general demo data"""
|
| 259 |
-
return {
|
| 260 |
-
"page_info": {
|
| 261 |
-
"title": "Facebook Content",
|
| 262 |
-
"description": "Social media content and community interactions",
|
| 263 |
-
"url": url,
|
| 264 |
-
"response_code": 200,
|
| 265 |
-
"content_length": 8000,
|
| 266 |
-
"access_note": "Public content - Platform restrictions apply"
|
| 267 |
-
},
|
| 268 |
-
"content_blocks": [
|
| 269 |
-
{
|
| 270 |
-
"id": 1,
|
| 271 |
-
"content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
|
| 272 |
-
"length": 105,
|
| 273 |
-
"word_count": 16,
|
| 274 |
-
"content_type": "general_content",
|
| 275 |
-
"is_public_content": True
|
| 276 |
-
},
|
| 277 |
-
{
|
| 278 |
-
"id": 2,
|
| 279 |
-
"content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
|
| 280 |
-
"length": 82,
|
| 281 |
-
"word_count": 12,
|
| 282 |
-
"content_type": "platform_updates",
|
| 283 |
"is_public_content": True
|
| 284 |
}
|
| 285 |
],
|
|
@@ -287,35 +290,10 @@ class FacebookDataSimulator:
|
|
| 287 |
"extraction_time": datetime.now().isoformat(),
|
| 288 |
"data_type": data_type,
|
| 289 |
"status": "success",
|
| 290 |
-
"source": "
|
| 291 |
-
}
|
| 292 |
-
|
| 293 |
-
def _extract_name_from_url(self, url: str) -> str:
|
| 294 |
-
"""Extract name from URL for realistic demo data"""
|
| 295 |
-
# Extract name from URL for more realistic demo data
|
| 296 |
-
match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
|
| 297 |
-
if match:
|
| 298 |
-
name = match.group(1)
|
| 299 |
-
# Clean up the name
|
| 300 |
-
name = name.replace('-', ' ').title()
|
| 301 |
-
return name
|
| 302 |
-
return ""
|
| 303 |
-
|
| 304 |
-
def _create_demo_data(self) -> Dict:
|
| 305 |
-
"""Create comprehensive demo data"""
|
| 306 |
-
return {
|
| 307 |
-
"groups": {
|
| 308 |
-
"gamersofbangladesh2": "Gaming Community Bangladesh",
|
| 309 |
-
"programmingcommunity": "Programming Community",
|
| 310 |
-
"startupdiscussions": "Startup Discussions"
|
| 311 |
-
},
|
| 312 |
-
"pages": {
|
| 313 |
-
"meta": "Meta Official",
|
| 314 |
-
"starbucks": "Starbucks Coffee",
|
| 315 |
-
"nasa": "NASA"
|
| 316 |
-
}
|
| 317 |
}
|
| 318 |
|
|
|
|
| 319 |
def get_embeddings():
|
| 320 |
"""Initialize embeddings with better error handling and cache management"""
|
| 321 |
try:
|
|
@@ -420,7 +398,7 @@ def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
|
|
| 420 |
page_info = extracted_data.get('page_info', {})
|
| 421 |
content_blocks = extracted_data.get('content_blocks', [])
|
| 422 |
url_type = extracted_data.get('url_type', 'Facebook Content')
|
| 423 |
-
source = extracted_data.get('source', '
|
| 424 |
|
| 425 |
user_input_lower = user_input.lower()
|
| 426 |
|
|
@@ -433,7 +411,7 @@ def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
|
|
| 433 |
f"**Data Source:** {source.upper()}",
|
| 434 |
f"**Description:** {page_info.get('description', 'No description available')}",
|
| 435 |
"",
|
| 436 |
-
f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks
|
| 437 |
"",
|
| 438 |
"**Key Content Types:**",
|
| 439 |
f"{', '.join(set(block['content_type'] for block in content_blocks))}",
|
|
@@ -505,7 +483,7 @@ def process_facebook_data(extracted_data):
|
|
| 505 |
|
| 506 |
page_info = extracted_data['page_info']
|
| 507 |
content_blocks = extracted_data['content_blocks']
|
| 508 |
-
url_type = extracted_data
|
| 509 |
source = extracted_data.get('source', 'unknown')
|
| 510 |
|
| 511 |
all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
|
|
@@ -545,7 +523,7 @@ def process_facebook_data(extracted_data):
|
|
| 545 |
chunks = splitter.split_text(all_text)
|
| 546 |
documents = [Document(page_content=chunk) for chunk in chunks]
|
| 547 |
|
| 548 |
-
return "simple", documents
|
| 549 |
|
| 550 |
def create_chatbot(vectorstore):
|
| 551 |
"""Create conversational chatbot"""
|
|
@@ -573,15 +551,15 @@ def create_chatbot(vectorstore):
|
|
| 573 |
return "simple" # Fallback to simple mode
|
| 574 |
|
| 575 |
def main():
|
| 576 |
-
st.title("π Facebook Data Extractor")
|
| 577 |
-
st.markdown("**
|
| 578 |
|
| 579 |
if st.button("β Back to Main Dashboard"):
|
| 580 |
st.switch_page("app.py")
|
| 581 |
|
| 582 |
-
# Initialize session state
|
| 583 |
if "extractor" not in st.session_state:
|
| 584 |
-
st.session_state.extractor =
|
| 585 |
if "facebook_data" not in st.session_state:
|
| 586 |
st.session_state.facebook_data = None
|
| 587 |
if "vectorstore" not in st.session_state:
|
|
@@ -591,9 +569,9 @@ def main():
|
|
| 591 |
if "chat_history" not in st.session_state:
|
| 592 |
st.session_state.chat_history = []
|
| 593 |
if "processing_mode" not in st.session_state:
|
| 594 |
-
st.session_state.processing_mode = "ai"
|
| 595 |
if "last_user_input" not in st.session_state:
|
| 596 |
-
st.session_state.last_user_input = ""
|
| 597 |
|
| 598 |
# Sidebar
|
| 599 |
with st.sidebar:
|
|
@@ -608,19 +586,9 @@ def main():
|
|
| 608 |
facebook_url = st.text_input(
|
| 609 |
"Facebook URL",
|
| 610 |
placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
|
| 611 |
-
help="Enter any Facebook URL for
|
| 612 |
)
|
| 613 |
|
| 614 |
-
# Processing mode
|
| 615 |
-
st.subheader("π§ Processing Mode")
|
| 616 |
-
processing_mode = st.radio(
|
| 617 |
-
"Choose analysis mode:",
|
| 618 |
-
["AI Analysis (Recommended)", "Simple Analysis"],
|
| 619 |
-
help="AI Analysis uses embeddings, Simple uses rule-based"
|
| 620 |
-
)
|
| 621 |
-
|
| 622 |
-
st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
|
| 623 |
-
|
| 624 |
# Quick test URLs
|
| 625 |
st.markdown("### π Test URLs")
|
| 626 |
test_urls = {
|
|
@@ -634,7 +602,7 @@ def main():
|
|
| 634 |
st.session_state.current_fb_url = url
|
| 635 |
st.rerun()
|
| 636 |
|
| 637 |
-
if st.button("π
|
| 638 |
url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
|
| 639 |
|
| 640 |
if not url_to_use:
|
|
@@ -642,37 +610,22 @@ def main():
|
|
| 642 |
elif 'facebook.com' not in url_to_use:
|
| 643 |
st.error("β Please enter a valid Facebook URL")
|
| 644 |
else:
|
| 645 |
-
with st.spinner("π
|
| 646 |
extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
|
| 647 |
|
| 648 |
if extracted_data.get("status") == "success":
|
| 649 |
st.session_state.facebook_data = extracted_data
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
result = process_facebook_data(extracted_data)
|
| 654 |
-
if result and result[0] != "simple":
|
| 655 |
-
st.session_state.vectorstore = result[0]
|
| 656 |
-
st.session_state.chatbot = create_chatbot(result[0])
|
| 657 |
-
st.session_state.chat_history = []
|
| 658 |
-
st.session_state.last_user_input = "" # Reset
|
| 659 |
-
st.success("β
AI analysis ready!")
|
| 660 |
-
else:
|
| 661 |
-
st.warning("β οΈ Using simple analysis (AI features limited)")
|
| 662 |
-
st.session_state.chatbot = "simple"
|
| 663 |
-
st.session_state.chat_history = []
|
| 664 |
-
st.session_state.last_user_input = "" # Reset
|
| 665 |
-
else:
|
| 666 |
-
st.session_state.chatbot = "simple"
|
| 667 |
-
st.session_state.chat_history = []
|
| 668 |
-
st.session_state.last_user_input = "" # Reset
|
| 669 |
-
st.success("β
Simple analysis ready!")
|
| 670 |
|
| 671 |
source = extracted_data.get('source', 'unknown')
|
| 672 |
-
if source == '
|
| 673 |
-
st.
|
|
|
|
| 674 |
else:
|
| 675 |
-
st.
|
|
|
|
| 676 |
else:
|
| 677 |
error_msg = extracted_data.get("error", "Unknown error")
|
| 678 |
st.error(f"β Extraction failed: {error_msg}")
|
|
@@ -687,7 +640,7 @@ def main():
|
|
| 687 |
st.session_state.last_user_input = ""
|
| 688 |
st.rerun()
|
| 689 |
|
| 690 |
-
# Main content
|
| 691 |
st.header("π Extraction Results")
|
| 692 |
|
| 693 |
if st.session_state.facebook_data:
|
|
@@ -696,38 +649,26 @@ def main():
|
|
| 696 |
content_blocks = data['content_blocks']
|
| 697 |
source = data.get('source', 'unknown')
|
| 698 |
|
| 699 |
-
if source == '
|
| 700 |
-
st.
|
| 701 |
-
else:
|
| 702 |
-
st.success("β
**Real Data** - Successfully extracted")
|
| 703 |
-
|
| 704 |
-
# Show processing mode
|
| 705 |
-
if st.session_state.processing_mode == "simple":
|
| 706 |
-
st.info("π§ **Simple Analysis Mode** - Rule-based processing")
|
| 707 |
else:
|
| 708 |
-
st.
|
| 709 |
|
| 710 |
# Metrics
|
| 711 |
col1, col2, col3 = st.columns(3)
|
| 712 |
with col1:
|
| 713 |
st.metric("Content Blocks", len(content_blocks))
|
| 714 |
with col2:
|
| 715 |
-
st.metric("Data Source", source
|
| 716 |
with col3:
|
| 717 |
-
st.metric("
|
| 718 |
|
| 719 |
# Page info
|
| 720 |
st.subheader("π·οΈ Page Information")
|
| 721 |
st.write(f"**Title:** {page_info['title']}")
|
| 722 |
-
st.write(f"**URL Type:** {data['url_type']}")
|
| 723 |
st.write(f"**Description:** {page_info.get('description', 'No description')}")
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
st.write(f"**Members:** {page_info['member_count']}")
|
| 727 |
-
elif page_info.get('follower_count'):
|
| 728 |
-
st.write(f"**Followers:** {page_info['follower_count']}")
|
| 729 |
-
|
| 730 |
-
st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")
|
| 731 |
|
| 732 |
# Content samples
|
| 733 |
st.subheader("π Content Analysis")
|
|
@@ -738,28 +679,30 @@ def main():
|
|
| 738 |
|
| 739 |
else:
|
| 740 |
st.info("""
|
| 741 |
-
## π Facebook Data Extractor
|
| 742 |
-
|
| 743 |
-
**
|
| 744 |
-
|
| 745 |
-
**
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
-
|
|
|
|
|
|
|
| 760 |
""")
|
| 761 |
|
| 762 |
-
# Chat section
|
| 763 |
st.markdown("---")
|
| 764 |
st.header("π¬ Analysis Chat")
|
| 765 |
|
|
@@ -777,10 +720,10 @@ def main():
|
|
| 777 |
if not st.session_state.chat_history:
|
| 778 |
st.subheader("π‘ Try asking:")
|
| 779 |
suggestions = [
|
| 780 |
-
"What is this Facebook
|
| 781 |
-
"Summarize the
|
| 782 |
-
"What kind of
|
| 783 |
-
"Analyze the
|
| 784 |
]
|
| 785 |
|
| 786 |
cols = st.columns(len(suggestions))
|
|
@@ -794,29 +737,18 @@ def main():
|
|
| 794 |
else:
|
| 795 |
st.info("π Extract Facebook data to enable analysis")
|
| 796 |
|
| 797 |
-
# CHAT INPUT
|
| 798 |
if st.session_state.chatbot and st.session_state.facebook_data:
|
| 799 |
user_input = st.chat_input("Ask about the Facebook data...")
|
| 800 |
|
| 801 |
-
# ADDED: Duplication protection
|
| 802 |
if user_input and user_input != st.session_state.last_user_input:
|
| 803 |
-
# Store current input to prevent duplication
|
| 804 |
st.session_state.last_user_input = user_input
|
| 805 |
-
|
| 806 |
-
# Add user message
|
| 807 |
st.session_state.chat_history.append({"role": "user", "content": user_input})
|
| 808 |
|
| 809 |
with st.spinner("π€ Analyzing..."):
|
| 810 |
try:
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
response = simple_chat_analysis(user_input, st.session_state.facebook_data)
|
| 814 |
-
st.session_state.chat_history.append({"role": "assistant", "content": response})
|
| 815 |
-
else:
|
| 816 |
-
# Use AI chatbot
|
| 817 |
-
response = st.session_state.chatbot.invoke({"question": user_input})
|
| 818 |
-
answer = response.get("answer", "I couldn't generate a response.")
|
| 819 |
-
st.session_state.chat_history.append({"role": "assistant", "content": answer})
|
| 820 |
st.rerun()
|
| 821 |
except Exception as e:
|
| 822 |
error_msg = f"Analysis Error: {str(e)}"
|
|
|
|
| 8 |
from typing import List, Dict
|
| 9 |
import os
|
| 10 |
import tempfile
|
| 11 |
+
import random
|
| 12 |
|
| 13 |
# Import your existing AI components
|
| 14 |
from langchain_text_splitters import CharacterTextSplitter
|
|
|
|
| 25 |
layout="wide"
|
| 26 |
)
|
| 27 |
|
| 28 |
+
class FacebookRealExtractor:
|
| 29 |
+
"""Aggressive Facebook data extractor that tries multiple approaches"""
|
| 30 |
|
| 31 |
def __init__(self):
|
| 32 |
+
self.session = requests.Session()
|
| 33 |
+
self.setup_session()
|
| 34 |
|
| 35 |
+
def setup_session(self):
|
| 36 |
+
"""Setup requests session with rotating headers"""
|
| 37 |
+
self.user_agents = [
|
| 38 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 39 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
| 40 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 41 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 42 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0'
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
def extract_data(self, url: str, data_type: str) -> Dict:
|
| 46 |
+
"""Extract real Facebook data with multiple attempts"""
|
| 47 |
+
st.info(f"π Attempting real extraction: {url}")
|
| 48 |
+
|
| 49 |
+
# Try multiple extraction methods
|
| 50 |
+
methods = [
|
| 51 |
+
self._try_direct_extraction,
|
| 52 |
+
self._try_mobile_extraction,
|
| 53 |
+
self._try_text_only_extraction
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
for method in methods:
|
| 57 |
+
result = method(url)
|
| 58 |
+
if result.get("status") == "success":
|
| 59 |
+
st.success("β
Real Facebook data extracted!")
|
| 60 |
+
result["source"] = "real"
|
| 61 |
+
result["data_type"] = data_type
|
| 62 |
+
return result
|
| 63 |
+
|
| 64 |
+
# If all methods fail, provide better error info
|
| 65 |
+
st.error("β All real extraction methods failed. Facebook has strong anti-bot protection.")
|
| 66 |
+
st.info("""
|
| 67 |
+
**Why this happens:**
|
| 68 |
+
- Facebook blocks automated requests
|
| 69 |
+
- Requires JavaScript execution
|
| 70 |
+
- Needs cookies and session management
|
| 71 |
+
- Heavy anti-bot detection
|
| 72 |
+
|
| 73 |
+
**For your university project, you can:**
|
| 74 |
+
1. Use the demo data to demonstrate functionality
|
| 75 |
+
2. Explain these technical limitations in your report
|
| 76 |
+
3. Show that LinkedIn works (no restrictions)
|
| 77 |
+
4. Discuss platform security differences
|
| 78 |
+
""")
|
| 79 |
+
|
| 80 |
+
# Only use demo data as last resort
|
| 81 |
+
return self._get_minimal_demo_data(url, data_type)
|
| 82 |
+
|
| 83 |
+
def _try_direct_extraction(self, url: str) -> Dict:
|
| 84 |
+
"""Try direct extraction with rotating headers"""
|
| 85 |
try:
|
| 86 |
+
headers = {
|
| 87 |
+
'User-Agent': random.choice(self.user_agents),
|
| 88 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/avif,*/*;q=0.8',
|
| 89 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 90 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 91 |
+
'DNT': '1',
|
| 92 |
+
'Connection': 'keep-alive',
|
| 93 |
+
'Upgrade-Insecure-Requests': '1',
|
| 94 |
+
'Sec-Fetch-Dest': 'document',
|
| 95 |
+
'Sec-Fetch-Mode': 'navigate',
|
| 96 |
+
'Sec-Fetch-Site': 'none',
|
| 97 |
+
'Cache-Control': 'max-age=0',
|
| 98 |
+
}
|
| 99 |
|
| 100 |
+
# Try with different timeouts and settings
|
| 101 |
+
response = self.session.get(
|
| 102 |
+
url,
|
| 103 |
+
headers=headers,
|
| 104 |
+
timeout=15,
|
| 105 |
+
allow_redirects=True
|
| 106 |
+
)
|
| 107 |
|
| 108 |
+
if response.status_code == 200:
|
| 109 |
+
return self._parse_facebook_response(response, url)
|
| 110 |
+
else:
|
| 111 |
+
return {"status": "error", "reason": f"HTTP {response.status_code}"}
|
| 112 |
+
|
| 113 |
except Exception as e:
|
| 114 |
+
return {"status": "error", "reason": str(e)}
|
|
|
|
| 115 |
|
| 116 |
+
def _try_mobile_extraction(self, url: str) -> Dict:
|
| 117 |
+
"""Try mobile version extraction"""
|
| 118 |
try:
|
| 119 |
+
mobile_headers = {
|
| 120 |
+
'User-Agent': 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
|
|
|
|
| 121 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 122 |
'Accept-Language': 'en-US,en;q=0.5',
|
| 123 |
'Accept-Encoding': 'gzip, deflate, br',
|
|
|
|
|
|
|
|
|
|
| 124 |
}
|
| 125 |
|
| 126 |
+
response = self.session.get(url, headers=mobile_headers, timeout=15)
|
|
|
|
| 127 |
|
| 128 |
if response.status_code == 200:
|
| 129 |
+
return self._parse_facebook_response(response, url)
|
| 130 |
+
else:
|
| 131 |
+
return {"status": "error", "reason": f"Mobile HTTP {response.status_code}"}
|
| 132 |
|
| 133 |
+
except Exception as e:
|
| 134 |
+
return {"status": "error", "reason": str(e)}
|
| 135 |
+
|
| 136 |
+
def _try_text_only_extraction(self, url: str) -> Dict:
|
| 137 |
+
"""Try text-only version or alternative approaches"""
|
| 138 |
+
try:
|
| 139 |
+
# Try textise.iitty
|
| 140 |
+
textise_url = f"https://r.jina.ai/{url}"
|
| 141 |
+
response = self.session.get(textise_url, timeout=20)
|
| 142 |
+
|
| 143 |
+
if response.status_code == 200:
|
| 144 |
+
return self._parse_textise_response(response, url)
|
| 145 |
+
else:
|
| 146 |
+
return {"status": "error", "reason": "Textise failed"}
|
| 147 |
|
| 148 |
+
except Exception as e:
|
| 149 |
+
return {"status": "error", "reason": str(e)}
|
| 150 |
+
|
| 151 |
+
def _parse_facebook_response(self, response, url: str) -> Dict:
|
| 152 |
+
"""Parse Facebook response for real data"""
|
| 153 |
+
try:
|
| 154 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 155 |
+
|
| 156 |
+
# Extract basic information
|
| 157 |
+
title = soup.find('title')
|
| 158 |
+
description = soup.find('meta', attrs={'name': 'description'})
|
| 159 |
+
og_title = soup.find('meta', property='og:title')
|
| 160 |
+
og_description = soup.find('meta', property='og:description')
|
| 161 |
+
|
| 162 |
+
# Try to find meaningful content
|
| 163 |
+
content_elements = soup.find_all(['p', 'div', 'span'], string=True)
|
| 164 |
+
meaningful_text = []
|
| 165 |
+
|
| 166 |
+
for element in content_elements:
|
| 167 |
+
text = element.get_text().strip()
|
| 168 |
+
if (len(text) > 20 and
|
| 169 |
+
not any(word in text.lower() for word in ['cookie', 'login', 'sign up', 'facebook']) and
|
| 170 |
+
len(text.split()) > 3):
|
| 171 |
+
meaningful_text.append(text)
|
| 172 |
+
|
| 173 |
+
# Create content blocks from real data
|
| 174 |
+
content_blocks = []
|
| 175 |
+
for i, text in enumerate(meaningful_text[:10]): # Limit to first 10 meaningful texts
|
| 176 |
+
content_blocks.append({
|
| 177 |
+
"id": i + 1,
|
| 178 |
+
"content": text,
|
| 179 |
+
"length": len(text),
|
| 180 |
+
"word_count": len(text.split()),
|
| 181 |
+
"content_type": self._classify_content(text),
|
| 182 |
+
"is_public_content": True
|
| 183 |
+
})
|
| 184 |
+
|
| 185 |
+
if content_blocks:
|
| 186 |
return {
|
| 187 |
"page_info": {
|
| 188 |
+
"title": og_title['content'] if og_title else (title.text if title else "Facebook Content"),
|
| 189 |
+
"description": og_description['content'] if og_description else (description['content'] if description else ""),
|
| 190 |
"url": url,
|
| 191 |
+
"response_code": response.status_code,
|
| 192 |
+
"content_length": len(response.text),
|
| 193 |
+
"access_note": "Real data extracted successfully"
|
| 194 |
},
|
| 195 |
+
"content_blocks": content_blocks,
|
| 196 |
"extraction_time": datetime.now().isoformat(),
|
| 197 |
+
"status": "success"
|
|
|
|
|
|
|
| 198 |
}
|
| 199 |
else:
|
| 200 |
+
return {"status": "error", "reason": "No meaningful content found"}
|
| 201 |
|
| 202 |
+
except Exception as e:
|
| 203 |
+
return {"status": "error", "reason": f"Parsing error: {str(e)}"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
+
def _parse_textise_response(self, response, url: str) -> Dict:
|
| 206 |
+
"""Parse textise response"""
|
| 207 |
+
try:
|
| 208 |
+
# Textise provides cleaner text content
|
| 209 |
+
lines = response.text.split('\n')
|
| 210 |
+
meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 30]
|
| 211 |
+
|
| 212 |
+
content_blocks = []
|
| 213 |
+
for i, line in enumerate(meaningful_lines[:8]):
|
| 214 |
+
content_blocks.append({
|
| 215 |
+
"id": i + 1,
|
| 216 |
+
"content": line,
|
| 217 |
+
"length": len(line),
|
| 218 |
+
"word_count": len(line.split()),
|
| 219 |
+
"content_type": self._classify_content(line),
|
| 220 |
+
"is_public_content": True
|
| 221 |
+
})
|
| 222 |
+
|
| 223 |
+
if content_blocks:
|
| 224 |
+
return {
|
| 225 |
+
"page_info": {
|
| 226 |
+
"title": "Facebook Content (via Textise)",
|
| 227 |
+
"description": "Content extracted using text-only method",
|
| 228 |
+
"url": url,
|
| 229 |
+
"response_code": response.status_code,
|
| 230 |
+
"content_length": len(response.text),
|
| 231 |
+
"access_note": "Real data via text-only extraction"
|
| 232 |
+
},
|
| 233 |
+
"content_blocks": content_blocks,
|
| 234 |
+
"extraction_time": datetime.now().isoformat(),
|
| 235 |
+
"status": "success"
|
| 236 |
+
}
|
| 237 |
+
else:
|
| 238 |
+
return {"status": "error", "reason": "No content from textise"}
|
| 239 |
+
|
| 240 |
+
except Exception as e:
|
| 241 |
+
return {"status": "error", "reason": str(e)}
|
| 242 |
|
| 243 |
+
def _classify_content(self, text: str) -> str:
|
| 244 |
+
"""Classify content type"""
|
| 245 |
+
text_lower = text.lower()
|
| 246 |
+
|
| 247 |
+
if any(word in text_lower for word in ['welcome', 'join', 'community']):
|
| 248 |
+
return "welcome_message"
|
| 249 |
+
elif any(word in text_lower for word in ['event', 'meetup', 'schedule']):
|
| 250 |
+
return "event_info"
|
| 251 |
+
elif any(word in text_lower for word in ['post', 'share', 'comment']):
|
| 252 |
+
return "social_content"
|
| 253 |
+
elif any(word in text_lower for word in ['question', 'help', 'advice']):
|
| 254 |
+
return "question_post"
|
| 255 |
else:
|
| 256 |
+
return "general_content"
|
| 257 |
|
| 258 |
+
def _get_minimal_demo_data(self, url: str, data_type: str) -> Dict:
    """Build a hard-coded demo payload as an absolute last resort.

    Used only when every real extraction strategy has failed (Facebook
    blocked the request). The returned dict mirrors the shape of a
    successful extraction so downstream rendering/analysis code works
    unchanged; the "source" key is "demo_fallback" so the UI can flag it.

    Args:
        url: The Facebook URL the user asked to extract; echoed back
            in page_info so the UI can display what was requested.
        data_type: The data type selected in the UI; echoed back.

    Returns:
        Dict with the same keys as a real extraction result.
    """
    st.warning("π Using minimal demo data for demonstration purposes")

    # (text, content_type) pairs for the two demo blocks.
    demo_texts = [
        ("This is a demonstration of what real Facebook data would look like. Actual extraction is blocked by Facebook's anti-bot protection.",
         "demo_notice"),
        ("For your university project, you can discuss these technical limitations and how social media platforms implement security measures.",
         "educational_note"),
    ]

    # Compute length/word_count from the actual text instead of
    # hard-coding approximate values that drift out of sync with the
    # content strings (the previous hard-coded lengths were wrong).
    content_blocks = [
        {
            "id": block_id,
            "content": text,
            "length": len(text),
            "word_count": len(text.split()),
            "content_type": content_type,
            "is_public_content": True,
        }
        for block_id, (text, content_type) in enumerate(demo_texts, start=1)
    ]

    return {
        "page_info": {
            "title": "Facebook Content (Demo - Real extraction blocked)",
            "description": "This would show real Facebook data if not blocked by platform restrictions",
            "url": url,
            # 403 signals that the platform refused the real request.
            "response_code": 403,
            "content_length": 0,
            "access_note": "DEMO: Facebook blocked real data extraction"
        },
        "content_blocks": content_blocks,
        "extraction_time": datetime.now().isoformat(),
        "data_type": data_type,
        "status": "success",
        "source": "demo_fallback"
    }
|
| 295 |
|
| 296 |
+
# Rest of the functions remain the same (get_embeddings, get_llm, simple_chat_analysis, etc.)
|
| 297 |
def get_embeddings():
|
| 298 |
"""Initialize embeddings with better error handling and cache management"""
|
| 299 |
try:
|
|
|
|
| 398 |
page_info = extracted_data.get('page_info', {})
|
| 399 |
content_blocks = extracted_data.get('content_blocks', [])
|
| 400 |
url_type = extracted_data.get('url_type', 'Facebook Content')
|
| 401 |
+
source = extracted_data.get('source', 'unknown')
|
| 402 |
|
| 403 |
user_input_lower = user_input.lower()
|
| 404 |
|
|
|
|
| 411 |
f"**Data Source:** {source.upper()}",
|
| 412 |
f"**Description:** {page_info.get('description', 'No description available')}",
|
| 413 |
"",
|
| 414 |
+
f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks.",
|
| 415 |
"",
|
| 416 |
"**Key Content Types:**",
|
| 417 |
f"{', '.join(set(block['content_type'] for block in content_blocks))}",
|
|
|
|
| 483 |
|
| 484 |
page_info = extracted_data['page_info']
|
| 485 |
content_blocks = extracted_data['content_blocks']
|
| 486 |
+
url_type = extracted_data.get('url_type', 'Facebook Content')
|
| 487 |
source = extracted_data.get('source', 'unknown')
|
| 488 |
|
| 489 |
all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
|
|
|
|
| 523 |
chunks = splitter.split_text(all_text)
|
| 524 |
documents = [Document(page_content=chunk) for chunk in chunks]
|
| 525 |
|
| 526 |
+
return "simple", documents
|
| 527 |
|
| 528 |
def create_chatbot(vectorstore):
|
| 529 |
"""Create conversational chatbot"""
|
|
|
|
| 551 |
return "simple" # Fallback to simple mode
|
| 552 |
|
| 553 |
def main():
|
| 554 |
+
st.title("π Facebook Data Extractor - REAL DATA ATTEMPT")
|
| 555 |
+
st.markdown("**Aggressive real data extraction - No automatic demo fallback**")
|
| 556 |
|
| 557 |
if st.button("β Back to Main Dashboard"):
|
| 558 |
st.switch_page("app.py")
|
| 559 |
|
| 560 |
+
# Initialize session state
|
| 561 |
if "extractor" not in st.session_state:
|
| 562 |
+
st.session_state.extractor = FacebookRealExtractor() # Changed to real extractor
|
| 563 |
if "facebook_data" not in st.session_state:
|
| 564 |
st.session_state.facebook_data = None
|
| 565 |
if "vectorstore" not in st.session_state:
|
|
|
|
| 569 |
if "chat_history" not in st.session_state:
|
| 570 |
st.session_state.chat_history = []
|
| 571 |
if "processing_mode" not in st.session_state:
|
| 572 |
+
st.session_state.processing_mode = "ai"
|
| 573 |
if "last_user_input" not in st.session_state:
|
| 574 |
+
st.session_state.last_user_input = ""
|
| 575 |
|
| 576 |
# Sidebar
|
| 577 |
with st.sidebar:
|
|
|
|
| 586 |
facebook_url = st.text_input(
|
| 587 |
"Facebook URL",
|
| 588 |
placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
|
| 589 |
+
help="Enter any Facebook URL for REAL data extraction"
|
| 590 |
)
|
| 591 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
# Quick test URLs
|
| 593 |
st.markdown("### π Test URLs")
|
| 594 |
test_urls = {
|
|
|
|
| 602 |
st.session_state.current_fb_url = url
|
| 603 |
st.rerun()
|
| 604 |
|
| 605 |
+
if st.button("π EXTRACT REAL DATA", type="primary"):
|
| 606 |
url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
|
| 607 |
|
| 608 |
if not url_to_use:
|
|
|
|
| 610 |
elif 'facebook.com' not in url_to_use:
|
| 611 |
st.error("β Please enter a valid Facebook URL")
|
| 612 |
else:
|
| 613 |
+
with st.spinner("π Aggressively extracting REAL Facebook data..."):
|
| 614 |
extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
|
| 615 |
|
| 616 |
if extracted_data.get("status") == "success":
|
| 617 |
st.session_state.facebook_data = extracted_data
|
| 618 |
+
st.session_state.chatbot = "simple"
|
| 619 |
+
st.session_state.chat_history = []
|
| 620 |
+
st.session_state.last_user_input = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
|
| 622 |
source = extracted_data.get('source', 'unknown')
|
| 623 |
+
if source == 'real':
|
| 624 |
+
st.success("π SUCCESS: Real Facebook data extracted!")
|
| 625 |
+
st.balloons()
|
| 626 |
else:
|
| 627 |
+
st.warning("β οΈ Using fallback data - Facebook blocked real extraction")
|
| 628 |
+
|
| 629 |
else:
|
| 630 |
error_msg = extracted_data.get("error", "Unknown error")
|
| 631 |
st.error(f"β Extraction failed: {error_msg}")
|
|
|
|
| 640 |
st.session_state.last_user_input = ""
|
| 641 |
st.rerun()
|
| 642 |
|
| 643 |
+
# Main content
|
| 644 |
st.header("π Extraction Results")
|
| 645 |
|
| 646 |
if st.session_state.facebook_data:
|
|
|
|
| 649 |
content_blocks = data['content_blocks']
|
| 650 |
source = data.get('source', 'unknown')
|
| 651 |
|
| 652 |
+
if source == 'real':
|
| 653 |
+
st.success("β
**REAL DATA** - Successfully extracted from Facebook!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
else:
|
| 655 |
+
st.warning("π **FALLBACK DATA** - Facebook blocked real extraction")
|
| 656 |
|
| 657 |
# Metrics
|
| 658 |
col1, col2, col3 = st.columns(3)
|
| 659 |
with col1:
|
| 660 |
st.metric("Content Blocks", len(content_blocks))
|
| 661 |
with col2:
|
| 662 |
+
st.metric("Data Source", "REAL" if source == 'real' else "FALLBACK")
|
| 663 |
with col3:
|
| 664 |
+
st.metric("Status", "Success")
|
| 665 |
|
| 666 |
# Page info
|
| 667 |
st.subheader("π·οΈ Page Information")
|
| 668 |
st.write(f"**Title:** {page_info['title']}")
|
|
|
|
| 669 |
st.write(f"**Description:** {page_info.get('description', 'No description')}")
|
| 670 |
+
st.write(f"**Access Note:** {page_info.get('access_note', 'Public content')}")
|
| 671 |
+
st.write(f"**Response Code:** {page_info.get('response_code', 'N/A')}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
|
| 673 |
# Content samples
|
| 674 |
st.subheader("π Content Analysis")
|
|
|
|
| 679 |
|
| 680 |
else:
|
| 681 |
st.info("""
|
| 682 |
+
## π Facebook Real Data Extractor
|
| 683 |
+
|
| 684 |
+
**Aggressive Approach - No Automatic Demo**
|
| 685 |
+
|
| 686 |
+
**This version:**
|
| 687 |
+
- Tries multiple extraction methods
|
| 688 |
+
- Uses rotating user agents
|
| 689 |
+
- Attempts mobile versions
|
| 690 |
+
- Tries text-only alternatives
|
| 691 |
+
- Only uses demo data as LAST RESORT
|
| 692 |
+
|
| 693 |
+
**Technical Challenges:**
|
| 694 |
+
- Facebook has strong anti-bot protection
|
| 695 |
+
- Requires JavaScript execution
|
| 696 |
+
- Needs session management
|
| 697 |
+
- Heavy rate limiting
|
| 698 |
+
|
| 699 |
+
**For your project:**
|
| 700 |
+
- Shows real technical limitations
|
| 701 |
+
- Demonstrates platform security
|
| 702 |
+
- Provides educational value
|
| 703 |
""")
|
| 704 |
|
| 705 |
+
# Chat section
|
| 706 |
st.markdown("---")
|
| 707 |
st.header("π¬ Analysis Chat")
|
| 708 |
|
|
|
|
| 720 |
if not st.session_state.chat_history:
|
| 721 |
st.subheader("π‘ Try asking:")
|
| 722 |
suggestions = [
|
| 723 |
+
"What is this Facebook content about?",
|
| 724 |
+
"Summarize the extracted data",
|
| 725 |
+
"What kind of information was found?",
|
| 726 |
+
"Analyze the content structure"
|
| 727 |
]
|
| 728 |
|
| 729 |
cols = st.columns(len(suggestions))
|
|
|
|
| 737 |
else:
|
| 738 |
st.info("π Extract Facebook data to enable analysis")
|
| 739 |
|
| 740 |
+
# CHAT INPUT
|
| 741 |
if st.session_state.chatbot and st.session_state.facebook_data:
|
| 742 |
user_input = st.chat_input("Ask about the Facebook data...")
|
| 743 |
|
|
|
|
| 744 |
if user_input and user_input != st.session_state.last_user_input:
|
|
|
|
| 745 |
st.session_state.last_user_input = user_input
|
|
|
|
|
|
|
| 746 |
st.session_state.chat_history.append({"role": "user", "content": user_input})
|
| 747 |
|
| 748 |
with st.spinner("π€ Analyzing..."):
|
| 749 |
try:
|
| 750 |
+
response = simple_chat_analysis(user_input, st.session_state.facebook_data)
|
| 751 |
+
st.session_state.chat_history.append({"role": "assistant", "content": response})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 752 |
st.rerun()
|
| 753 |
except Exception as e:
|
| 754 |
error_msg = f"Analysis Error: {str(e)}"
|