Refat81's picture
Update pages/linkedin_extractor.py
a72a934 verified
raw
history blame
19 kB
# pages/linkedin_extractor.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document
from langchain_community.llms import HuggingFaceHub
import re
import time
import os
# Page-wide Streamlit configuration; must run before any other st.* call.
st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="πŸ’Ό",
    layout="wide"
)
def get_embeddings():
    """Initialize HuggingFace embeddings, falling back through several models.

    Tries each candidate sentence-transformers model in order and returns
    the first one that loads; shows an error and returns None if all fail.

    Returns:
        HuggingFaceEmbeddings | None: a ready embedding client, or None.
    """
    # Candidates in preference order; all run on CPU with normalized vectors
    # so FAISS cosine/IP search behaves consistently across models.
    model_options = [
        "sentence-transformers/all-MiniLM-L6-v2",            # default, small & fast
        "sentence-transformers/paraphrase-albert-small-v2",  # smaller alternative
        "sentence-transformers/all-mpnet-base-v2",           # higher quality
    ]
    for model_name in model_options:
        try:
            embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu'},
                encode_kwargs={'normalize_embeddings': True}
            )
        except Exception:
            # This model failed to load (download/compat issue); try the next one.
            continue
        st.success(f"βœ… Loaded embeddings: {model_name.split('/')[-1]}")
        return embeddings
    st.error("❌ All embedding models failed to load")
    return None
def get_llm():
    """Initialize the Mistral-7B-Instruct LLM via the HuggingFace Hub.

    Reads the API token from the HUGGINGFACEHUB_API_TOKEN environment
    variable. Returns the configured LLM, or None when the token is missing
    or initialization fails (an error is shown in the UI in both cases).
    """
    try:
        token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if not token:
            st.error("""
❌ HuggingFace API Key not found!
Please add your API key:
1. Go to Space Settings β†’ Variables and Secrets
2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
3. Restart the Space
Get free API key: https://huggingface.co/settings/tokens
""")
            return None
        # Sampling settings tuned for analysis-style answers.
        generation_settings = {
            "temperature": 0.7,
            "max_length": 2048,
            "max_new_tokens": 512,
            "top_p": 0.95,
            "repetition_penalty": 1.1,
            "do_sample": True
        }
        # Mistral 7B: good balance of answer quality and free-tier accessibility.
        return HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
            huggingfacehub_api_token=token,
            model_kwargs=generation_settings
        )
    except Exception as e:
        st.error(f"❌ AI Model error: {e}")
        return None
def extract_linkedin_data(url, data_type):
    """Download a public LinkedIn page and extract its readable text.

    Args:
        url: The LinkedIn URL to fetch.
        data_type: Caller-supplied label ("profile"/"company"/"post"),
            stored verbatim in the result.

    Returns:
        dict: on success, ``status == "success"`` plus page metadata and a
        list of content blocks; on any failure, ``status == "error"`` with
        an ``error`` message.
    """
    try:
        # Browser-like headers reduce the chance of the request being blocked.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=browser_headers, timeout=25)
        if response.status_code != 200:
            return {
                "error": f"Failed to access page (Status: {response.status_code})",
                "status": "error"
            }

        dom = BeautifulSoup(response.text, 'html.parser')
        # Strip non-content elements before pulling out the visible text.
        for node in dom(["script", "style", "meta", "link", "nav", "header", "footer"]):
            node.decompose()

        # Flatten the page text into single-space-separated tokens.
        raw_text = dom.get_text()
        tokens = []
        for raw_line in raw_text.splitlines():
            for piece in raw_line.strip().split(" "):
                piece = piece.strip()
                if piece:
                    tokens.append(piece)
        flattened = ' '.join(tokens)

        # Keep only sentence-like segments long enough to be meaningful.
        blocks = []
        for segment in flattened.split('.'):
            segment = segment.strip()
            if len(segment) > 40:
                blocks.append(segment)

        if not blocks:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        title_tag = dom.find('title')
        if title_tag:
            heading = title_tag.text.strip()
        else:
            heading = "LinkedIn Page"

        return {
            "page_info": {
                "title": heading,
                "url": url,
                "response_code": response.status_code,
                "content_length": len(flattened)
            },
            "content_blocks": blocks,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }
    except requests.exceptions.Timeout:
        return {"error": "Request timed out. Please try again.", "status": "error"}
    except requests.exceptions.ConnectionError:
        return {"error": "Connection failed. Please check the URL and try again.", "status": "error"}
    except Exception as e:
        return {"error": f"Extraction error: {str(e)}", "status": "error"}
def process_extracted_data(extracted_data):
    """Turn an extraction result into a FAISS vector store for retrieval.

    Args:
        extracted_data: A dict produced by ``extract_linkedin_data``.

    Returns:
        tuple: ``(vectorstore, chunks)`` on success, ``(None, [])`` when the
        input is missing/failed, embeddings can't load, or indexing fails.
    """
    if not extracted_data or extracted_data.get("status") != "success":
        return None, []

    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']

    # Assemble one structured report string for the AI to index.
    parts = [
        "LINKEDIN DATA ANALYSIS REPORT\n",
        "=" * 70 + "\n\n",
        "πŸ“„ PAGE INFORMATION:\n",
        f"Title: {info['title']}\n",
        f"URL: {info['url']}\n",
        f"Type: {extracted_data['data_type'].upper()}\n",
        f"Extracted: {extracted_data['extraction_time']}\n",
        f"Response Code: {info['response_code']}\n",
        f"Content Length: {info['content_length']} characters\n\n",
        "πŸ“Š CONTENT ANALYSIS:\n",
        f"Total Content Blocks: {len(blocks)}\n\n",
    ]
    # Cap at 20 blocks to keep the indexed report a manageable size.
    for idx, block in enumerate(blocks[:20]):
        parts.append(f"--- CONTENT BLOCK {idx+1} ---\n")
        parts.append(f"Words: {len(block.split())} | Characters: {len(block)}\n")
        parts.append(f"Content: {block}\n\n")
    parts.append("=" * 70 + "\n")
    parts.append("END OF EXTRACTION REPORT")
    report = "".join(parts)

    # Split into overlapping chunks sized for retrieval.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = splitter.split_text(report)
    documents = [Document(page_content=chunk) for chunk in chunks]

    try:
        embeddings = get_embeddings()
        if embeddings is None:
            return None, []
        vectorstore = FAISS.from_documents(documents, embeddings)
        return vectorstore, chunks
    except Exception as e:
        st.error(f"Vector store creation failed: {e}")
        return None, []
def create_chatbot(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Args:
        vectorstore: A FAISS store produced by ``process_extracted_data``.

    Returns:
        ConversationalRetrievalChain | None: the chain, or None when the LLM
        is unavailable or construction fails (error shown in the UI).
    """
    try:
        llm = get_llm()
        if llm is None:
            return None
        # Buffer memory keyed to "answer" so sources can also be returned.
        conversation_memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer"
        )
        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
            memory=conversation_memory,
            return_source_documents=True,
            output_key="answer"
        )
    except Exception as e:
        st.error(f"Failed to create chatbot: {str(e)}")
        return None
def clear_chat_history():
    """Reset the conversation while keeping the extracted data and index."""
    has_store = "vectorstore" in st.session_state and st.session_state.vectorstore
    if has_store:
        # Rebuild the chain so its internal memory starts empty too.
        st.session_state.chatbot = create_chatbot(st.session_state.vectorstore)
    st.session_state.chat_history = []
    st.success("πŸ”„ Chat history cleared! Starting fresh conversation.")
def display_metrics(extracted_data):
    """Render a four-column summary row of extraction statistics.

    Args:
        extracted_data: A successful result dict from
            ``extract_linkedin_data``; a falsy value renders nothing.
    """
    if not extracted_data:
        return
    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']
    word_total = sum(len(block.split()) for block in blocks)
    cols = st.columns(4)
    with cols[0]:
        st.metric("Content Blocks", len(blocks))
    with cols[1]:
        st.metric("Total Words", word_total)
    with cols[2]:
        st.metric("Characters", f"{info['content_length']:,}")
    with cols[3]:
        st.metric("Response Code", info['response_code'])
def main():
    """Render the LinkedIn AI Analyzer page.

    Layout: sidebar controls (content type, URL, quick-test links, extract
    button, chat reset), a left column with extraction results, a right
    column with the AI chat, and a footer feature summary. All cross-rerun
    state lives in st.session_state.
    """
    st.title("πŸ’Ό LinkedIn AI Analyzer")
    if st.button("← Back to Main Dashboard"):
        st.switch_page("app.py")
    # Initialize session state
    # (defaults set only on first run; later reruns keep existing values)
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = None
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None
    if "chatbot" not in st.session_state:
        st.session_state.chatbot = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processing" not in st.session_state:
        st.session_state.processing = False
    if "current_url" not in st.session_state:
        st.session_state.current_url = ""
    # Sidebar
    with st.sidebar:
        st.markdown("### βš™οΈ Configuration")
        # Data type selection
        data_type = st.selectbox(
            "πŸ“Š Content Type",
            ["profile", "company", "post"],
            help="Select the type of LinkedIn content"
        )
        # URL input (placeholder follows the selected content type)
        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }
        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )
        # Suggested URLs: clicking stores the URL and reruns the script
        st.markdown("### πŸš€ Quick Test")
        suggested_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
            "Amazon": "https://www.linkedin.com/company/amazon/"
        }
        for name, url in suggested_urls.items():
            if st.button(f"🏒 {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()
        # Extract button: typed URL wins over a previously clicked suggestion
        if st.button("πŸš€ Extract & Analyze", type="primary", use_container_width=True):
            url_to_use = linkedin_url.strip() or st.session_state.current_url
            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                with st.spinner("πŸ”„ Extracting and analyzing data..."):
                    extracted_data = extract_linkedin_data(url_to_use, data_type)
                    if extracted_data.get("status") == "success":
                        st.session_state.extracted_data = extracted_data
                        st.session_state.current_url = url_to_use
                        # Process for AI: build the vector index and chatbot
                        vectorstore, chunks = process_extracted_data(extracted_data)
                        if vectorstore:
                            st.session_state.vectorstore = vectorstore
                            st.session_state.chatbot = create_chatbot(vectorstore)
                            st.session_state.chat_history = []
                            st.success(f"βœ… Successfully processed {len(chunks)} content chunks!")
                            st.balloons()
                        else:
                            st.error("❌ Failed to process data for AI analysis")
                    else:
                        error_msg = extracted_data.get("error", "Unknown error occurred")
                        st.error(f"❌ Extraction failed: {error_msg}")
                st.session_state.processing = False
        # Chat management (only shown once a chatbot exists)
        if st.session_state.chatbot and st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("πŸ’¬ Chat Management")
            if st.button("πŸ—‘οΈ Clear Chat History", type="secondary", use_container_width=True):
                clear_chat_history()
    # Main content area: results on the left, chat on the right
    col1, col2 = st.columns([1, 1])
    with col1:
        st.markdown("### πŸ“Š Extraction Results")
        if st.session_state.processing:
            st.info("πŸ”„ Processing LinkedIn data...")
        elif st.session_state.extracted_data:
            data = st.session_state.extracted_data
            page_info = data['page_info']
            content_blocks = data['content_blocks']
            st.success("βœ… Extraction Complete")
            # Display metrics
            display_metrics(data)
            # Display page info
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Data Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extraction Time:** {data['extraction_time']}")
            # Display sample content (first three blocks only)
            st.markdown("#### πŸ“ Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Content Block {i+1} ({len(block.split())} words)"):
                    st.write(block)
            if len(content_blocks) > 3:
                st.info(f"πŸ“„ And {len(content_blocks) - 3} more content blocks...")
        else:
            # No extraction yet: show the onboarding message
            st.info("""
πŸ‘‹ **Welcome to LinkedIn AI Analyzer!**
**Powered by Mistral 7B AI**
**To get started:**
1. Select content type
2. Enter a LinkedIn URL or click a suggested company
3. Click "Extract & Analyze"
4. Chat with AI about the extracted content
**Supported URLs:**
- πŸ‘€ Public Profiles
- 🏒 Company Pages
- πŸ“ Public Posts
**AI Features:**
- Smart content analysis
- Conversational chat
- Data insights
- Content summarization
""")
    with col2:
        st.markdown("### πŸ’¬ AI Chat Analysis")
        if st.session_state.chatbot and st.session_state.extracted_data:
            # Display chat history
            for i, chat in enumerate(st.session_state.chat_history):
                if chat["role"] == "user":
                    st.markdown(f"**πŸ‘€ You:** {chat['content']}")
                elif chat["role"] == "assistant":
                    st.markdown(f"**πŸ€– AI:** {chat['content']}")
            # Chat input
            user_input = st.chat_input("Ask about the LinkedIn data...")
            if user_input:
                # Add user message before generating so it survives the rerun
                st.session_state.chat_history.append({"role": "user", "content": user_input})
                # Generate AI response
                with st.spinner("πŸ€” Mistral AI is analyzing..."):
                    try:
                        response = st.session_state.chatbot.invoke({"question": user_input})
                        answer = response.get("answer", "I couldn't generate a response based on the available data.")
                        st.session_state.chat_history.append({"role": "assistant", "content": answer})
                        st.rerun()
                    except Exception as e:
                        # Surface the failure in the chat itself rather than crashing
                        error_msg = f"❌ Error generating response: {str(e)}"
                        st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
                        st.rerun()
            # Suggested questions (only before the first message)
            if not st.session_state.chat_history:
                st.markdown("#### πŸ’‘ Suggested Questions")
                suggestions = [
                    "Summarize the main information from this page",
                    "What are the key highlights or achievements?",
                    "Analyze the business or professional focus",
                    "What insights can you extract from this content?",
                    "Provide a comprehensive overview"
                ]
                for suggestion in suggestions:
                    # NOTE(review): clicking only shows a hint; it does not
                    # submit the question to the chatbot.
                    if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
                        st.info(f"πŸ’‘ Try asking: '{suggestion}'")
        elif st.session_state.extracted_data:
            st.info("πŸ’¬ Start a conversation with the AI assistant")
        else:
            st.info("πŸ” Extract LinkedIn data to enable AI analysis")
    # Features section (static footer)
    st.markdown("---")
    st.markdown("### πŸš€ Powered by Mistral 7B AI")
    feature_cols = st.columns(3)
    with feature_cols[0]:
        st.markdown("""
**πŸ€– Advanced AI**
- Mistral 7B Instruct model
- Intelligent text analysis
- Contextual understanding
""")
    with feature_cols[1]:
        st.markdown("""
**πŸ’¬ Smart Chat**
- Conversational memory
- Relevant responses
- Data-driven insights
""")
    with feature_cols[2]:
        st.markdown("""
**πŸ” Deep Analysis**
- Content summarization
- Pattern recognition
- Professional insights
""")
# Script entry point (Streamlit executes the file top-to-bottom on each rerun).
if __name__ == "__main__":
    main()