Refat81's picture
Update pages/linkedin_extractor.py
a72a934 verified
raw
history blame
19 kB
# pages/linkedin_extractor.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document
from langchain_community.llms import HuggingFaceHub
import re
import time
import os
# Page-wide Streamlit configuration; must run before any other st.* call.
st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="πŸ’Ό",
    layout="wide"
)
def get_embeddings():
    """Initialize HuggingFace embeddings, falling back through several models.

    Tries each candidate sentence-transformers model in order and returns
    the first one that loads; shows an error and returns None if all fail.

    Returns:
        HuggingFaceEmbeddings | None: a ready embedding client, or None.
    """
    # Candidates in preference order; all run on CPU with normalized vectors
    # so FAISS cosine/IP search behaves consistently across models.
    model_options = [
        "sentence-transformers/all-MiniLM-L6-v2",            # default, small & fast
        "sentence-transformers/paraphrase-albert-small-v2",  # smaller alternative
        "sentence-transformers/all-mpnet-base-v2",           # higher quality
    ]
    for model_name in model_options:
        try:
            embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu'},
                encode_kwargs={'normalize_embeddings': True}
            )
        except Exception:
            # This model failed to load (download/compat issue); try the next one.
            continue
        st.success(f"βœ… Loaded embeddings: {model_name.split('/')[-1]}")
        return embeddings
    st.error("❌ All embedding models failed to load")
    return None
def get_llm():
    """Initialize the Mistral-7B-Instruct LLM via the HuggingFace Hub.

    Reads the API token from the HUGGINGFACEHUB_API_TOKEN environment
    variable. Returns the configured LLM, or None when the token is missing
    or initialization fails (an error is shown in the UI in both cases).
    """
    try:
        token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if not token:
            st.error("""
❌ HuggingFace API Key not found!
Please add your API key:
1. Go to Space Settings β†’ Variables and Secrets
2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
3. Restart the Space
Get free API key: https://huggingface.co/settings/tokens
""")
            return None
        # Sampling settings tuned for analysis-style answers.
        generation_settings = {
            "temperature": 0.7,
            "max_length": 2048,
            "max_new_tokens": 512,
            "top_p": 0.95,
            "repetition_penalty": 1.1,
            "do_sample": True
        }
        # Mistral 7B: good balance of answer quality and free-tier accessibility.
        return HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
            huggingfacehub_api_token=token,
            model_kwargs=generation_settings
        )
    except Exception as e:
        st.error(f"❌ AI Model error: {e}")
        return None
def extract_linkedin_data(url, data_type):
    """Download a public LinkedIn page and extract its readable text.

    Args:
        url: The LinkedIn URL to fetch.
        data_type: Caller-supplied label ("profile"/"company"/"post"),
            stored verbatim in the result.

    Returns:
        dict: on success, ``status == "success"`` plus page metadata and a
        list of content blocks; on any failure, ``status == "error"`` with
        an ``error`` message.
    """
    try:
        # Browser-like headers reduce the chance of the request being blocked.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=browser_headers, timeout=25)
        if response.status_code != 200:
            return {
                "error": f"Failed to access page (Status: {response.status_code})",
                "status": "error"
            }

        dom = BeautifulSoup(response.text, 'html.parser')
        # Strip non-content elements before pulling out the visible text.
        for node in dom(["script", "style", "meta", "link", "nav", "header", "footer"]):
            node.decompose()

        # Flatten the page text into single-space-separated tokens.
        raw_text = dom.get_text()
        tokens = []
        for raw_line in raw_text.splitlines():
            for piece in raw_line.strip().split(" "):
                piece = piece.strip()
                if piece:
                    tokens.append(piece)
        flattened = ' '.join(tokens)

        # Keep only sentence-like segments long enough to be meaningful.
        blocks = []
        for segment in flattened.split('.'):
            segment = segment.strip()
            if len(segment) > 40:
                blocks.append(segment)

        if not blocks:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        title_tag = dom.find('title')
        if title_tag:
            heading = title_tag.text.strip()
        else:
            heading = "LinkedIn Page"

        return {
            "page_info": {
                "title": heading,
                "url": url,
                "response_code": response.status_code,
                "content_length": len(flattened)
            },
            "content_blocks": blocks,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }
    except requests.exceptions.Timeout:
        return {"error": "Request timed out. Please try again.", "status": "error"}
    except requests.exceptions.ConnectionError:
        return {"error": "Connection failed. Please check the URL and try again.", "status": "error"}
    except Exception as e:
        return {"error": f"Extraction error: {str(e)}", "status": "error"}
def process_extracted_data(extracted_data):
    """Turn an extraction result into a FAISS vector store for retrieval.

    Args:
        extracted_data: A dict produced by ``extract_linkedin_data``.

    Returns:
        tuple: ``(vectorstore, chunks)`` on success, ``(None, [])`` when the
        input is missing/failed, embeddings can't load, or indexing fails.
    """
    if not extracted_data or extracted_data.get("status") != "success":
        return None, []

    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']

    # Assemble one structured report string for the AI to index.
    parts = [
        "LINKEDIN DATA ANALYSIS REPORT\n",
        "=" * 70 + "\n\n",
        "πŸ“„ PAGE INFORMATION:\n",
        f"Title: {info['title']}\n",
        f"URL: {info['url']}\n",
        f"Type: {extracted_data['data_type'].upper()}\n",
        f"Extracted: {extracted_data['extraction_time']}\n",
        f"Response Code: {info['response_code']}\n",
        f"Content Length: {info['content_length']} characters\n\n",
        "πŸ“Š CONTENT ANALYSIS:\n",
        f"Total Content Blocks: {len(blocks)}\n\n",
    ]
    # Cap at 20 blocks to keep the indexed report a manageable size.
    for idx, block in enumerate(blocks[:20]):
        parts.append(f"--- CONTENT BLOCK {idx+1} ---\n")
        parts.append(f"Words: {len(block.split())} | Characters: {len(block)}\n")
        parts.append(f"Content: {block}\n\n")
    parts.append("=" * 70 + "\n")
    parts.append("END OF EXTRACTION REPORT")
    report = "".join(parts)

    # Split into overlapping chunks sized for retrieval.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = splitter.split_text(report)
    documents = [Document(page_content=chunk) for chunk in chunks]

    try:
        embeddings = get_embeddings()
        if embeddings is None:
            return None, []
        vectorstore = FAISS.from_documents(documents, embeddings)
        return vectorstore, chunks
    except Exception as e:
        st.error(f"Vector store creation failed: {e}")
        return None, []
def create_chatbot(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Args:
        vectorstore: A FAISS store produced by ``process_extracted_data``.

    Returns:
        ConversationalRetrievalChain | None: the chain, or None when the LLM
        is unavailable or construction fails (error shown in the UI).
    """
    try:
        llm = get_llm()
        if llm is None:
            return None
        # Buffer memory keyed to "answer" so sources can also be returned.
        conversation_memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer"
        )
        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
            memory=conversation_memory,
            return_source_documents=True,
            output_key="answer"
        )
    except Exception as e:
        st.error(f"Failed to create chatbot: {str(e)}")
        return None
def clear_chat_history():
    """Reset the conversation while keeping the extracted data and index."""
    has_store = "vectorstore" in st.session_state and st.session_state.vectorstore
    if has_store:
        # Rebuild the chain so its internal memory starts empty too.
        st.session_state.chatbot = create_chatbot(st.session_state.vectorstore)
    st.session_state.chat_history = []
    st.success("πŸ”„ Chat history cleared! Starting fresh conversation.")
def display_metrics(extracted_data):
    """Render a four-column summary row of extraction statistics.

    Args:
        extracted_data: A successful result dict from
            ``extract_linkedin_data``; a falsy value renders nothing.
    """
    if not extracted_data:
        return
    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']
    word_total = sum(len(block.split()) for block in blocks)
    cols = st.columns(4)
    with cols[0]:
        st.metric("Content Blocks", len(blocks))
    with cols[1]:
        st.metric("Total Words", word_total)
    with cols[2]:
        st.metric("Characters", f"{info['content_length']:,}")
    with cols[3]:
        st.metric("Response Code", info['response_code'])
def main():
    """Render the LinkedIn AI Analyzer page.

    Layout: sidebar controls (content type, URL, quick-test links, extract
    button, chat reset), a left column with extraction results, a right
    column with the AI chat, and a footer feature summary. All cross-rerun
    state lives in st.session_state.
    """
    st.title("πŸ’Ό LinkedIn AI Analyzer")
    if st.button("← Back to Main Dashboard"):
        st.switch_page("app.py")
    # Initialize session state
    # (defaults set only on first run; later reruns keep existing values)
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = None
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None
    if "chatbot" not in st.session_state:
        st.session_state.chatbot = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processing" not in st.session_state:
        st.session_state.processing = False
    if "current_url" not in st.session_state:
        st.session_state.current_url = ""
    # Sidebar
    with st.sidebar:
        st.markdown("### βš™οΈ Configuration")
        # Data type selection
        data_type = st.selectbox(
            "πŸ“Š Content Type",
            ["profile", "company", "post"],
            help="Select the type of LinkedIn content"
        )
        # URL input (placeholder follows the selected content type)
        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }
        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )
        # Suggested URLs: clicking stores the URL and reruns the script
        st.markdown("### πŸš€ Quick Test")
        suggested_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
            "Amazon": "https://www.linkedin.com/company/amazon/"
        }
        for name, url in suggested_urls.items():
            if st.button(f"🏒 {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()
        # Extract button: typed URL wins over a previously clicked suggestion
        if st.button("πŸš€ Extract & Analyze", type="primary", use_container_width=True):
            url_to_use = linkedin_url.strip() or st.session_state.current_url
            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                with st.spinner("πŸ”„ Extracting and analyzing data..."):
                    extracted_data = extract_linkedin_data(url_to_use, data_type)
                    if extracted_data.get("status") == "success":
                        st.session_state.extracted_data = extracted_data
                        st.session_state.current_url = url_to_use
                        # Process for AI: build the vector index and chatbot
                        vectorstore, chunks = process_extracted_data(extracted_data)
                        if vectorstore:
                            st.session_state.vectorstore = vectorstore
                            st.session_state.chatbot = create_chatbot(vectorstore)
                            st.session_state.chat_history = []
                            st.success(f"βœ… Successfully processed {len(chunks)} content chunks!")
                            st.balloons()
                        else:
                            st.error("❌ Failed to process data for AI analysis")
                    else:
                        error_msg = extracted_data.get("error", "Unknown error occurred")
                        st.error(f"❌ Extraction failed: {error_msg}")
                st.session_state.processing = False
        # Chat management (only shown once a chatbot exists)
        if st.session_state.chatbot and st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("πŸ’¬ Chat Management")
            if st.button("πŸ—‘οΈ Clear Chat History", type="secondary", use_container_width=True):
                clear_chat_history()
    # Main content area: results on the left, chat on the right
    col1, col2 = st.columns([1, 1])
    with col1:
        st.markdown("### πŸ“Š Extraction Results")
        if st.session_state.processing:
            st.info("πŸ”„ Processing LinkedIn data...")
        elif st.session_state.extracted_data:
            data = st.session_state.extracted_data
            page_info = data['page_info']
            content_blocks = data['content_blocks']
            st.success("βœ… Extraction Complete")
            # Display metrics
            display_metrics(data)
            # Display page info
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Data Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extraction Time:** {data['extraction_time']}")
            # Display sample content (first three blocks only)
            st.markdown("#### πŸ“ Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Content Block {i+1} ({len(block.split())} words)"):
                    st.write(block)
            if len(content_blocks) > 3:
                st.info(f"πŸ“„ And {len(content_blocks) - 3} more content blocks...")
        else:
            # No extraction yet: show the onboarding message
            st.info("""
πŸ‘‹ **Welcome to LinkedIn AI Analyzer!**
**Powered by Mistral 7B AI**
**To get started:**
1. Select content type
2. Enter a LinkedIn URL or click a suggested company
3. Click "Extract & Analyze"
4. Chat with AI about the extracted content
**Supported URLs:**
- πŸ‘€ Public Profiles
- 🏒 Company Pages
- πŸ“ Public Posts
**AI Features:**
- Smart content analysis
- Conversational chat
- Data insights
- Content summarization
""")
    with col2:
        st.markdown("### πŸ’¬ AI Chat Analysis")
        if st.session_state.chatbot and st.session_state.extracted_data:
            # Display chat history
            for i, chat in enumerate(st.session_state.chat_history):
                if chat["role"] == "user":
                    st.markdown(f"**πŸ‘€ You:** {chat['content']}")
                elif chat["role"] == "assistant":
                    st.markdown(f"**πŸ€– AI:** {chat['content']}")
            # Chat input
            user_input = st.chat_input("Ask about the LinkedIn data...")
            if user_input:
                # Add user message before generating so it survives the rerun
                st.session_state.chat_history.append({"role": "user", "content": user_input})
                # Generate AI response
                with st.spinner("πŸ€” Mistral AI is analyzing..."):
                    try:
                        response = st.session_state.chatbot.invoke({"question": user_input})
                        answer = response.get("answer", "I couldn't generate a response based on the available data.")
                        st.session_state.chat_history.append({"role": "assistant", "content": answer})
                        st.rerun()
                    except Exception as e:
                        # Surface the failure in the chat itself rather than crashing
                        error_msg = f"❌ Error generating response: {str(e)}"
                        st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
                        st.rerun()
            # Suggested questions (only before the first message)
            if not st.session_state.chat_history:
                st.markdown("#### πŸ’‘ Suggested Questions")
                suggestions = [
                    "Summarize the main information from this page",
                    "What are the key highlights or achievements?",
                    "Analyze the business or professional focus",
                    "What insights can you extract from this content?",
                    "Provide a comprehensive overview"
                ]
                for suggestion in suggestions:
                    # NOTE(review): clicking only shows a hint; it does not
                    # submit the question to the chatbot.
                    if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
                        st.info(f"πŸ’‘ Try asking: '{suggestion}'")
        elif st.session_state.extracted_data:
            st.info("πŸ’¬ Start a conversation with the AI assistant")
        else:
            st.info("πŸ” Extract LinkedIn data to enable AI analysis")
    # Features section (static footer)
    st.markdown("---")
    st.markdown("### πŸš€ Powered by Mistral 7B AI")
    feature_cols = st.columns(3)
    with feature_cols[0]:
        st.markdown("""
**πŸ€– Advanced AI**
- Mistral 7B Instruct model
- Intelligent text analysis
- Contextual understanding
""")
    with feature_cols[1]:
        st.markdown("""
**πŸ’¬ Smart Chat**
- Conversational memory
- Relevant responses
- Data-driven insights
""")
    with feature_cols[2]:
        st.markdown("""
**πŸ” Deep Analysis**
- Content summarization
- Pattern recognition
- Professional insights
""")
# Script entry point (Streamlit executes the file top-to-bottom on each rerun).
if __name__ == "__main__":
    main()