Spaces:

MHamdan
/

ContentAnalyzer

Running

App Files Files Community

ContentAnalyzer / app.py

MHamdan

Update app.py

18d6761 verified 2 months ago

raw

history blame

9.14 kB

	# app.py

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from transformers import pipeline
	import PyPDF2
	import docx
	import os
	import time
	from typing import List, Tuple, Optional

	class ContentAnalyzer:
	    """Load text from a string, a URL, or an uploaded file and analyze it.

	    Three ``transformers`` pipelines (summarization, sentiment analysis and
	    zero-shot classification) are created once in ``__init__`` and reused by
	    every call to :meth:`analyze_content`.
	    """

	    # File extensions the loader knows how to parse.
	    _SUPPORTED_EXTENSIONS = (".txt", ".pdf", ".docx")

	    # Candidate labels handed to the zero-shot classifier for topic detection.
	    _TOPIC_LABELS = [
	        "technology", "science", "business", "politics",
	        "entertainment", "education", "health", "sports",
	    ]

	    def __init__(self):
	        """Create the pipelines up front so each analysis call can reuse them."""
	        print("[DEBUG] Initializing pipelines...")
	        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
	        self.sentiment_analyzer = pipeline("sentiment-analysis")
	        self.zero_shot = pipeline("zero-shot-classification")
	        print("[DEBUG] Pipelines initialized.")

	    def _load_file(self, file_obj) -> str:
	        """Return the text of an upload, raising ``ValueError`` on failure.

	        Args:
	            file_obj: ``None``, a filesystem path (``str`` / ``os.PathLike``),
	                or an object exposing ``name`` and ``read()``.

	        Returns:
	            The extracted text, or ``""`` when ``file_obj`` is ``None``.

	        Raises:
	            ValueError: The extension is unsupported or parsing failed. The
	                exception message is the user-facing error text.
	        """
	        if file_obj is None:
	            print("[DEBUG] No file uploaded.")
	            return ""

	        # Accept plain paths as well as file-like objects; a path is handed
	        # to the parsers as a string, a file-like object is passed through.
	        is_path = isinstance(file_obj, (str, os.PathLike))
	        source = os.fspath(file_obj) if is_path else file_obj
	        name = source if is_path else getattr(file_obj, "name", "")

	        file_ext = os.path.splitext(str(name))[1].lower()
	        print(f"[DEBUG] Uploaded file extension detected: {file_ext}")

	        if file_ext not in self._SUPPORTED_EXTENSIONS:
	            msg = f"Unsupported file type: {file_ext}"
	            print("[DEBUG]", msg)
	            raise ValueError(msg)

	        try:
	            if file_ext == ".txt":
	                if is_path:
	                    with open(source, "rb") as handle:
	                        raw = handle.read()
	                else:
	                    raw = file_obj.read()
	                # read() may already return str for text-mode handles;
	                # "utf-8-sig" also drops a leading byte-order mark if present.
	                if isinstance(raw, (bytes, bytearray)):
	                    content = bytes(raw).decode("utf-8-sig")
	                else:
	                    content = raw
	                print("[DEBUG] Successfully read .txt file.")
	                return content

	            if file_ext == ".pdf":
	                pdf_reader = PyPDF2.PdfReader(source)
	                # Guard against pages that yield no text (None) so the
	                # concatenation cannot fail on image-only pages.
	                text = "".join(
	                    (page.extract_text() or "") + "\n"
	                    for page in pdf_reader.pages
	                )
	                print("[DEBUG] Successfully read .pdf file.")
	                return text

	            # Only ".docx" remains after the extension check above.
	            doc = docx.Document(source)
	            paragraphs = [paragraph.text for paragraph in doc.paragraphs]
	            print("[DEBUG] Successfully read .docx file.")
	            return "\n".join(paragraphs)

	        except Exception as e:
	            error_msg = f"Error reading file: {str(e)}"
	            print("[DEBUG]", error_msg)
	            raise ValueError(error_msg) from e

	    def read_file(self, file_obj) -> str:
	        """Read content from different file types.

	        Never raises for load failures: on an unsupported extension or a
	        parsing error the human-readable message is returned instead of the
	        file content. Returns ``""`` when ``file_obj`` is ``None``.
	        """
	        try:
	            return self._load_file(file_obj)
	        except ValueError as e:
	            return str(e)

	    def _load_url(self, url: str) -> str:
	        """Download ``url`` and return its visible text, one non-empty line each.

	        Raises:
	            ValueError: The request or the HTML processing failed. The
	                exception message is the user-facing error text.
	        """
	        print(f"[DEBUG] Attempting to fetch URL: {url}")
	        try:
	            # NOTE(review): the URL is used as given; no scheme or host
	            # restrictions are applied before the request is made.
	            response = requests.get(url, timeout=10)
	            response.raise_for_status()
	            soup = BeautifulSoup(response.text, 'html.parser')

	            # Remove scripts and styles so only visible text remains.
	            for tag in soup(["script", "style"]):
	                tag.decompose()

	            text = soup.get_text(separator='\n')
	            lines = (line.strip() for line in text.splitlines())
	            final_text = "\n".join(line for line in lines if line)
	        except Exception as e:
	            error_msg = f"Error fetching URL: {str(e)}"
	            print("[DEBUG]", error_msg)
	            raise ValueError(error_msg) from e

	        print("[DEBUG] Successfully fetched and cleaned web content.")
	        return final_text

	    def fetch_web_content(self, url: str) -> str:
	        """Fetch content from URL.

	        Never raises for fetch failures: the error message is returned in
	        place of the page text.
	        """
	        try:
	            return self._load_url(url)
	        except ValueError as e:
	            return str(e)

	    def analyze_content(
	        self,
	        text: Optional[str] = None,
	        url: Optional[str] = None,
	        file: Optional[object] = None,
	        analysis_types: Optional[List[str]] = None,
	        progress_callback=None
	    ) -> dict:
	        """Analyze content from text, URL, or file.

	        The source is chosen in priority order: ``url``, then ``file``, then
	        ``text``.

	        Args:
	            text: Raw text to analyze.
	            url: Web page to download and analyze.
	            file: Upload accepted by :meth:`read_file`.
	            analysis_types: Any of ``"summarize"``, ``"sentiment"``,
	                ``"topics"``. ``None`` means ``["summarize"]``.
	            progress_callback: Optional ``callable(step, description)``
	                invoked before each stage (steps 1 to 4).

	        Returns:
	            On success a dict with ``"original_text"`` (at most 1000
	            characters plus ``"..."``) and one key per requested analysis:
	            ``"summary"``, ``"sentiment"``, ``"topics"``. On failure a dict
	            with the single key ``"error"``.
	        """
	        # A None sentinel replaces the former mutable list default.
	        if analysis_types is None:
	            analysis_types = ["summarize"]

	        try:
	            # Step 1: Retrieve content
	            if progress_callback:
	                progress_callback(1, "Reading input...")

	            if isinstance(url, str):
	                url = url.strip()

	            # Load failures arrive as exceptions, so legitimate content that
	            # merely starts with the word "Error" is no longer rejected.
	            try:
	                if url:
	                    content = self._load_url(url)
	                elif file:
	                    content = self._load_file(file)
	                else:
	                    content = text or ""
	            except ValueError as load_error:
	                return {"error": str(load_error)}

	            # Whitespace-only input has nothing to analyze either.
	            if not content or not content.strip():
	                return {"error": "No content provided"}

	            # Truncate for debug
	            truncated = content[:1000] + "..." if len(content) > 1000 else content
	            results = {"original_text": truncated}

	            # Step 2: Summarize
	            if "summarize" in analysis_types:
	                if progress_callback:
	                    progress_callback(2, "Summarizing content...")
	                summary = self.summarizer(content[:1024], max_length=130, min_length=30)
	                results["summary"] = summary[0]['summary_text']

	            # Step 3: Sentiment
	            if "sentiment" in analysis_types:
	                if progress_callback:
	                    progress_callback(3, "Performing sentiment analysis...")
	                sentiment = self.sentiment_analyzer(content[:512])
	                results["sentiment"] = {
	                    "label": sentiment[0]['label'],
	                    "score": round(sentiment[0]['score'], 3)
	                }

	            # Step 4: Topics
	            if "topics" in analysis_types:
	                if progress_callback:
	                    progress_callback(4, "Identifying topics...")
	                topics = self.zero_shot(
	                    content[:512],
	                    candidate_labels=list(self._TOPIC_LABELS)
	                )
	                # Keep only labels the classifier scored above 0.1.
	                results["topics"] = [
	                    {"label": label, "score": round(score, 3)}
	                    for label, score in zip(topics['labels'], topics['scores'])
	                    if score > 0.1
	                ]

	            return results

	        except Exception as e:
	            # Top-level boundary: report any pipeline failure to the caller
	            # as an error result instead of raising into the UI.
	            error_msg = f"Analysis error: {str(e)}"
	            print("[DEBUG]", error_msg)
	            return {"error": error_msg}


	def create_interface():
	    """Build the Gradio Blocks UI and wire it to a ContentAnalyzer.

	    The layout has three input tabs (text, URL, file upload), a checkbox
	    group selecting which analyses to run, an "Analyze" button, and four
	    Markdown output tabs.

	    Returns:
	        The assembled ``gr.Blocks`` app; the caller launches it.
	    """
	    analyzer = ContentAnalyzer()

	    with gr.Blocks(title="Content Analyzer") as demo:
	        gr.Markdown("# 📑 Content Analyzer")
	        gr.Markdown("Analyze text content from various sources using AI.")

	        with gr.Tabs():
	            # Text Input Tab
	            with gr.Tab("Text Input"):
	                text_input = gr.Textbox(
	                    label="Enter Text",
	                    placeholder="Paste your text here...",
	                    lines=5
	                )

	            # URL Input Tab
	            with gr.Tab("Web URL"):
	                url_input = gr.Textbox(
	                    label="Enter URL",
	                    placeholder="https://example.com"
	                )

	            # File Upload Tab
	            with gr.Tab("File Upload"):
	                file_input = gr.File(
	                    label="Upload File",
	                    file_types=[".txt", ".pdf", ".docx"]
	                )

	        # Analysis Options
	        analysis_types = gr.CheckboxGroup(
	            choices=["summarize", "sentiment", "topics"],
	            value=["summarize"],
	            label="Analysis Types"
	        )

	        analyze_btn = gr.Button("Analyze", variant="primary")

	        # Output Sections
	        with gr.Tabs():
	            with gr.Tab("Original Text"):
	                original_text = gr.Markdown()
	            with gr.Tab("Summary"):
	                summary_output = gr.Markdown()
	            with gr.Tab("Sentiment"):
	                sentiment_output = gr.Markdown()
	            with gr.Tab("Topics"):
	                topics_output = gr.Markdown()

	        def process_analysis(text, url, file, types, progress=gr.Progress()):
	            """Run the analyzer on the UI inputs and format the four outputs.

	            Returns a 4-tuple of Markdown strings in the order
	            (original text, summary, sentiment, topics). When the analyzer
	            reports an error, the message goes in the first slot and the
	            other three are empty.
	            """
	            steps_total = 4  # We have up to 4 possible steps

	            def progress_callback(step, desc):
	                # The tuple form is (steps completed, total steps); the
	                # human-readable label belongs in `desc`, not in the tuple.
	                progress((step, steps_total), desc=desc)

	            results = analyzer.analyze_content(
	                text=text,
	                url=url,
	                file=file,
	                analysis_types=types,
	                progress_callback=progress_callback
	            )

	            # If there's an error, show it in "Original Text" tab for clarity
	            if "error" in results:
	                return results["error"], "", "", ""

	            # Format outputs
	            original = results.get("original_text", "")
	            summary = results.get("summary", "")

	            sentiment = ""
	            if "sentiment" in results:
	                sent = results["sentiment"]
	                sentiment = f"Sentiment: {sent['label']} (Confidence: {sent['score']})"

	            topics = ""
	            if "topics" in results:
	                topics_list = "\n".join(
	                    f"- {t['label']}: {t['score']}"
	                    for t in results["topics"]
	                )
	                topics = "Detected Topics:\n" + topics_list

	            return original, summary, sentiment, topics

	        analyze_btn.click(
	            fn=process_analysis,
	            inputs=[text_input, url_input, file_input, analysis_types],
	            outputs=[original_text, summary_output, sentiment_output, topics_output],
	            show_progress=True  # Enable the progress bar in Gradio
	        )

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the UI once, then start serving it.
	    # `demo` stays bound as a module-level name, as in the original.
	    demo = create_interface()
	    demo.launch()