"""
Web Content Extractor - Hugging Face Version
--------------------------------------------
✅ Flask + BeautifulSoup + NLTK
✅ Extracts headings, paragraphs, links, images
✅ Performs NLP analysis (word counts, frequency, stopwords)
✅ Auto language detection
"""
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from langdetect import detect, DetectorFactory
# Flask setup
app = Flask(__name__)
CORS(app)
# Fix random seed for langdetect
DetectorFactory.seed = 0
# Download required NLTK resources (with full compatibility)
for pkg in ["punkt", "punkt_tab", "stopwords"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"⚠️ Could not download {pkg}: {e}")
# ---------------------------------------------------------------
# 1️⃣ Extract Web Content
# ---------------------------------------------------------------
def extract_content(url):
    try:
        print("\n🌐 Fetching website content...")
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")

        # Extract various elements
        headings = []
        for i in range(1, 7):
            tag = f'h{i}'
            headings += [h.get_text(strip=True) for h in soup.find_all(tag)]
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]
        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]
        text = soup.get_text(separator=' ', strip=True)
        # Try to detect language from a short sample of the page text
        try:
            lang = detect(text[:500]) if text else "unknown"
        except Exception:
            lang = "unknown"
        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
            "language": lang
        }
    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ General error while fetching webpage: {e}")
    return None
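
# A minimal usage sketch (hypothetical URL) for calling extract_content
# directly, e.g. from a Python shell; the keys match the dict built above:
#
#   data = extract_content("https://example.com")
#   if data:
#       print(data["language"], len(data["headings"]), len(data["links"]))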
# ---------------------------------------------------------------
# 2️⃣ NLP Text Analysis
# ---------------------------------------------------------------
def analyze_text(text, lang="english"):
    if not text:
        return None
    print("\n🧠 Analyzing text using NLTK...")
    # Keep only Latin letters and spaces for word-level analysis
    cleaned = re.sub(r'[^A-Za-z ]', ' ', text)
    try:
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    except LookupError:
        # Tokenizer data missing at runtime; fetch it once and retry
        nltk.download("punkt_tab", quiet=True)
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    try:
        sw = stopwords.words(lang)
    except Exception:
        # Unsupported language name; fall back to English stopwords
        sw = stopwords.words("english")
    filtered = [w.lower() for w in words if w.lower() not in sw and len(w) > 2]
    freq = FreqDist(filtered)
    top_words = freq.most_common(10)
    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        "unique_words": len(set(filtered)),
        "top_words": top_words,
        # Counts every token dropped by filtering (stopwords and short words)
        "stopword_count": len(words) - len(filtered),
        "filtered_words": filtered[:50]
    }
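
# A quick sketch of analyze_text on an in-memory string (no web request
# needed); the sample sentence is illustrative, not part of the app:
#
#   stats = analyze_text("The quick brown fox jumps over the lazy dog. It ran far.")
#   print(stats["word_count"], stats["sentence_count"], stats["top_words"])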
# ---------------------------------------------------------------
# 3️⃣ Flask Routes
# ---------------------------------------------------------------
@app.route('/')
def index():
    return render_template('index.html')

@app.route('/extract', methods=['POST'])
def extract_route():
    try:
        # Tolerate a missing or non-JSON body instead of raising
        data = request.get_json(silent=True) or {}
        url = data.get('url')
        tag = data.get('tag', 'all')
        if not url:
            return jsonify({"error": "No URL provided"}), 400
        if not url.startswith("http"):
            url = "https://" + url
        content = extract_content(url)
        if not content:
            return jsonify({"error": "Failed to fetch content"}), 400
        analysis = analyze_text(content.get("text", ""))
        content["analysis"] = analysis
        if tag != "all":
            tag_map = {
                "h1": "headings",
                "p": "paragraphs",
                "img": "images",
                "a": "links"
            }
            result = content.get(tag_map.get(tag, ""), [])
            return jsonify({
                "tag": tag,
                "results": result,
                "language": content.get("language"),
                "analysis": analysis
            })
        return jsonify(content)
    except Exception as e:
        print("❌ Backend Error:", e)
        return jsonify({"error": str(e)}), 500
# ---------------------------------------------------------------
# 4️⃣ Run Flask App (Hugging Face compatible)
# ---------------------------------------------------------------
if __name__ == "__main__":
print("=" * 70)
print("🚀 Hugging Face Web Content Extractor running...")
print("=" * 70)
app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
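
# To run locally: `python app.py`, then open http://localhost:7860 in a
# browser. (Port 7860 is the conventional default on Hugging Face Spaces.)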