"""
Web Content Extractor - Hugging Face Version
--------------------------------------------
✅ Flask + BeautifulSoup + NLTK
✅ Extracts headings, paragraphs, links, images
✅ Performs NLP analysis (word counts, frequency, stopwords)
✅ Auto language detection
"""
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from langdetect import detect, DetectorFactory
# Flask setup
app = Flask(__name__)
CORS(app)
# Fix random seed for langdetect
DetectorFactory.seed = 0
# Download required NLTK resources (with full compatibility)
for pkg in ["punkt", "punkt_tab", "stopwords"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"⚠️ Could not download {pkg}: {e}")
# ---------------------------------------------------------------
# 1️⃣ Extract Web Content
# ---------------------------------------------------------------
def extract_content(url):
    try:
        print("\n🌐 Fetching website content...")
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")

        # Extract various elements
        headings = []
        for i in range(1, 7):
            tag = f'h{i}'
            headings += [h.get_text(strip=True) for h in soup.find_all(tag)]
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]
        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]
        text = soup.get_text(separator=' ', strip=True)
        # Try to detect language from a short sample of the page text
        try:
            lang = detect(text[:500]) if text else "unknown"
        except Exception:
            lang = "unknown"
        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
            "language": lang
        }
    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ General error while fetching webpage: {e}")
    return None
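
# A minimal usage sketch (hypothetical URL) for calling extract_content
# directly, e.g. from a Python shell; the keys match the dict built above:
#
#   data = extract_content("https://example.com")
#   if data:
#       print(data["language"], len(data["headings"]), len(data["links"]))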
# ---------------------------------------------------------------
# 2️⃣ NLP Text Analysis
# ---------------------------------------------------------------
def analyze_text(text, lang="english"):
    if not text:
        return None
    print("\n🧠 Analyzing text using NLTK...")
    # Keep only Latin letters and spaces for word-level analysis
    cleaned = re.sub(r'[^A-Za-z ]', ' ', text)
    try:
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    except LookupError:
        # Tokenizer data missing at runtime; fetch it once and retry
        nltk.download("punkt_tab", quiet=True)
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    try:
        sw = stopwords.words(lang)
    except Exception:
        # Unsupported language name; fall back to English stopwords
        sw = stopwords.words("english")
    filtered = [w.lower() for w in words if w.lower() not in sw and len(w) > 2]
    freq = FreqDist(filtered)
    top_words = freq.most_common(10)
    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        "unique_words": len(set(filtered)),
        "top_words": top_words,
        # Counts every token dropped by filtering (stopwords and short words)
        "stopword_count": len(words) - len(filtered),
        "filtered_words": filtered[:50]
    }
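
# A quick sketch of analyze_text on an in-memory string (no web request
# needed); the sample sentence is illustrative, not part of the app:
#
#   stats = analyze_text("The quick brown fox jumps over the lazy dog. It ran far.")
#   print(stats["word_count"], stats["sentence_count"], stats["top_words"])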
# ---------------------------------------------------------------
# 3️⃣ Flask Routes
# ---------------------------------------------------------------
@app.route('/')
def index():
    return render_template('index.html')

@app.route('/extract', methods=['POST'])
def extract_route():
    try:
        # Tolerate a missing or non-JSON body instead of raising
        data = request.get_json(silent=True) or {}
        url = data.get('url')
        tag = data.get('tag', 'all')
        if not url:
            return jsonify({"error": "No URL provided"}), 400
        if not url.startswith("http"):
            url = "https://" + url
        content = extract_content(url)
        if not content:
            return jsonify({"error": "Failed to fetch content"}), 400
        analysis = analyze_text(content.get("text", ""))
        content["analysis"] = analysis
        if tag != "all":
            tag_map = {
                "h1": "headings",
                "p": "paragraphs",
                "img": "images",
                "a": "links"
            }
            result = content.get(tag_map.get(tag, ""), [])
            return jsonify({
                "tag": tag,
                "results": result,
                "language": content.get("language"),
                "analysis": analysis
            })
        return jsonify(content)
    except Exception as e:
        print("❌ Backend Error:", e)
        return jsonify({"error": str(e)}), 500
# ---------------------------------------------------------------
# 4️⃣ Run Flask App (Hugging Face compatible)
# ---------------------------------------------------------------
if __name__ == "__main__":
print("=" * 70)
print("🚀 Hugging Face Web Content Extractor running...")
print("=" * 70)
app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
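
# To run locally: `python app.py`, then open http://localhost:7860 in a
# browser. (Port 7860 is the conventional default on Hugging Face Spaces.)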