| """ | |
| Web Content Extractor - Hugging Face Version | |
| -------------------------------------------- | |
| ✅ Flask + BeautifulSoup + NLTK | |
| ✅ Extracts headings, paragraphs, links, images | |
| ✅ Performs NLP analysis (word counts, frequency, stopwords) | |
| ✅ Auto language detection | |
| """ | |
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from langdetect import detect, DetectorFactory

# Flask setup
app = Flask(__name__)
CORS(app)

# Fix random seed for langdetect
DetectorFactory.seed = 0

# Download required NLTK resources (with full compatibility)
for pkg in ["punkt", "punkt_tab", "stopwords"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"⚠️ Could not download {pkg}: {e}")


# ---------------------------------------------------------------
# 1️⃣ Extract Web Content
# ---------------------------------------------------------------
def extract_content(url):
    try:
        print("\n🌐 Fetching website content...")
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")

        # Extract headings (h1-h6), paragraphs, images and links
        headings = []
        for i in range(1, 7):
            tag = f'h{i}'
            headings += [h.get_text(strip=True) for h in soup.find_all(tag)]
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]
        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]
        text = soup.get_text(separator=' ', strip=True)

        # Try to detect the language from a short sample of the text
        try:
            lang = detect(text[:500]) if text else "unknown"
        except Exception:
            lang = "unknown"

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
            "language": lang
        }
    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ General error while fetching webpage: {e}")
    return None


# ---------------------------------------------------------------
# 2️⃣ NLP Text Analysis
# ---------------------------------------------------------------
def analyze_text(text, lang="english"):
    if not text:
        return None

    print("\n🧠 Analyzing text using NLTK...")
    cleaned = re.sub(r'[^A-Za-z ]', ' ', text)

    try:
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    except LookupError:
        # Tokenizer data missing at runtime; fetch it and retry once
        nltk.download("punkt_tab", quiet=True)
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)

    try:
        sw = stopwords.words(lang)
    except Exception:
        # Fall back to English stopwords for unsupported languages
        sw = stopwords.words("english")

    filtered = [w.lower() for w in words if w.lower() not in sw and len(w) > 2]
    freq = FreqDist(filtered)
    top_words = freq.most_common(10)

    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        "unique_words": len(set(filtered)),
        "top_words": top_words,
        "stopword_count": len(words) - len(filtered),
        "filtered_words": filtered[:50]
    }
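
# A rough illustration (not from the original source) of the dictionary shape
# analyze_text() returns; the counts assume NLTK's English stopword list:
#
#   analyze_text("The cat sat on the mat. The cat slept.")
#   # -> {"word_count": 9, "sentence_count": 2, "unique_words": 4,
#   #     "top_words": [("cat", 2), ("sat", 1), ("mat", 1), ("slept", 1)],
#   #     "stopword_count": 4,
#   #     "filtered_words": ["cat", "sat", "mat", "cat", "slept"]}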


# ---------------------------------------------------------------
# 3️⃣ Flask Routes
# ---------------------------------------------------------------
@app.route('/')
def index():
    return render_template('index.html')


# The decorator was missing in the original source; the "/extract" path is an
# assumption that simply mirrors the function name.
@app.route('/extract', methods=['POST'])
def extract_route():
    try:
        data = request.get_json()
        url = data.get('url')
        tag = data.get('tag', 'all')

        if not url:
            return jsonify({"error": "No URL provided"}), 400
        if not url.startswith("http"):
            url = "https://" + url

        content = extract_content(url)
        if not content:
            return jsonify({"error": "Failed to fetch content"}), 400

        analysis = analyze_text(content.get("text", ""))
        content["analysis"] = analysis

        # Return only the requested element type when a specific tag is asked for
        if tag != "all":
            tag_map = {
                "h1": "headings",
                "p": "paragraphs",
                "img": "images",
                "a": "links"
            }
            result = content.get(tag_map.get(tag, ""), [])
            return jsonify({
                "tag": tag,
                "results": result,
                "language": content.get("language"),
                "analysis": analysis
            })

        return jsonify(content)
    except Exception as e:
        print("❌ Backend Error:", e)
        return jsonify({"error": str(e)}), 500


# ---------------------------------------------------------------
# 4️⃣ Run Flask App (Hugging Face compatible)
# ---------------------------------------------------------------
if __name__ == "__main__":
    print("=" * 70)
    print("🚀 Hugging Face Web Content Extractor running...")
    print("=" * 70)
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
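
# ---------------------------------------------------------------
# Example request (a sketch, assuming the POST endpoint is mounted at
# /extract as declared above and the default port 7860):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/extract",
#       json={"url": "example.com", "tag": "p"},
#   )
#   print(resp.json()["results"][:3])
#
# With tag="all" the full content dictionary is returned instead
# (headings, paragraphs, images, links, raw text, language, analysis).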