ragV98 committed on
Commit
c72e054
·
1 Parent(s): 76187cf

undid the changes

Browse files
Files changed (1) hide show
  1. components/fetchers/scraper.py +17 -61
components/fetchers/scraper.py CHANGED
@@ -12,68 +12,36 @@ HEADERS = {
12
  )
13
  }
14
 
15
-
16
  def clean_text(text: str) -> str:
17
- """Remove HTML tags and collapse whitespace."""
18
  soup = BeautifulSoup(text, "html.parser")
19
  cleaned = soup.get_text(separator=" ", strip=True)
20
- return " ".join(cleaned.split())
21
-
22
 
23
  def is_low_quality(text: str) -> bool:
24
- """Heuristic to detect low-value content like navbars, footers, etc."""
25
- if not text or len(text.split()) < 50:
26
  return True
27
-
28
  junk_markers = [
29
- "subscribe", "click here", "latest headlines", "more from",
30
- "privacy policy", "video", "terms of service", "back to top",
31
- "all rights reserved", "advertisement", "read more", "sign in"
32
  ]
33
-
34
  return any(marker in text.lower() for marker in junk_markers)
35
 
36
-
37
- def fallback_html_extract(html: str) -> Optional[str]:
38
- """Very basic content extractor as a last resort."""
39
- try:
40
- soup = BeautifulSoup(html, "html.parser")
41
- paragraphs = soup.find_all("p")
42
- text = " ".join(p.get_text(strip=True) for p in paragraphs)
43
- cleaned = clean_text(text)
44
- return cleaned if len(cleaned.split()) >= 50 else None
45
- except Exception as e:
46
- print(f"⚠️ Fallback extract failed: {e}")
47
- return None
48
-
49
-
50
  def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
51
- """Extract meaningful text from a given URL using multiple methods."""
52
  try:
53
  response = requests.get(url, timeout=timeout, headers=HEADERS)
54
- if response.status_code != 200:
55
- print(f"⚠️ Bad status ({response.status_code}) for {url}")
56
- return None
57
-
58
- html = response.text
59
-
60
- # Attempt trafilatura
61
- extracted = trafilatura.extract(
62
- html,
63
- include_comments=False,
64
- include_tables=False,
65
- no_fallback=False
66
- )
67
-
68
- if extracted:
69
- text = clean_text(extracted)
70
- if not is_low_quality(text):
71
- return text
72
- else:
73
- print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
74
- else:
75
- print(f"⚠️ Trafilatura extraction failed or empty: {url}")
76
-
77
  except Exception as e:
78
  print(f"⚠️ Trafilatura failed for {url}: {e}")
79
 
@@ -88,19 +56,7 @@ def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
88
  return text
89
  else:
90
  print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
91
- else:
92
- print(f"⚠️ Newspaper3k extracted no text: {url}")
93
  except Exception as e:
94
  print(f"⚠️ Newspaper3k failed for {url}: {e}")
95
 
96
- # Final fallback to basic HTML parsing
97
- try:
98
- if html:
99
- fallback = fallback_html_extract(html)
100
- if fallback:
101
- print(f"✅ Used fallback extractor for: {url}")
102
- return fallback
103
- except Exception as e:
104
- print(f"⚠️ Final fallback failed for {url}: {e}")
105
-
106
  return None
 
12
  )
13
  }
14
 
 
15
  def clean_text(text: str) -> str:
16
+ # Remove HTML tags, collapse whitespace
17
  soup = BeautifulSoup(text, "html.parser")
18
  cleaned = soup.get_text(separator=" ", strip=True)
19
+ cleaned = " ".join(cleaned.split())
20
+ return cleaned
21
 
22
  def is_low_quality(text: str) -> bool:
23
+ """Detect navigation garbage, footers, or low-word-count dumps."""
24
+ if not text or len(text.split()) < 120:
25
  return True
 
26
  junk_markers = [
27
+ "subscribe", "click here", "latest headlines", "more from", "privacy policy",
28
+ "video", "terms of service", "back to top", "all rights reserved"
 
29
  ]
 
30
  return any(marker in text.lower() for marker in junk_markers)
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
33
+ # Try Trafilatura first
34
  try:
35
  response = requests.get(url, timeout=timeout, headers=HEADERS)
36
+ if response.status_code == 200:
37
+ html = response.text
38
+ extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
39
+ if extracted:
40
+ text = clean_text(extracted)
41
+ if not is_low_quality(text):
42
+ return text
43
+ else:
44
+ print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  except Exception as e:
46
  print(f"⚠️ Trafilatura failed for {url}: {e}")
47
 
 
56
  return text
57
  else:
58
  print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
 
 
59
  except Exception as e:
60
  print(f"⚠️ Newspaper3k failed for {url}: {e}")
61
 
 
 
 
 
 
 
 
 
 
 
62
  return None