Scrap-Dji / parser /cleaner.py
joel
Initial deployment: Scrap-Dji with API
dfdddb1
raw
history blame contribute delete
289 Bytes
import re
from bs4 import BeautifulSoup
def clean_html(raw_html: str) -> str:
    """Strip HTML tags from *raw_html* and normalise its whitespace.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, the
    text nodes are joined with a single space, every run of whitespace is
    collapsed to one space, and leading/trailing whitespace is removed.

    Args:
        raw_html: HTML markup to clean. ``None`` or an empty value is
            accepted and treated as "no content".

    Returns:
        The plain text on a single line, or ``""`` when there is no input.
    """
    # Missing content (None / empty) is routine when scraping; return early
    # instead of letting the parser raise on a non-string value.
    if not raw_html:
        return ""

    soup = BeautifulSoup(raw_html, "html.parser")
    # A space separator keeps words from adjacent elements from fusing
    # together (e.g. "<p>a</p><p>b</p>" -> "a b" rather than "ab").
    text = soup.get_text(separator=" ")
    return re.sub(r"\s+", " ", text).strip()