|
|
import re |
|
|
import string |
|
|
from statistics import mode |
|
|
|
|
|
import emoji |
|
|
from langdetect import detect |
|
|
from spellchecker import SpellChecker |
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Run *text* through the cleaning steps and return the result.

    The steps run in a fixed order, each one receiving the output of the
    previous step: URLs are removed first, then HTML tags, then hashtags.
    """
    without_urls = remove_URL(text)
    without_html = remove_html(without_urls)
    return remove_hashtags(without_html)
|
|
|
|
|
|
|
|
def remove_URL(text: str) -> str:
    """Return *text* with URLs deleted.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character. The surrounding
    whitespace is left in place.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)
|
|
|
|
|
|
|
|
def remove_hashtags(text: str) -> str:
    """Return *text* with hashtags deleted.

    A hashtag is a ``#`` followed by at least one non-whitespace character;
    the whole run up to the next whitespace is removed. A lone ``#`` is kept.
    """
    return re.sub(r"#\S+", "", text)
|
|
|
|
|
|
|
|
def remove_html(text: str) -> str:
    """Return *text* with ``<...>`` tag-like spans deleted.

    Each match is the shortest span from a ``<`` to the next ``>`` on the
    same line; the pattern does not cross newline characters.
    """
    return re.sub(r"<.*?>", "", text)
|
|
|
|
|
|
|
|
def remove_emojis(text: str) -> str:
    """Return *text* with emoji removed.

    Each emoji is first rewritten by ``emoji.demojize`` into a name wrapped
    in a private marker string, and every marker-wrapped name is then
    deleted with a regular expression.

    Args:
        text: The string to clean.

    Returns:
        The string with all marker-wrapped emoji names removed.
    """
    delimiter = "#4="
    marker = re.escape(delimiter)
    # Lazy quantifier: a greedy ``\S+`` would run from the first marker to
    # the last one in a whitespace-free run and delete ordinary text that
    # sits between two emoji (e.g. "<emoji>hi<emoji>" would lose "hi").
    wrapped_name = re.compile(rf"{marker}\S+?{marker}")

    # Up to five passes, as in the original implementation.
    # NOTE(review): presumably the repeats catch emoji that only form once
    # neighbouring characters have been removed - confirm.
    for _ in range(5):
        demojized = emoji.demojize(string=text, delimiters=(delimiter, delimiter))
        stripped = wrapped_name.sub("", demojized)
        if stripped == text:
            # A pass that changes nothing means later passes cannot either.
            break
        text = stripped
    return text
|
|
|
|
|
|
|
|
def remove_punct(text):
    """Return *text* with every ``string.punctuation`` character deleted."""
    # A translation table mapping each punctuation character to None
    # makes ``str.translate`` drop it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(strip_table)
|
|
|
|
|
|
|
|
def correct_spellings(text):
    """Return *text* with words flagged as unknown replaced by a correction.

    The text is split on whitespace. A word is replaced only when it appears
    in the set returned by ``SpellChecker.unknown`` and
    ``SpellChecker.correction`` yields a value other than ``None``; every
    other word is kept as written. The words are re-joined with single
    spaces, so the original whitespace layout is not preserved.

    Args:
        text: The string to spell-check.

    Returns:
        The corrected words joined by single spaces.
    """
    spell = SpellChecker()
    words = text.split()
    # NOTE(review): membership is tested against this set exactly as
    # returned; if the checker normalises case, capitalised words will
    # never match and are left untouched - confirm that is intended.
    misspelled_words = spell.unknown(words)

    # Corrections are only requested for flagged words, and each distinct
    # word is looked up once; asking for a correction for every word is
    # wasted work because the result is discarded for known words.
    suggestions = {}
    corrected_text = []
    for word in words:
        if word not in misspelled_words:
            corrected_text.append(word)
            continue
        if word not in suggestions:
            suggestions[word] = spell.correction(word)
        suggestion = suggestions[word]
        corrected_text.append(word if suggestion is None else suggestion)
    return " ".join(corrected_text)
|
|
|
|
|
|
|
|
def remove_backslashes(text: str) -> str:
    """Return *text* with backslash sequences deleted.

    A sequence is a literal backslash together with the run of
    non-whitespace characters that follows it. A backslash followed
    directly by whitespace (or by the end of the string) is kept.
    """
    return re.sub(r"\\\S+", "", text)
|
|
|
|
|
|
|
|
def detect_language(list_of_texts: list[str]) -> str | None: |
|
|
|
|
|
if len(list_of_texts) == 0: |
|
|
return None |
|
|
|
|
|
languages = [] |
|
|
|
|
|
for text in list_of_texts: |
|
|
try: |
|
|
lan = detect(text) |
|
|
languages.append(lan) |
|
|
except Exception: |
|
|
continue |
|
|
|
|
|
return mode(languages) if len(languages) else None |
|
|
|