File size: 822 Bytes
f3d8098
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from string import punctuation
import re

def text_cleaning(text: str) -> str:
    """Normalize free-form text into a lowercase, space-separated string.

    The cleaning steps are applied in this order:

    1. Delete every double quote and every run of non-whitespace characters
       that begins with ``http`` or ``www``. Matching is case-insensitive,
       so ``HTTP://...`` and ``WWW....`` are removed as well.
    2. Split on whitespace and drop tokens that start with ``@`` or ``#``.
    3. Strip ASCII punctuation from both ends of each token and drop tokens
       that end up empty.
    4. Drop tokens whose first character is a digit.
    5. Lowercase the remaining tokens.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.

    >>> text_cleaning('Read THIS: HTTPS://Example.com @bob #wow 2day "Nice!"')
    'read this nice'
    """
    # URL schemes and host names are case-insensitive. Without IGNORECASE an
    # upper- or mixed-case URL would survive this step and later be lowercased
    # into the output as an ordinary token.
    without_urls = re.sub(r'http\S+|www\S+|\"', '', text, flags=re.IGNORECASE)

    cleaned = []
    for token in without_urls.split():
        # Mentions/hashtags must be detected before stripping, because '@'
        # and '#' are themselves punctuation and would be stripped away.
        if token.startswith(('@', '#')):
            continue

        token = token.strip(punctuation)

        # Punctuation-only tokens are empty after stripping; the emptiness
        # check also guards the token[0] access.
        if not token or token[0].isdigit():
            continue

        cleaned.append(token.lower())

    return " ".join(cleaned)