Spaces:

devinlee14
/

test

Sleeping

File size: 1,013 Bytes

e0c55bb

"""
// function.py //
This programme was created to store the function used through out this project.
"""


import re
from nltk.tokenize import word_tokenize

# Create A Function for Text Preprocessing
def text_preprocessing(text, lemmatizer, sw):
  # Case folding
  text = text.lower()

  # Mention removal
  text = re.sub("@[A-Za-z0-9_]+", " ", text)

  # Hashtags removal
  text = re.sub("#[A-Za-z0-9_]+", " ", text)

  # Newline removal (\n)
  text = re.sub(r"\\n", " ",text)

  # Whitespace removal
  text = text.strip()

  # URL removal
  text = re.sub(r"http\S+", " ", text)
  text = re.sub(r"www.\S+", " ", text)

  # Non-letter removal (such as emoticon, symbol (like μ, $, 兀), etc
  text = re.sub("[^A-Za-z\s']", " ", text)

  # Tokenization
  tokens = word_tokenize(text)
  
  # Stopwords removal
  tokens = [word for word in tokens if word not in sw]
  
  # Lemmatization
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  
  # Combining Tokens
  text = ' '.join(tokens)
  
  return text