Spaces:

WinterJet2021
/

New_Hybrid_Text_Classifier_Model

Build error

App Files Files Community

New_Hybrid_Text_Classifier_Model / utils.py

WinterJet2021

Upload 13 files

673d9a1 verified about 1 year ago

raw

history blame contribute delete

3.12 kB

	import re
	import pandas as pd

	# Category label -> indicator terms. When a term occurs in the cleaned text,
	# the category label and the term are each appended twice to the output.
	_DOMAIN_TERMS = {
	    'music': ['music', 'guitar', 'band', 'concert', 'gig', 'sing', 'song', 'play music', 'musician'],
	    'food': ['food', 'cook', 'cuisine', 'recipe', 'restaurant', 'eat', 'culinary', 'bake', 'chef'],
	    'sports': ['sport', 'run', 'gym', 'fitness', 'workout', 'exercise', 'athletic', 'training'],
	    'arts': ['art', 'paint', 'draw', 'museum', 'gallery', 'exhibit', 'creative', 'design'],
	    'technology': ['tech', 'code', 'program', 'software', 'developer', 'computer', 'app', 'digital'],
	    'education': ['education', 'learn', 'course', 'class', 'study', 'book', 'read', 'academic'],
	    'travel': ['travel', 'trip', 'hike', 'explore', 'tour', 'visit', 'journey', 'destination'],
	}

	# Deliberately small stopword list so that more domain indicators survive.
	_CORE_STOPWORDS = frozenset({
	    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'the', 'a', 'an', 'and', 'but',
	    'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
	    'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
	    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
	    'under', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were',
	})

	# Compiled once at import time instead of on every call.
	_SPECIAL_CHARS_RE = re.compile(r'[^\w\s\|-]')      # keep word chars, whitespace, '|' and '-'
	_WHITESPACE_RE = re.compile(r'\s+')
	_PIPE_SEPARATOR_RE = re.compile(r'\s*\|\s*')       # a '|' plus any surrounding whitespace
	_STANDALONE_NUMBER_RE = re.compile(r'\b\d+\b')     # "2023" matches, the "3" in "web3" does not


	def preprocess_text(text):
	    """
	    Clean a piece of text and emphasize domain-specific indicator terms.

	    Steps, in order:
	      1. Missing values (None, NaN, pd.NA, ...) yield an empty string; any
	         other non-string value is converted with ``str()``.
	      2. Lowercase; replace every character that is not a word character,
	         whitespace, ``|`` or ``-`` with a space; collapse whitespace runs.
	      3. For every domain term found in the cleaned text, append the
	         category label twice and the term twice.
	      4. Split on ``|`` separators, drop standalone numbers and core
	         stopwords, and join what is left with single spaces.

	    Args:
	        text: The raw text. Usually a ``str``; missing values and other
	            scalars are tolerated.

	    Returns:
	        The processed text as a single space-separated string ("" when
	        nothing survives the filtering).
	    """
	    # Handle missing values and non-string input before any str method is used.
	    if text is None:
	        return ""
	    if not isinstance(text, str):
	        # pd.isna is only asked about scalars; on array-likes it returns an
	        # array, which cannot be used as a single truth value.
	        if pd.api.types.is_scalar(text) and pd.isna(text):
	            return ""
	        text = str(text)

	    # Normalize case, punctuation and whitespace.
	    text = text.lower()
	    text = _SPECIAL_CHARS_RE.sub(' ', text)
	    text = _WHITESPACE_RE.sub(' ', text)

	    # Emphasize domain terms: repeating the category label and the term
	    # raises their weight in a later vectorization step.
	    # NOTE(review): this is plain substring matching, so 'app' also fires on
	    # "happy" and 'art' on "party". Kept as-is because any model fitted on
	    # this output would see different features if it changed - confirm
	    # before tightening it to word boundaries.
	    emphasis = [
	        f" {category} {category} {term} {term}"
	        for category, terms in _DOMAIN_TERMS.items()
	        for term in terms
	        if term in text
	    ]
	    modified_text = text + ''.join(emphasis)

	    # Split on the '|' separator. The previous pattern r'\s\\|\s' meant
	    # "whitespace followed by a backslash, or whitespace": it split on every
	    # single space and never on '|', so pipes leaked into the output.
	    processed_parts = []
	    for part in _PIPE_SEPARATOR_RE.split(modified_text):
	        # Remove standalone numbers but keep words containing digits ("web3").
	        part = _STANDALONE_NUMBER_RE.sub('', part)
	        kept_words = [word for word in part.split() if word not in _CORE_STOPWORDS]
	        if kept_words:
	            processed_parts.append(' '.join(kept_words))

	    return ' '.join(processed_parts).strip()