Spaces:

mfoud444
/

tel2

Runtime error

tel2 / clean.py

all

30855e5 about 1 month ago

1.59 kB

	import pandas as pd

	# Where the raw links are read from and where the cleaned result is written
	input_csv = 'telegram_links.csv'  # Replace with your input CSV file
	output_csv = 'cleaned_telegram_links.csv'  # Output file after cleaning

	# Load the CSV, then keep only the first row for each distinct raw link
	# (rows are compared on the 'Telegram Links' column only)
	df = pd.read_csv(input_csv)
	df = df.drop_duplicates(subset=['Telegram Links'])

	# Function to clean and filter URLs
	def clean_url(url):
	    """Normalize one raw cell into a single-segment ``https://t.me/`` link.

	    Args:
	        url: Raw cell value. Normally a string, but anything that is not a
	            string (for example the float NaN pandas produces for an empty
	            cell, or None) is treated as "no link".

	    Returns:
	        The cleaned URL string, or None when the value is not a string, is
	        empty after trimming, does not start with ``https://t.me/``, or
	        contains more than three ``/`` characters in total (i.e. anything
	        deeper than ``https://t.me/<name>``).
	    """
	    # Empty cells are not strings; calling str methods on them would raise.
	    if not isinstance(url, str):
	        return None

	    # Repair a truncated scheme ('tps://' -> 'https://'), prefix only.
	    if url.startswith('tps://'):
	        url = 'https://' + url[len('tps://'):]

	    # Drop everything from the first '"' onwards, then from the first ')'
	    # onwards (trailing decorations such as ')🔸').
	    url = url.split('"', 1)[0]
	    url = url.split(')', 1)[0]

	    # Keep only the first whitespace-delimited token. A value that is empty
	    # or all whitespace at this point has no token and is rejected instead
	    # of raising IndexError.
	    tokens = url.split()
	    if not tokens:
	        return None
	    url = tokens[0]

	    # Only Telegram links are kept.
	    if not url.startswith('https://t.me/'):
	        return None

	    # 'https://t.me/<name>' contains exactly three '/'; any additional '/'
	    # (message links, trailing slash, nested paths) is filtered out.
	    if url.count('/') > 3:
	        return None

	    return url

	# Apply the cleaning function to every value in the link column
	df['Telegram Links'] = df['Telegram Links'].apply(clean_url)

	# Drop rows whose link was filtered out (clean_url returned None)
	df.dropna(subset=['Telegram Links'], inplace=True)

	# Different raw cells can clean to the same URL (e.g. the same link with
	# different trailing text), so remove duplicates again after cleaning.
	df.drop_duplicates(subset=['Telegram Links'], inplace=True)

	# Save the cleaned DataFrame to a new CSV file, without the index column
	df.to_csv(output_csv, index=False)

	print(f"Cleaned links saved to {output_csv}")