# tel2 / clean.py
# mfoud444's picture
# all
# 30855e5
import pandas as pd
# Where the raw links are read from and where the cleaned result is written.
input_csv = 'telegram_links.csv'  # Replace with your input CSV file
output_csv = 'cleaned_telegram_links.csv'  # Output file after cleaning

# Load the whole CSV into memory as a DataFrame.
df = pd.read_csv(input_csv)

# Keep only the first row for each distinct value in the 'Telegram Links'
# column; the DataFrame is modified in place.
df.drop_duplicates(subset='Telegram Links', keep='first', inplace=True)
# Function to clean and filter URLs
def clean_url(url):
    """Extract a clean ``https://t.me/<name>`` link from a raw cell value.

    The raw value may have a truncated scheme (``tps://``), may be followed
    by a double quote or a closing parenthesis plus decoration, or may be
    followed by caption text separated by whitespace. Those artefacts are
    stripped and the remainder is validated.

    Args:
        url: Raw value from the 'Telegram Links' column. Anything that is not
            a string (for example the NaN pandas uses for an empty cell) is
            treated as unusable.

    Returns:
        The cleaned link as a string, or None when the value is not a
        ``https://t.me/`` link, has no name after the host, or contains a
        further ``/`` after the name.
    """
    # Empty CSV cells reach this function as NaN (a float); calling string
    # methods on them would raise AttributeError.
    if not isinstance(url, str):
        return None

    url = url.strip()

    # Repair a truncated scheme. Only the leading occurrence is rewritten so
    # that any later 'tps://' inside the value is left untouched.
    if url.startswith('tps://'):
        url = 'https://' + url[len('tps://'):]

    # Drop everything from the first '"' onwards, then everything from the
    # first ')' onwards (links are often followed by ')' and an emoji).
    url = url.split('"')[0]
    url = url.split(')')[0]

    # Keep only the first whitespace-delimited token; anything after it is
    # assumed to be caption text. Splitting on all whitespace also covers
    # tabs, and an empty token list means there is no link at all.
    tokens = url.split()
    if not tokens:
        return None
    url = tokens[0]

    # Only Telegram links with a non-empty name after the host are accepted.
    prefix = 'https://t.me/'
    if not url.startswith(prefix) or url == prefix:
        return None

    # The prefix itself already contains three '/' characters, so any extra
    # '/' means the link has a sub-path after the name; such links are
    # rejected.
    if url.count('/') > 3:
        return None

    return url
# Apply the cleaning function to every value in the 'Telegram Links' column
df['Telegram Links'] = df['Telegram Links'].apply(clean_url)
# Drop rows with None values (filtered out URLs)
df.dropna(subset=['Telegram Links'], inplace=True)
# Different raw values can clean down to the same link (e.g. one copy had a
# trailing ')' or caption text), so de-duplicate on the cleaned values too.
df.drop_duplicates(subset=['Telegram Links'], inplace=True)
# Save the cleaned DataFrame to a new CSV file, without the index column
df.to_csv(output_csv, index=False)
print(f"Cleaned links saved to {output_csv}")