import pandas as pd
# Input and output file paths.
input_csv = 'telegram_links.csv'  # Replace with your input CSV file
output_csv = 'cleaned_telegram_links.csv'  # Output file after cleaning

# Read the CSV file into a pandas DataFrame. The file must contain a
# 'Telegram Links' column; empty cells in it are loaded as NaN.
df = pd.read_csv(input_csv)

# Drop rows whose raw link text is an exact duplicate of an earlier row.
# This runs on the uncleaned text, so it only catches byte-identical entries.
df.drop_duplicates(subset=['Telegram Links'], inplace=True)
# Function to clean and filter URLs
def clean_url(url):
    """Normalize a raw scraped value into a bare ``https://t.me/...`` link.

    The value is repaired and trimmed in this order:

    1. a scheme truncated to ``tps://`` is restored to ``https://``;
    2. everything from the first ``"`` onward is dropped;
    3. everything from the first ``)`` onward is dropped;
    4. only the first whitespace-delimited token is kept.

    Args:
        url: The raw cell value. Normally a string, but may be a non-string
            such as ``None`` or float NaN for an empty CSV cell.

    Returns:
        The cleaned URL string, or ``None`` when the value is not a string,
        is empty after trimming, does not start with ``https://t.me/``, or
        contains more than three ``/`` characters in total.
    """
    # Empty CSV cells arrive as float NaN rather than str; str methods would
    # raise AttributeError on them, so treat any non-string as "no link".
    if not isinstance(url, str):
        return None

    # Fix a malformed scheme ('tps://' -> 'https://'). Only the prefix is
    # rewritten so that no later 'tps://' substring is touched.
    broken_scheme = 'tps://'
    if url.startswith(broken_scheme):
        url = 'https://' + url[len(broken_scheme):]

    # Remove everything after '"' if present.
    url = url.split('"')[0]

    # Remove trailing decoration that follows a ')' (e.g. ')' plus an emoji).
    url = url.split(')')[0]

    # Remove any non-URL text: keep only the first whitespace-delimited token,
    # which is assumed to be the URL. Nothing left means there is no link.
    tokens = url.split()
    if not tokens:
        return None
    url = tokens[0]

    # Only Telegram links are kept.
    if not url.startswith('https://t.me/'):
        return None

    # 'https://t.me/<name>' already contains three '/', so any additional '/'
    # (a deeper path segment) rejects the link.
    # NOTE(review): the earlier comment said "up to 2 '/' after https://",
    # which does not match this threshold; behaviour is kept as coded.
    if url.count('/') > 3:
        return None

    return url
# Apply the cleaning function to every value in the 'Telegram Links' column.
df['Telegram Links'] = df['Telegram Links'].apply(clean_url)

# Drop rows whose link was rejected (clean_url returned None).
df.dropna(subset=['Telegram Links'], inplace=True)

# Different raw strings can clean down to the same URL, so de-duplicate
# again on the cleaned values before writing them out.
df.drop_duplicates(subset=['Telegram Links'], inplace=True)

# Save the cleaned DataFrame to a new CSV file, without the index column.
df.to_csv(output_csv, index=False)
print(f"Cleaned links saved to {output_csv}")