import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data this script needs: a tokenizer model for word_tokenize
# and the stopword lists. NLTK 3.8.2 and later load the tokenizer from the
# 'punkt_tab' resource instead of 'punkt', so both are requested to keep the
# script working on old and new NLTK releases alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Preprocess text
def preprocess_text(text):
    """Normalize *text* before it is vectorized.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric (``str.isalnum``) and English stopwords are
    then dropped.

    Args:
        text: The raw string to normalize.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when no token survives the filters.
    """
    # Build the stopword set once per call. Calling stopwords.words('english')
    # inside the filter would rebuild the list for every token and make each
    # membership test a linear scan.
    english_stopwords = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    return ' '.join(
        token
        for token in tokens
        if token.isalnum() and token not in english_stopwords
    )

# Calculate text similarity using TF-IDF and cosine similarity
def calculate_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    Both texts go through ``preprocess_text`` and are vectorized together,
    so the vocabulary and IDF weights are derived from this pair alone.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The cosine similarity between the two TF-IDF rows. Returns 0.0 when
        neither text yields a term the vectorizer can use (for example, both
        consist only of stopwords or punctuation).
    """
    documents = [preprocess_text(text1), preprocess_text(text2)]

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no
        # document contributes a token. There is nothing to compare, so
        # report no similarity instead of crashing the caller.
        return 0.0

    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

# Sample inputs: swap in the two texts you want to compare.
text1 = "This is the original text."
text2 = "📣 Exciting news! 🚀 The Falcon 180B has landed, revolutionizing the world of open LLMs. 🦅 Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. 🤗 Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here 👉 Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! 🌟 #AI #MachineLearning #AmazonSageMaker."

# Similarity score at or above which the pair is reported as plagiarism
# (tune as needed).
threshold = 0.8

# Score the pair of texts.
similarity = calculate_similarity(text1, text2)

# Report the verdict.
print("Plagiarism detected!" if similarity >= threshold else "No plagiarism detected.")