"""Naive plagiarism check: TF-IDF + cosine similarity between two texts."""

import functools

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used below (tokenizer models and stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')


@functools.lru_cache(maxsize=1)
def _english_stopwords() -> frozenset:
    """Return the English stop-word list as a set, loaded once on first use.

    Loading lazily (rather than at import time) keeps the failure point for a
    missing corpus at the first call, and the set gives O(1) membership tests
    instead of re-reading and linearly scanning the list for every token.
    """
    return frozenset(stopwords.words('english'))


# Preprocess text
def preprocess_text(text: str) -> str:
    """Normalize *text* for comparison.

    Lowercases the text, tokenizes it with ``word_tokenize``, drops tokens
    that are not purely alphanumeric, drops English stop words, and joins the
    remaining tokens with single spaces.

    Args:
        text: Raw input text.

    Returns:
        The surviving tokens joined by spaces; an empty string if none survive.
    """
    stop_words = _english_stopwords()
    tokens = word_tokenize(text.lower())
    kept = [
        token
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(kept)


# Calculate text similarity using TF-IDF and cosine similarity
def calculate_similarity(text1: str, text2: str) -> float:
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    Both texts are first passed through ``preprocess_text``. The vectorizer is
    fitted on just these two documents.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity as a float, or ``0.0`` when the vectorizer
        cannot build a vocabulary from the two preprocessed texts.
    """
    preprocessed_text1 = preprocess_text(text1)
    preprocessed_text2 = preprocess_text(text2)

    tfidf_vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            [preprocessed_text1, preprocessed_text2]
        )
    except ValueError:
        # fit_transform raises ValueError on an empty vocabulary, e.g. when
        # both texts contained only stop words, punctuation, or tokens the
        # vectorizer's own tokenizer discards. Nothing is comparable, so
        # report no similarity instead of crashing.
        return 0.0

    return float(cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0])


# Replace 'text1' and 'text2' with the text you want to compare
text1 = "This is the original text."
text2 = "📣 Exciting news! 🚀 The Falcon 180B has landed, revolutionizing the world of open LLMs. 🦅 Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. 🤗 Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here 👉 Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! 🌟 #AI #MachineLearning #AmazonSageMaker."

# Calculate text similarity
similarity = calculate_similarity(text1, text2)

# Set a threshold for plagiarism detection (adjust as needed)
threshold = 0.8

# Check if the similarity exceeds the threshold
if similarity >= threshold:
    print("Plagiarism detected!")
else:
    print("No plagiarism detected.")