import math
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
# nltk.download("omw-1.4")

# Initialize the WordNet lemmatizer
wnl = WordNetLemmatizer()

# Background corpus used to compute the IDF statistics
files = [f'./assets/text{i}.txt' for i in range(1, 11)]

# Load the comma-separated stopword list
with open("gist_stopwords.txt", "r") as gist_file:
    stopwords = gist_file.read().split(",")


def read_file(name):
    with open(name, 'r') as file:
        return file.read()


def process_string(name):
    text = name.lower()
    # Tokenize: \w+ matches runs of letters, digits, or underscores,
    # which also strips punctuation
    tokens = re.findall(r'\w+', text)
    # Remove commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # Reduce words to their root form, e.g. 'cars' -> 'car' (nouns only, since pos='n')
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens


def process_tokens(tokens, st_global_words):
    # Raw counts and term frequencies of every vocabulary word in this document
    counts = Counter(tokens)
    total = len(tokens)
    freq_dict = {}
    tf_dict = {}
    for word in st_global_words:
        freq_dict[word] = counts[word]
        tf_dict[word] = counts[word] / total if total else 0.0
    return freq_dict, tf_dict


def main(input1, input2):
    # The two inputs to compare come first, followed by the background corpus
    processed_files = [input1, input2] + [read_file(file) for file in files]
    processed_strings = [process_string(file) for file in processed_files]

    # Vocabulary: every token that appears in any document
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)

    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, st_global_words)
        processed_tokens.append((freq_dict, tf_dict))

    # Inverse document frequency: log(N / number of documents containing the word)
    idf_dict = {}
    for word in st_global_words:
        cnt = sum(1 for freq_dict, _ in processed_tokens if freq_dict[word] > 0)
        idf_dict[word] = math.log(len(processed_tokens) / cnt)

    df = pd.DataFrame({'word': list(st_global_words)})
    df['idf_col'] = [idf_dict[word] for word in st_global_words]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in st_global_words]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in st_global_words]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']

    # Collect the TF-IDF vectors, one row per document
    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals


def cosine_diff(A, B):
    # Cosine similarity between two vectors
    dot_product = sum(A[i] * B[i] for i in range(len(A)))
    norm_A = math.sqrt(sum(A[i] ** 2 for i in range(len(A))))
    norm_B = math.sqrt(sum(B[i] ** 2 for i in range(len(B))))
    return dot_product / (norm_A * norm_B)


def euclidean(A, B):
    # Euclidean distance between two vectors
    return math.sqrt(sum((A[i] - B[i]) ** 2 for i in range(len(A))))


def final_main(input1, input2):
    tf_idf_vals = main(input1, input2)
    outputString = ""
    similarity = cosine_diff(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Cosine similarity: {round(similarity * 100, 2)}%\n"
    # euclidean() already returns the square root, so report the distance directly
    # (it is an unbounded distance, not a percentage)
    diff = euclidean(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Euclidean distance (difference): {round(diff, 4)}\n"
    return outputString
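
# Minimal usage sketch: final_main takes the two raw strings to compare and
# assumes the ./assets/text*.txt corpus files and gist_stopwords.txt exist
# next to this script. The sample strings below are hypothetical placeholders.
if __name__ == "__main__":
    sample_a = "The quick brown fox jumps over the lazy dog."
    sample_b = "A fast brown fox leaped over a sleeping dog."
    print(final_main(sample_a, sample_b))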