import math
import re

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
# nltk.download("omw-1.4")

# Initialize the WordNet lemmatizer
wnl = WordNetLemmatizer()
# Paths to the ten reference documents that make up the comparison corpus
files = [f'./assets/text{i}.txt' for i in range(1, 11)]
# Load the comma-separated stop-word list into a Python list
with open("gist_stopwords.txt", "r") as gist_file:
    stopwords = gist_file.read().split(",")
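# Note (illustrative assumption, not verified against the actual file):
# gist_stopwords.txt is expected to hold the stop words on a single
# comma-separated line, e.g. "i,me,my,we,the,a,an,is,are", so that
# split(",") yields one stop word per element.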
def read_file(name):
    # Return the full contents of a text file
    with open(name, 'r') as file:
        contents = file.read()
    return contents
def process_string(name):
    # Lowercase the input text
    text = name.lower()
    # Remove punctuation using a regex that matches runs of word characters
    # (letters, digits, or underscore) of length 1 or more
    tokens = re.findall(r'\w+', text)
    # Remove commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # Convert words to their root form (noun lemmatization), e.g. 'cats' to 'cat'
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens
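# Illustrative example (assuming 'the' and 'are' appear in gist_stopwords.txt):
#   process_string("The cats are chasing mice")
# tokenizes to ['the', 'cats', 'are', 'chasing', 'mice'], drops the stop words,
# and lemmatizes the remaining tokens as nouns to ['cat', 'chasing', 'mouse'].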
def process_tokens(tokens, st_global_words):
    # For every word in the global vocabulary, record its raw count in this
    # document and its term frequency (count divided by document length)
    freq_dict = {}
    tf_dict = {}
    for word in st_global_words:
        freq_dict[word] = tokens.count(word)
        tf_dict[word] = freq_dict[word] / len(tokens)
    return freq_dict, tf_dict
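# Worked example (illustrative numbers, not taken from the corpus): if a
# document has 50 tokens after preprocessing and 'cat' occurs 5 times in it,
# then freq_dict['cat'] == 5 and tf_dict['cat'] == 5 / 50 == 0.1.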
def main(input1, input2):
    # Read the reference documents and put the two user inputs at the front,
    # so rows 0 and 1 of the result correspond to input1 and input2
    processed_files = [read_file(file) for file in files]
    processed_files.insert(0, input2)
    processed_files.insert(0, input1)
    processed_strings = [process_string(file) for file in processed_files]
    # Build the global vocabulary across all documents
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)
    # Raw counts and term frequencies for each document
    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, st_global_words)
        processed_tokens.append((freq_dict, tf_dict))
    # Inverse document frequency: log(N / number of documents containing the word)
    idf_dict = {}
    for word in st_global_words:
        cnt = 0
        for freq_dict, tf_dict in processed_tokens:
            if freq_dict[word] > 0:
                cnt += 1
        idf_dict[word] = math.log(len(processed_tokens) / cnt)
    # Assemble a DataFrame with one row per word and freq/tf/tf-idf columns per document
    df = pd.DataFrame({'word': list(st_global_words)})
    df['idf_col'] = [idf_dict[word] for word in st_global_words]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in st_global_words]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in st_global_words]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']
    # Collect the tf-idf columns into a (documents x vocabulary) array
    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals
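# Note on the weighting above (the standard tf-idf formulation, stated here for
# reference): tfidf(w, d) = tf(w, d) * log(N / df(w)), where N is the number of
# documents (12 here: the two inputs plus the ten reference texts) and df(w) is
# how many of them contain w. A word appearing in every document gets
# idf = log(1) = 0 and therefore contributes nothing to the comparison.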
def cosine_diff(A, B):
    # Cosine similarity: dot(A, B) / (||A|| * ||B||)
    dot_product = sum(A[i] * B[i] for i in range(len(A)))
    norm_A = math.sqrt(sum(A[i] ** 2 for i in range(len(A))))
    norm_B = math.sqrt(sum(B[i] ** 2 for i in range(len(B))))
    similarity = dot_product / (norm_A * norm_B)
    return similarity
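# Quick sanity check (illustrative): cosine_diff([1, 0], [1, 0]) == 1.0 for
# identical directions and cosine_diff([1, 0], [0, 1]) == 0.0 for orthogonal
# vectors; since tf-idf values are non-negative, the similarity here lies in [0, 1].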
def euclidean(A, B):
    # Euclidean distance: square root of the sum of squared coordinate differences
    su = 0
    for i in range(len(A)):
        su += (A[i] - B[i]) ** 2
    return math.sqrt(su)
def final_main(input1, input2):
    tf_idf_vals = main(input1, input2)
    outputString = ""
    # Rows 0 and 1 of tf_idf_vals are the tf-idf vectors of the two inputs
    similarity = cosine_diff(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Cosine similarity: {round(similarity * 100, 2)}%\n"
    # euclidean() already returns the square root, so report it directly
    diff = euclidean(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Euclidean Distance (difference): {round(diff * 100, 2)}%\n"
    return outputString
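# Usage sketch (an assumption, not part of the original file, which only defines
# the functions above); the two strings are placeholder inputs, and the asset
# files plus gist_stopwords.txt must exist alongside the script.
if __name__ == "__main__":
    sample_a = "Machine learning models learn patterns from data."
    sample_b = "Models in machine learning discover patterns in data."
    print(final_main(sample_a, sample_b))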