from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import math
import nltk
import re
nltk.download("wordnet")
# nltk.download("omw-1.4")
# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()
file1 = './assets/text1.txt'
file2 = './assets/text2.txt'
file3 = './assets/text3.txt'
file4 = './assets/text4.txt'
file5 = './assets/text5.txt'
file6 = './assets/text6.txt'
file7 = './assets/text7.txt'
file8 = './assets/text8.txt'
file9 = './assets/text9.txt'
file10 = './assets/text10.txt'
files = [file1, file2, file3, file4, file5, file6, file7, file8, file9, file10]
# Load the comma-separated stopword list, stripping stray whitespace around each entry
with open("gist_stopwords.txt", "r") as gist_file:
    stopwords = [word.strip() for word in gist_file.read().split(",")]
def read_file(name):
    with open(name, 'r') as file:
        contents = file.read()
    return contents
def process_string(name):
    text = name.lower()
    # keep only runs of word characters (letters, digits, underscore), dropping punctuation
    tokens = re.findall(r'\w+', text)
    # remove commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # reduce nouns to their root form, e.g. 'cars' to 'car' (pos='n' lemmatizes nouns only)
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens
def process_tokens(tokens, st_global_words):
    # raw count and term frequency of every vocabulary word in this document
    freq_dict = {}
    tf_dict = {}
    for word in st_global_words:
        freq_dict[word] = tokens.count(word)
        tf_dict[word] = freq_dict[word] / len(tokens)
    return freq_dict, tf_dict
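# Example (hypothetical inputs): for tokens = ['cat', 'cat', 'dog'] and vocabulary
# {'cat', 'dog', 'fish'}, freq = {'cat': 2, 'dog': 1, 'fish': 0} and
# tf = {'cat': 2/3, 'dog': 1/3, 'fish': 0.0}.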
def main(input1, input2):
    # read the reference documents and prepend the two user inputs
    processed_files = [read_file(file) for file in files]
    processed_files.insert(0, input2)
    processed_files.insert(0, input1)
    processed_strings = [process_string(file) for file in processed_files]
    # build the global vocabulary across all documents
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)
    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, st_global_words)
        processed_tokens.append((freq_dict, tf_dict))
    # inverse document frequency: log(number of documents / documents containing the word)
    idf_dict = {}
    for word in st_global_words:
        cnt = 0
        for freq_dict, tf_dict in processed_tokens:
            if freq_dict[word] > 0:
                cnt += 1
        idf_dict[word] = math.log(len(processed_tokens) / cnt)
    df = pd.DataFrame({'word': list(st_global_words)})
    df['idf_col'] = [idf_dict[word] for word in st_global_words]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in st_global_words]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in st_global_words]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']
    # one tf-idf row vector per document (the two inputs first, then the ten files)
    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals
def cosine_diff(A, B):
    # cosine similarity: dot(A, B) / (||A|| * ||B||)
    dot_product = sum(A[i] * B[i] for i in range(len(A)))
    norm_A = math.sqrt(sum(A[i] ** 2 for i in range(len(A))))
    norm_B = math.sqrt(sum(B[i] ** 2 for i in range(len(B))))
    similarity = dot_product / (norm_A * norm_B)
    return similarity
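# Example (hypothetical inputs): cosine_diff([1.0, 0.0], [0.0, 1.0]) is 0.0 (orthogonal
# vectors), while cosine_diff([1.0, 2.0], [2.0, 4.0]) is 1.0 (vectors pointing the same way).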
def euclidean(A, B):
    su = 0
    for i in range(len(A)):
        su += (A[i] - B[i]) ** 2
    return math.sqrt(su)
def final_main(input1, input2):
    tf_idf_vals = main(input1, input2)
    outputString = ""
    similarity = cosine_diff(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Cosine similarity: {round(similarity * 100, 2)}%\n"
    # euclidean() already returns the square root, so report the distance directly
    diff = euclidean(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Euclidean distance (difference): {round(diff, 4)}\n"
    return outputString
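# Usage sketch, assuming './assets/text1.txt' ... './assets/text10.txt' and
# 'gist_stopwords.txt' exist next to this script; the two sample strings below are
# hypothetical inputs, not part of the original corpus.
if __name__ == "__main__":
    sample1 = "The quick brown fox jumps over the lazy dog."
    sample2 = "A quick brown dog jumps over a lazy fox."
    print(final_main(sample1, sample2))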