import math
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Download WordNet data (omw-1.4 is required by the WordNet corpus in newer NLTK releases).
nltk.download("wordnet")
nltk.download("omw-1.4")

# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()

# The ten reference documents that make up the comparison corpus.
files = [f'./assets/text{i}.txt' for i in range(1, 11)]

# Load the comma-separated stopword list.
with open("gist_stopwords.txt", "r") as gist_file:
    stopwords = gist_file.read().split(",")

def read_file(name):
    with open(name, 'r') as file:
        return file.read()

def process_string(name):
    text = name.lower()
    # Remove punctuation: \w+ matches runs of word characters (letters, digits, underscore).
    tokens = re.findall(r'\w+', text)
    # Drop commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # Reduce words to their root form, e.g. 'dogs' to 'dog' (noun lemmatization).
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens

def process_tokens(tokens, st_global_words):
    # Count each token once (O(n)) instead of calling tokens.count() per word.
    counts = Counter(tokens)
    total = len(tokens)
    freq_dict = {}
    tf_dict = {}
    for word in st_global_words:
        freq_dict[word] = counts[word]
        # Term frequency: occurrences divided by document length (0 for an empty document).
        tf_dict[word] = freq_dict[word] / total if total else 0.0
    return freq_dict, tf_dict
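
# Illustrative example (not part of the pipeline): for
#   tokens = ["cat", "dog", "cat"] and st_global_words = {"bird", "cat", "dog"},
# process_tokens returns
#   freq = {"cat": 2, "dog": 1, "bird": 0}
#   tf   = {"cat": 2/3, "dog": 1/3, "bird": 0.0}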

def main(input1, input2):
    processed_files = [read_file(file) for file in files]
    # The two user inputs become documents 0 and 1; the corpus files follow.
    processed_files.insert(0, input2)
    processed_files.insert(0, input1)
    processed_strings = [process_string(file) for file in processed_files]
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)
    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, st_global_words)
        processed_tokens.append((freq_dict, tf_dict))
    # Inverse document frequency: idf(w) = ln(N / df(w)), where df(w) is the
    # number of documents containing w. cnt is never zero because every word
    # in st_global_words occurs in at least one document.
    idf_dict = {}
    for word in st_global_words:
        cnt = sum(1 for freq_dict, _ in processed_tokens if freq_dict[word] > 0)
        idf_dict[word] = math.log(len(processed_tokens) / cnt)

    df = pd.DataFrame({'word': list(st_global_words)})
    df['idf_col'] = [idf_dict[word] for word in st_global_words]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in st_global_words]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in st_global_words]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']

    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals
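
# main() returns an array of shape (num_documents, vocabulary_size): row 0 is
# the TF-IDF vector for input1, row 1 for input2, and the remaining rows cover
# the ./assets corpus files.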

def cosine_diff(A, B):
    # Cosine similarity: dot(A, B) / (|A| * |B|).
    dot_product = sum(A[i] * B[i] for i in range(len(A)))
    norm_A = math.sqrt(sum(A[i] ** 2 for i in range(len(A))))
    norm_B = math.sqrt(sum(B[i] ** 2 for i in range(len(B))))
    if norm_A == 0 or norm_B == 0:
        # An all-zero vector (e.g. an empty document) has no direction.
        return 0.0
    return dot_product / (norm_A * norm_B)

def euclidean(A, B):
    # Euclidean distance: sqrt of the sum of squared component differences.
    su = 0
    for i in range(len(A)):
        su += (A[i] - B[i]) ** 2
    return math.sqrt(su)

def final_main(input1, input2):
    tf_idf_vals = main(input1, input2)
    outputString = ""
    similarity = cosine_diff(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Cosine similarity: {round(similarity * 100, 2)}%\n"
    # euclidean() already returns the square root, and a distance is not
    # bounded to [0, 1], so report the raw value rather than a percentage.
    diff = euclidean(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Euclidean distance (difference): {round(diff, 2)}\n"
    return outputString
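
# Minimal usage sketch, assuming gist_stopwords.txt and the ./assets corpus
# are present next to this script; the two sample strings are illustrative.
if __name__ == "__main__":
    sample1 = "The quick brown fox jumps over the lazy dog."
    sample2 = "A fast brown fox leapt over a sleepy dog."
    print(final_main(sample1, sample2))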