from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import math
import nltk
import re

nltk.download("wordnet")
# nltk.download("omw-1.4")

# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()

# Reference document(s) added to the corpus so IDF is computed over more
# than just the two user inputs.
file3 = './example.txt'

files = [file3]

# Load the comma-separated stopword list from gist_stopwords.txt.
with open("gist_stopwords.txt", "r") as gist_file:
    stopwords = gist_file.read().split(",")

def read_file(name):
    with open(name, 'r') as file:
        return file.read()

def process_string(raw_text):
    text = raw_text.lower()
    # Tokenize: \w+ keeps runs of letters, digits, and underscores, which also strips punctuation.
    tokens = re.findall(r'\w+', text)
    # Drop commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # Reduce words to their root form, e.g. 'cats' to 'cat' (noun lemmatization).
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens
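
# Illustrative sketch of process_string (actual output depends on the stopword
# list in gist_stopwords.txt; 'running' is left unchanged because lemmatization
# here uses pos='n'):
#   process_string("The cats are running!")  ->  e.g. ['cat', 'running']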

def process_tokens(tokens, st_global_words):
    # Raw count and term frequency of every vocabulary word in this document.
    freq_dict = {}
    tf_dict = {}
    for word in st_global_words:
        freq_dict[word] = tokens.count(word)
        # Guard against an empty document to avoid division by zero.
        tf_dict[word] = freq_dict[word] / len(tokens) if tokens else 0.0
    return freq_dict, tf_dict
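
# Minimal sketch of what process_tokens returns for a toy document and
# vocabulary (values follow directly from the definitions above):
#   freq, tf = process_tokens(["cat", "sat", "cat"], ["cat", "sat", "dog"])
#   # freq -> {"cat": 2, "sat": 1, "dog": 0}
#   # tf   -> {"cat": 2/3, "sat": 1/3, "dog": 0.0}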

def main(input1, input2):
    processed_files = [read_file(path) for path in files]
    processed_files.append(input1)
    processed_files.append(input2)
    processed_strings = [process_string(text) for text in processed_files]

    # Build the shared vocabulary across all documents, then freeze it into a
    # fixed order so every DataFrame column lines up with the same words.
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)
    vocabulary = sorted(st_global_words)

    # Per-document raw counts and term frequencies.
    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, vocabulary)
        processed_tokens.append((freq_dict, tf_dict))

    # Inverse document frequency: idf(w) = ln(N / df(w)), where df(w) is the
    # number of documents containing w. Every vocabulary word occurs in at
    # least one document, so the denominator is never zero.
    idf_dict = {}
    for word in vocabulary:
        cnt = sum(1 for freq_dict, _ in processed_tokens if freq_dict[word] > 0)
        idf_dict[word] = math.log(len(processed_tokens) / cnt)

    df = pd.DataFrame({'word': vocabulary})
    df['idf_col'] = [idf_dict[word] for word in vocabulary]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in vocabulary]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in vocabulary]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']

    # One TF-IDF vector per document: row 0 comes from `files`, rows 1 and 2
    # from input1 and input2.
    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals

def cosine_diff(A, B):
    # Cosine similarity between two equal-length vectors.
    dot_product = sum(a * b for a, b in zip(A, B))
    norm_A = math.sqrt(sum(a ** 2 for a in A))
    norm_B = math.sqrt(sum(b ** 2 for b in B))
    return dot_product / (norm_A * norm_B)

def euclidean(A, B):
    # Euclidean distance between two equal-length vectors.
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(A, B)))
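
# Illustrative check of the two metrics on tiny vectors:
#   cosine_diff([1.0, 0.0], [1.0, 0.0])  ->  1.0          (identical direction)
#   euclidean([1.0, 0.0], [0.0, 1.0])    ->  sqrt(2) ~ 1.414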

def final_main(input1, input2):
    # Rows 1 and 2 of tf_idf_vals correspond to input1 and input2;
    # row 0 is the reference document loaded from `files`.
    tf_idf_vals = main(input1, input2)
    outputString = ""
    outputString += f"Cosine sim: {cosine_diff(tf_idf_vals[1], tf_idf_vals[2])}\n"
    outputString += f"Euclidean distance: {euclidean(tf_idf_vals[1], tf_idf_vals[2])}\n"
    return outputString
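

# Minimal usage sketch (assumes './example.txt' and 'gist_stopwords.txt' exist
# next to this script; the two sample strings below are illustrative only):
if __name__ == "__main__":
    doc_a = "The quick brown fox jumps over the lazy dog."
    doc_b = "A quick brown dog jumps over a lazy fox."
    print(final_main(doc_a, doc_b))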