has12zen committed on
Commit
1c13527
1 Parent(s): 8343c12
Files changed (6) hide show
  1. .gitignore +2 -0
  2. app.py +16 -0
  3. example.txt +1 -0
  4. gist_stopwords.txt +1 -0
  5. requirements.txt +4 -0
  6. utils.py +104 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ A5/
2
+ __pycache__/
app.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio front-end: collects two documents and shows their similarity scores."""
import gradio as gr
# Explicit import instead of the original `from utils import *` wildcard —
# only final_main is actually used, and star-imports hide where names come from.
from utils import final_main

with gr.Blocks() as demo:
    gr.Markdown("# Enter the 2 Docs.")
    with gr.Tab("Encrypt"):
        with gr.Row():
            with gr.Column():
                # Two free-text inputs, one output box, one trigger button.
                encrypt_msg = gr.Textbox(lines=2, label="Doc1")
                encrypt_key = gr.Textbox(lines=2, label="Doc2")
                encrypt_output = gr.Textbox()
                encrypt_button = gr.Button("Encrypt")

    # On click, run the TF-IDF similarity pipeline over the two documents.
    encrypt_button.click(final_main, inputs=[encrypt_msg, encrypt_key], outputs=[encrypt_output])

demo.launch(share=False)
example.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ The benefits of exercise for physical and mental health are well-established. Regular exercise can help reduce the risk of chronic diseases, such as diabetes, heart disease, and cancer, as well as improve mood and cognitive function. However, it's important to choose activities that you enjoy and that are safe for your current level of fitness. Some popular options include walking, running, cycling, swimming, and yoga. Finding a workout buddy or joining a group fitness class can also help keep you motivated and accountable.
gist_stopwords.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 0o,0s,3a,3b,3d,6b,6o,a,a1,a2,a3,a4,ab,able,about,above,abst,ac,accordance,according,accordingly,across,act,actually,ad,added,adj,ae,af,affected,affecting,affects,after,afterwards,ag,again,against,ah,ain,ain't,aj,al,all,allow,allows,almost,alone,along,already,also,although,always,am,among,amongst,amoungst,amount,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,ao,ap,apart,apparently,appear,appreciate,appropriate,approximately,ar,are,aren,arent,aren't,arise,around,as,a's,aside,ask,asking,associated,at,au,auth,av,available,aw,away,awfully,ax,ay,az,b,b1,b2,b3,ba,back,bc,bd,be,became,because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,below,beside,besides,best,better,between,beyond,bi,bill,biol,bj,bk,bl,bn,both,bottom,bp,br,brief,briefly,bs,bt,bu,but,bx,by,c,c1,c2,c3,ca,call,came,can,cannot,cant,can't,cause,causes,cc,cd,ce,certain,certainly,cf,cg,ch,changes,ci,cit,cj,cl,clearly,cm,c'mon,cn,co,com,come,comes,con,concerning,consequently,consider,considering,contain,containing,contains,corresponding,could,couldn,couldnt,couldn't,course,cp,cq,cr,cry,cs,c's,ct,cu,currently,cv,cx,cy,cz,d,d2,da,date,dc,dd,de,definitely,describe,described,despite,detail,df,di,did,didn,didn't,different,dj,dk,dl,do,does,doesn,doesn't,doing,don,done,don't,down,downwards,dp,dr,ds,dt,du,due,during,dx,dy,e,e2,e3,ea,each,ec,ed,edu,ee,ef,effect,eg,ei,eight,eighty,either,ej,el,eleven,else,elsewhere,em,empty,en,end,ending,enough,entirely,eo,ep,eq,er,es,especially,est,et,et-al,etc,eu,ev,even,ever,every,everybody,everyone,everything,everywhere,ex,exactly,example,except,ey,f,f2,fa,far,fc,few,ff,fi,fifteen,fifth,fify,fill,find,fire,first,five,fix,fj,fl,fn,fo,followed,following,follows,for,former,formerly,forth,forty,found,four,fr,from,front,fs,ft,fu,full,further,furthermore,fy,g,ga,gave,ge,get,gets,getting,gi,give,given,gives,giving,gj,gl,go,goes,going,gone,got,gotten,gr,greetings,gs,gy,h,h2,h3,had,hadn,hadn'
t,happens,hardly,has,hasn,hasnt,hasn't,have,haven,haven't,having,he,hed,he'd,he'll,hello,help,hence,her,here,hereafter,hereby,herein,heres,here's,hereupon,hers,herself,hes,he's,hh,hi,hid,him,himself,his,hither,hj,ho,home,hopefully,how,howbeit,however,how's,hr,hs,http,hu,hundred,hy,i,i2,i3,i4,i6,i7,i8,ia,ib,ibid,ic,id,i'd,ie,if,ig,ignored,ih,ii,ij,il,i'll,im,i'm,immediate,immediately,importance,important,in,inasmuch,inc,indeed,index,indicate,indicated,indicates,information,inner,insofar,instead,interest,into,invention,inward,io,ip,iq,ir,is,isn,isn't,it,itd,it'd,it'll,its,it's,itself,iv,i've,ix,iy,iz,j,jj,jr,js,jt,ju,just,k,ke,keep,keeps,kept,kg,kj,km,know,known,knows,ko,l,l2,la,largely,last,lately,later,latter,latterly,lb,lc,le,least,les,less,lest,let,lets,let's,lf,like,liked,likely,line,little,lj,ll,ll,ln,lo,look,looking,looks,los,lr,ls,lt,ltd,m,m2,ma,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,mightn,mightn't,mill,million,mine,miss,ml,mn,mo,more,moreover,most,mostly,move,mr,mrs,ms,mt,mu,much,mug,must,mustn,mustn't,my,myself,n,n2,na,name,namely,nay,nc,nd,ne,near,nearly,necessarily,necessary,need,needn,needn't,needs,neither,never,nevertheless,new,next,ng,ni,nine,ninety,nj,nl,nn,no,nobody,non,none,nonetheless,noone,nor,normally,nos,not,noted,nothing,novel,now,nowhere,nr,ns,nt,ny,o,oa,ob,obtain,obtained,obviously,oc,od,of,off,often,og,oh,oi,oj,ok,okay,ol,old,om,omitted,on,once,one,ones,only,onto,oo,op,oq,or,ord,os,ot,other,others,otherwise,ou,ought,our,ours,ourselves,out,outside,over,overall,ow,owing,own,ox,oz,p,p1,p2,p3,page,pagecount,pages,par,part,particular,particularly,pas,past,pc,pd,pe,per,perhaps,pf,ph,pi,pj,pk,pl,placed,please,plus,pm,pn,po,poorly,possible,possibly,potentially,pp,pq,pr,predominantly,present,presumably,previously,primarily,probably,promptly,proud,provides,ps,pt,pu,put,py,q,qj,qu,que,quickly,quite,qv,r,r2,ra,ran,rather,rc,rd,re,readily,really,reasonably,recent,recently,ref,refs,regarding,regardless,regar
ds,related,relatively,research,research-articl,respectively,resulted,resulting,results,rf,rh,ri,right,rj,rl,rm,rn,ro,rq,rr,rs,rt,ru,run,rv,ry,s,s2,sa,said,same,saw,say,saying,says,sc,sd,se,sec,second,secondly,section,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sensible,sent,serious,seriously,seven,several,sf,shall,shan,shan't,she,shed,she'd,she'll,shes,she's,should,shouldn,shouldn't,should've,show,showed,shown,showns,shows,si,side,significant,significantly,similar,similarly,since,sincere,six,sixty,sj,sl,slightly,sm,sn,so,some,somebody,somehow,someone,somethan,something,sometime,sometimes,somewhat,somewhere,soon,sorry,sp,specifically,specified,specify,specifying,sq,sr,ss,st,still,stop,strongly,sub,substantially,successfully,such,sufficiently,suggest,sup,sure,sy,system,sz,t,t1,t2,t3,take,taken,taking,tb,tc,td,te,tell,ten,tends,tf,th,than,thank,thanks,thanx,that,that'll,thats,that's,that've,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,there'll,thereof,therere,theres,there's,thereto,thereupon,there've,these,they,theyd,they'd,they'll,theyre,they're,they've,thickv,thin,think,third,this,thorough,thoroughly,those,thou,though,thoughh,thousand,three,throug,through,throughout,thru,thus,ti,til,tip,tj,tl,tm,tn,to,together,too,took,top,toward,towards,tp,tq,tr,tried,tries,truly,try,trying,ts,t's,tt,tv,twelve,twenty,twice,two,tx,u,u201d,ue,ui,uj,uk,um,un,under,unfortunately,unless,unlike,unlikely,until,unto,uo,up,upon,ups,ur,us,use,used,useful,usefully,usefulness,uses,using,usually,ut,v,va,value,various,vd,ve,ve,very,via,viz,vj,vo,vol,vols,volumtype,vq,vs,vt,vu,w,wa,want,wants,was,wasn,wasnt,wasn't,way,we,wed,we'd,welcome,well,we'll,well-b,went,were,we're,weren,werent,weren't,we've,what,whatever,what'll,whats,what's,when,whence,whenever,when's,where,whereafter,whereas,whereby,wherein,wheres,where's,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,who'll,whom,whomever,whos,who's,whose,why,why's,wi,
widely,will,willing,wish,with,within,without,wo,won,wonder,wont,won't,words,world,would,wouldn,wouldnt,wouldn't,www,x,x1,x2,x3,xf,xi,xj,xk,xl,xn,xo,xs,xt,xv,xx,y,y2,yes,yet,yj,yl,you,youd,you'd,you'll,your,youre,you're,yours,yourself,yourselves,you've,yr,ys,yt,z,zero,zi,zz
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ nltk
4
+ gradio
utils.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import math
import nltk
import re

# Fetch the WordNet corpus required by the lemmatizer (no-op if cached).
nltk.download("wordnet")
# nltk.download("omw-1.4")

# Single shared lemmatizer instance, reused by process_string().
wnl = WordNetLemmatizer()

# Reference document that is always included in the corpus.
file3 = './example.txt'

files = [file3]

# Load the comma-separated stop-word list shipped alongside the app.
with open("gist_stopwords.txt", "r") as gist_file:
    content = gist_file.read()
stopwords = content.split(",")
24
+
25
def read_file(name):
    """Read the text file at *name* and return its full contents as one string."""
    with open(name, 'r') as handle:
        return handle.read()
29
+
30
def process_string(name):
    """Normalise a document string into a list of lemmatised tokens.

    Lower-cases the text, extracts word-character runs (dropping punctuation),
    removes stop words, and lemmatises each remaining token as a noun.
    """
    # str.lower() replaces the original char-by-char ''.join(...) loop.
    text = name.lower()
    # \w+ matches runs of letters/digits/underscore, i.e. strips punctuation.
    tokens = re.findall(r'\w+', text)
    # Hoist the stop-word list into a set: O(1) membership per token instead
    # of an O(n) scan of the ~1k-entry list for every token.
    stopword_set = set(stopwords)
    filtered_tokens = [token for token in tokens if token not in stopword_set]
    # Reduce words to their noun root form, e.g. 'cars' -> 'car'.
    return [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
39
+
40
def process_tokens(tokens, st_global_words):
    """Compute per-word counts and term frequencies for one document.

    Args:
        tokens: list of tokens making up the document.
        st_global_words: iterable of every word in the corpus vocabulary.

    Returns:
        (freq_dict, tf_dict): raw count and count/len(tokens) for each
        vocabulary word. An empty document yields tf of 0.0 everywhere
        (the original divided by zero here).
    """
    # Initialise every vocabulary word to 0, then count in a single O(N)
    # pass instead of calling tokens.count(word) per word (O(V*N)).
    freq_dict = {word: 0 for word in st_global_words}
    for token in tokens:
        if token in freq_dict:
            freq_dict[token] += 1
    n = len(tokens)
    tf_dict = {word: (freq_dict[word] / n if n else 0.0) for word in freq_dict}
    return freq_dict, tf_dict
48
+
49
def main(input1, input2):
    """Build TF-IDF vectors for the reference file plus the two user documents.

    Returns a 2-D numpy array with one row of TF-IDF weights per document,
    columns ordered by the (set-derived) corpus vocabulary.
    """
    # Corpus = every file on disk plus the two user-supplied strings.
    docs = [read_file(name) for name in files]
    docs.append(input1)
    docs.append(input2)
    token_lists = [process_string(doc) for doc in docs]

    # Union of every token seen anywhere — the shared vocabulary.
    vocab_set = set()
    for toks in token_lists:
        vocab_set.update(toks)

    # Per-document (frequency, term-frequency) dictionaries.
    stats = [process_tokens(toks, vocab_set) for toks in token_lists]

    # Inverse document frequency: log(N / #docs containing the word).
    idf_dict = {}
    for word in vocab_set:
        doc_count = sum(1 for freq, _ in stats if freq[word] > 0)
        idf_dict[word] = math.log(len(stats) / doc_count)

    # Materialise the vocabulary once so every column shares the same order.
    vocab = list(vocab_set)
    df = pd.DataFrame({'word': vocab})
    df['idf_col'] = [idf_dict[word] for word in vocab]
    for i, (freq, tf) in enumerate(stats):
        df['freq_{}'.format(i + 1)] = [freq[word] for word in vocab]
        df['tf_{}'.format(i + 1)] = [tf[word] for word in vocab]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']

    # Stack the tfidf_* columns into one array, one row per document.
    tfidf_cols = [col for col in df.columns if 'tfidf' in col]
    return np.array([df[col].values for col in tfidf_cols])
84
+
85
def cosine_diff(A, B):
    """Cosine similarity of two equal-length numeric vectors.

    Returns dot(A, B) / (|A| * |B|), or 0.0 when either vector has zero
    norm (the original raised ZeroDivisionError for all-zero vectors,
    which can happen when a document contributes no distinctive terms).
    """
    dot_product = sum(a * b for a, b in zip(A, B))
    norm_A = math.sqrt(sum(a * a for a in A))
    norm_B = math.sqrt(sum(b * b for b in B))
    if norm_A == 0 or norm_B == 0:
        # Similarity is undefined for a zero vector; report 0 instead of crashing.
        return 0.0
    return dot_product / (norm_A * norm_B)
91
+
92
def euclidean(A, B):
    """Euclidean (L2) distance between two equal-length numeric vectors."""
    total = 0
    for idx in range(len(A)):
        diff = A[idx] - B[idx]
        total += diff * diff
    return math.sqrt(total)
98
+
99
def final_main(input1, input2):
    """Run the TF-IDF pipeline and report similarity between the two inputs.

    Rows 1 and 2 of the TF-IDF matrix correspond to input1 and input2
    (row 0 is the bundled reference document).
    """
    vals = main(input1, input2)
    doc1, doc2 = vals[1], vals[2]
    report = [
        f"Cosine sim: {cosine_diff(doc1, doc2)}",
        f"Euclidean difference: {euclidean(doc1, doc2)}",
    ]
    return "\n".join(report) + "\n"