arjunpatel committed
Commit
3d9c842
1 Parent(s): b688d81

Upload requirements and script

Files changed (2)
  1. data_cleaning.py +90 -0
  2. requirements.txt +5 -0
data_cleaning.py ADDED
@@ -0,0 +1,90 @@
+
+import pandas as pd
+import numpy as np
+import re
+
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.stem import PorterStemmer
+
+from textacy.preprocessing.remove import accents, brackets, punctuation
+from textacy.preprocessing.replace import numbers, urls
+from textacy.preprocessing.normalize import whitespace
+
+import os
+
+def clean_page(page):
+    # given a page, remove headings, newlines, tabs, brackets, accents, and URLs
+    page = re.sub("=+", "", page)
+    page = page.replace("\n", "")
+    page = page.replace("\t", "")
+    page = accents(brackets(page))
+    page = urls(page)
+
+    return whitespace(page).lower()
+
+def clean_sentences(s):
+    # collapse every run of non-alphanumeric characters into a single space
+    pattern = r"[^A-Za-z0-9]+"
+    s = re.sub(pattern, " ", s)
+    return s
+
+
+ps = PorterStemmer()
+
+def prepare_document(doc):
+    # given a document, preprocess and tokenize it for tf-idf
+
+    # clean the document of misc symbols and headings, lowercase it
+    doc = clean_page(doc)
+
+    # tokenize by sentence and then by word
+    sentences = sent_tokenize(doc)
+
+    # remove punctuation
+    sentences = [punctuation(s) for s in sentences]
+
+    # stem every word
+    sentences_and_words = [word_tokenize(s) for s in sentences]
+
+    prepared_doc = []
+
+    for sent in sentences_and_words:
+        stemmed_words = []
+        for word in sent:
+            stemmed_words.append(ps.stem(word))
+        cleaned_sentence = " ".join(stemmed_words)
+        prepared_doc.append(cleaned_sentence)
+    return " ".join(prepared_doc)
+
+
+# small function to calculate the cosine similarity of a pair of vectors
+def cosine_similarity(v1, v2):
+    numerator = np.dot(v1, v2)
+    denom = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))
+
+    return numerator / denom
+
+
+def cos_dicts(names, vects):
+    # given a set of vectors, create a dict of dicts of cosine similarities.
+    # This dict-of-dicts structure allows us to index directly into the pair we want:
+    # the outer key is our desired game, and its value is a dictionary of partner games.
+    # The inner key is the second game we wish to look up, and its value is that
+    # game's cosine similarity to the first one.
+    d = {}
+    for name, vect in zip(names, vects):
+        cos_sim_by_vect = {}
+        for n2, v2 in zip(names, vects):
+            if n2 != name:
+                cos_sim_by_vect[n2] = cosine_similarity(vect, v2)
+        d[name] = cos_sim_by_vect
+    return d
+
+def retrieve_top_k_similar(n1, similarity_dict, k):
+    inner_dict = similarity_dict[n1]
+    # sort the dictionary entries by value, descending, then take the top k
+    return sorted(inner_dict.items(), reverse=True, key=lambda x: x[1])[:k]
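
A minimal usage sketch (not part of the commit): the comments suggest these functions feed a TF-IDF similarity lookup, so this assumes scikit-learn's TfidfVectorizer (scikit-learn is listed in requirements.txt) plus hypothetical game names and descriptions; nltk's "punkt" tokenizer data must be downloaded once beforehand.

    from sklearn.feature_extraction.text import TfidfVectorizer
    from data_cleaning import prepare_document, cos_dicts, retrieve_top_k_similar

    names = ["Game A", "Game B", "Game C"]  # hypothetical titles
    docs = [  # hypothetical description pages
        "Game A is a cooperative card game about trading spices.",
        "Game B is a competitive card game about trading gems.",
        "Game C is a dice game about building castles.",
    ]

    prepared = [prepare_document(d) for d in docs]  # clean, tokenize, stem

    vects = TfidfVectorizer().fit_transform(prepared).toarray()  # dense rows for np.dot

    # cos_dicts returns a dict of dicts, e.g. {"Game A": {"Game B": 0.4, ...}, ...}
    similarity_dict = cos_dicts(names, vects)
    print(retrieve_top_k_similar("Game A", similarity_dict, k=2))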
requirements.txt ADDED
@@ -0,0 +1,5 @@
+gradio
+sentence_transformers
+datasets
+scikit-learn
+torch
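
The environment can be reproduced with a single install step; note that data_cleaning.py additionally imports pandas, numpy, nltk, and textacy, which are not pinned here.

    pip install -r requirements.txt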