Vinnybustacap committed on
Commit
56d3d9c
1 Parent(s): d6ae3d6

Create Main

Browse files
Files changed (1) hide show
  1. Main +34 -0
Main ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fetch a zipped patent CSV and build a cleaned text corpus from it.

The script downloads the archive at ``url``, loads the CSV it contains into
a DataFrame, then lower-cases, tokenizes, stop-word-filters and lemmatizes
the ``Title`` and ``Abstract`` columns.  The final per-row strings are
collected in ``documents``.

NOTE(review): ``nltk.word_tokenize``, ``stopwords.words`` and
``WordNetLemmatizer`` need NLTK data packages to be installed locally
(presumably ``punkt``, ``stopwords`` and ``wordnet`` -- confirm for the NLTK
version in use, e.g. via ``nltk.download(...)``).
"""
import io
import string
import urllib.request
import zipfile
from collections import defaultdict

import folium
import gensim
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

url = "http://www.freepatentsonline.com/download/us-patents-granted-2016.zip"

# pandas has no ZipFile helper; download the archive into memory and let the
# standard-library zipfile module read it (ZipFile needs a seekable stream,
# which a raw HTTP response is not).
with urllib.request.urlopen(url) as response:
    dataset_zip = zipfile.ZipFile(io.BytesIO(response.read()))

# Close both the CSV member and the archive as soon as the frame is loaded.
with dataset_zip, dataset_zip.open('us-patents-granted-2016.csv') as csv_file:
    patent_data = pd.read_csv(csv_file, encoding='ISO-8859-1')

# Discard every row that has a missing value in any column.
patent_data = patent_data.dropna()

# Normalize case before tokenizing so stop-word matching is case-insensitive.
patent_data['Title'] = patent_data['Title'].str.lower()
patent_data['Abstract'] = patent_data['Abstract'].str.lower()

patent_data['TitleTokens'] = patent_data['Title'].apply(nltk.word_tokenize)
patent_data['AbstractTokens'] = patent_data['Abstract'].apply(nltk.word_tokenize)

# A set gives O(1) membership tests inside the per-token filter below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Remove stop words and store the surviving tokens as a space-joined string.
patent_data['TitleClean'] = patent_data['TitleTokens'].apply(
    lambda tokens: ' '.join([w for w in tokens if w not in stop_words])
)
patent_data['AbstractClean'] = patent_data['AbstractTokens'].apply(
    lambda tokens: ' '.join([w for w in tokens if w not in stop_words])
)

# Lemmatize each remaining word of the cleaned strings.
patent_data['TitleLemma'] = patent_data['TitleClean'].apply(
    lambda text: ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
)
patent_data['AbstractLemma'] = patent_data['AbstractClean'].apply(
    lambda text: ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
)

# One document per row: lemmatized title followed by lemmatized abstract.
patent_data['Text'] = patent_data['TitleLemma'] + ' ' + patent_data['AbstractLemma']
documents = list(patent_data['Text'])