Commit
•
56d3d9c
1
Parent(s):
d6ae3d6
Create Main
Browse files
Main
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Load the 2016 US granted-patents dataset and prepare its text for NLP.

Downloads the zipped CSV, drops incomplete rows, then normalises the
Title/Abstract columns (lowercase -> tokenise -> stopword removal ->
lemmatisation) and builds one combined text document per patent in
``documents`` for downstream topic modelling.
"""

import io
import string
import urllib.request
import zipfile
from collections import defaultdict

import folium
import gensim
import matplotlib.pyplot as plt
import nltk  # required: nltk.word_tokenize is called below
import pandas as pd  # required: pd.read_csv is called below
import seaborn as sns
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

url = "http://www.freepatentsonline.com/download/us-patents-granted-2016.zip"

# BUG FIX: the original called pd.io.zipcode.ZipFile(url), which is not a
# pandas API and raises AttributeError. Fetch the archive over HTTP and open
# it with the standard-library zipfile module instead.
with urllib.request.urlopen(url) as response:
    dataset_zip = zipfile.ZipFile(io.BytesIO(response.read()))

patent_data = pd.read_csv(dataset_zip.open('us-patents-granted-2016.csv'), encoding='ISO-8859-1')

# Drop rows with any missing values so the string operations below are safe.
patent_data.dropna(inplace=True)

# Normalise case before tokenisation.
patent_data['Title'] = patent_data['Title'].str.lower()
patent_data['Abstract'] = patent_data['Abstract'].str.lower()

# Tokenise titles and abstracts into word lists.
patent_data['TitleTokens'] = patent_data['Title'].apply(nltk.word_tokenize)
patent_data['AbstractTokens'] = patent_data['Abstract'].apply(nltk.word_tokenize)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Remove English stopwords from each token list, rejoining into a string.
patent_data['TitleClean'] = patent_data['TitleTokens'].apply(
    lambda toks: ' '.join(w for w in toks if w not in stop_words))
patent_data['AbstractClean'] = patent_data['AbstractTokens'].apply(
    lambda toks: ' '.join(w for w in toks if w not in stop_words))

# Lemmatise the remaining words.
patent_data['TitleLemma'] = patent_data['TitleClean'].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(w) for w in text.split()))
patent_data['AbstractLemma'] = patent_data['AbstractClean'].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(w) for w in text.split()))

# One combined text document per patent, consumed by downstream modelling.
patent_data['Text'] = patent_data['TitleLemma'] + ' ' + patent_data['AbstractLemma']
documents = list(patent_data['Text'])
|