|
import io
import string
import urllib.request
import zipfile
from collections import defaultdict

import folium
import gensim
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
|
|
|
# Source archive of US patents granted in 2016.
url = "http://www.freepatentsonline.com/download/us-patents-granted-2016.zip"

# BUG FIX: `pd.io.zipcode.ZipFile` does not exist (pandas has no such module),
# and zipfile.ZipFile cannot open a URL directly. Download the archive with
# urllib and wrap the raw bytes in a BytesIO buffer so the stdlib zipfile
# module can read it as a seekable file.
with urllib.request.urlopen(url) as response:
    dataset_zip = zipfile.ZipFile(io.BytesIO(response.read()))

# The CSV is not UTF-8; read it with the Latin-1 family encoding it ships in.
patent_data = pd.read_csv(
    dataset_zip.open('us-patents-granted-2016.csv'),
    encoding='ISO-8859-1',
)
|
|
|
# Discard rows with any missing values so the string operations below never
# see NaN.
patent_data.dropna(inplace=True)

# Lower-case each text column, then tokenize it into a word list.
# NOTE(review): nltk.word_tokenize needs the 'punkt' data package — confirm
# it has been downloaded before running.
for text_col, token_col in (('Title', 'TitleTokens'),
                            ('Abstract', 'AbstractTokens')):
    patent_data[text_col] = patent_data[text_col].str.lower()
    patent_data[token_col] = patent_data[text_col].apply(nltk.word_tokenize)
|
|
|
# English stopword set and WordNet lemmatizer used by the cleanup steps below.
# NOTE(review): both need the NLTK 'stopwords' and 'wordnet' data packages —
# confirm they have been downloaded before running.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Drop stopwords from the token lists, rejoining each row into
# space-separated text.  (Idiom fix: `w not in stop_words` rather than the
# awkward `not w in stop_words`.)
patent_data['TitleClean'] = patent_data['TitleTokens'].apply(
    lambda tokens: ' '.join(w for w in tokens if w not in stop_words))
patent_data['AbstractClean'] = patent_data['AbstractTokens'].apply(
    lambda tokens: ' '.join(w for w in tokens if w not in stop_words))

# Lemmatize every remaining word in each column.
patent_data['TitleLemma'] = patent_data['TitleClean'].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(w) for w in text.split()))
patent_data['AbstractLemma'] = patent_data['AbstractClean'].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(w) for w in text.split()))

# One document per patent: title followed by abstract.
patent_data['Text'] = patent_data['TitleLemma'] + ' ' + patent_data['AbstractLemma']
documents = patent_data['Text'].tolist()