import io
import string
import urllib.request
import zipfile

import gensim
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import folium
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from wordcloud import WordCloud

# NLTK resources needed for tokenization, stop-word removal, and lemmatization.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
url = "http://www.freepatentsonline.com/download/us-patents-granted-2016.zip"
dataset_zip = pd.io.zipcode.ZipFile(url)
patent_data = pd.read_csv(dataset_zip.open('us-patents-granted-2016.csv'), encoding='ISO-8859-1')
# Drop incomplete records and normalize the text fields to lowercase.
patent_data.dropna(inplace=True)
patent_data['Title'] = patent_data['Title'].str.lower()
patent_data['Abstract'] = patent_data['Abstract'].str.lower()

# Tokenize titles and abstracts.
patent_data['TitleTokens'] = patent_data['Title'].apply(nltk.word_tokenize)
patent_data['AbstractTokens'] = patent_data['Abstract'].apply(nltk.word_tokenize)

# Remove English stop words, then lemmatize the remaining tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
patent_data['TitleClean'] = patent_data['TitleTokens'].apply(lambda x: ' '.join([w for w in x if w not in stop_words]))
patent_data['AbstractClean'] = patent_data['AbstractTokens'].apply(lambda x: ' '.join([w for w in x if w not in stop_words]))
patent_data['TitleLemma'] = patent_data['TitleClean'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))
patent_data['AbstractLemma'] = patent_data['AbstractClean'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))

# Combine the cleaned title and abstract into a single text field per patent.
patent_data['Text'] = patent_data['TitleLemma'] + ' ' + patent_data['AbstractLemma']
documents = list(patent_data['Text'])
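
# --- Sketch, not part of the original script: the gensim, corpora, and WordCloud
# imports above are otherwise unused, so this shows one plausible next step under
# the assumption that `documents` feeds a topic model and a word cloud. The
# num_topics/passes values are arbitrary illustrative choices.
tokenized_docs = [doc.split() for doc in documents]
dictionary = corpora.Dictionary(tokenized_docs)                    # vocabulary over all patents
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]   # bag-of-words per patent
lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary, passes=5)

# Word cloud of the combined cleaned patent text.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(documents))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()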