xin's picture
initial commit
22738ca
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : run.py
# @Author: nixin
# @Date : 2021/11/26
import pandas as pd
from functions import *
from functools import partial
import multiprocessing as mp
# Load the input table whose 'patent_number' column lists the patents to process.
df = pd.read_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/MCDA/data/results (18).csv')
print(df.columns)

# One entry per row of the 'patent_number' column, in file order.
patent_number = list(df['patent_number'])
print(patent_number)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~ Parameters for data_patent_details file ~~~ #
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# NOTE: change this to your local data directory. It is joined to file names
# by plain string concatenation, so it must end with a path separator.
path_to_data = "/Users/nixin/PycharmProjects/PatentSolver_demonstrator/MCDA/data/"

# Column order of the raw patent CSV; written as its header row and passed to
# every scraper call.
data_column_order = [
    'inventor_name',
    'assignee_name_orig',
    'assignee_name_current',
    'pub_date',
    'priority_date',
    'grant_date',
    'filing_date',
    'forward_cite_no_family',
    'forward_cite_yes_family',
    'backward_cite_no_family',
    'backward_cite_yes_family',
    'patent',
    'url',
    'abstract_text',
]


########### Run pool process #############
# Everything with side effects lives under the __main__ guard: with the
# "spawn" start method each worker re-imports this module, and resetting the
# output CSV at import time would truncate it while other workers are writing.
if __name__ == "__main__":
    # Imported explicitly so the script does not rely on `csv` leaking in
    # through the star import from `functions`.
    import csv

    raw_patents_csv = path_to_data + 'edison_patents.csv'
    cleaned_patents_csv = path_to_data + 'cleaned_count_patents.csv'

    ## Create csv file to store the data_patent_details from the patent runs.
    # Mode 'w' creates the file or truncates a previous run's file, so no
    # explicit existence check / delete is needed. Only the header is written.
    with open(raw_patents_csv, 'w', newline='') as file:
        csv.writer(file).writerow(data_column_order)

    ## Create lock to prevent collisions when processes try to write on same file
    write_lock = mp.Lock()

    ## Use a pool of workers where the number of processes is equal to
    ## the number of cpus - 1 (but never fewer than one worker, since a pool
    ## size of 0 is invalid on single-core machines).
    worker_count = max(1, mp.cpu_count() - 1)
    scrape_one = partial(single_process_scraper,
                         path_to_data_file=raw_patents_csv,
                         data_column_order=data_column_order)
    with poolcontext(processes=worker_count, initializer=init, initargs=(write_lock,)) as pool:
        pool.map(scrape_one, patent_number)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ clean raw data_patent_details ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    ## read the Google scraper's results
    table = pd.read_csv(raw_patents_csv)
    # clean raw patent results
    results = clean_patent(table)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ count number ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    results = count_patent(results)
    print(results.columns)
    results.to_csv(cleaned_patents_csv, index=False)