# Spaces:
# Build error
# Build error
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : run.py
# @Author: nixin
# @Date : 2021/11/26
import csv
import multiprocessing as mp
import os
from functools import partial

import pandas as pd

# Presumably supplies poolcontext, init, single_process_scraper, clean_patent
# and count_patent, none of which are defined in this file.
from functions import *
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~ Parameters for data_patent_details file ~~~ #
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# Directory that holds the input CSV and receives every output CSV.
path_to_data = "/Users/nixin/PycharmProjects/PatentSolver_demonstrator/MCDA/data/"  #### don't forget to change

# Column order of the raw scraper output; written as the CSV header row and
# passed to every scraper worker.
data_column_order = [
    'inventor_name',
    'assignee_name_orig',
    'assignee_name_current',
    'pub_date',
    'priority_date',
    'grant_date',
    'filing_date',
    'forward_cite_no_family',
    'forward_cite_yes_family',
    'backward_cite_no_family',
    'backward_cite_yes_family',
    'patent',
    'url',
    'abstract_text',
]


def main():
    """Scrape details for every listed patent, then clean and count the results.

    Steps, all relative to ``path_to_data``:

    1. Read the patent numbers from the ``patent_number`` column of
       ``results (18).csv``.
    2. (Re)create ``edison_patents.csv`` containing only the header row.
    3. Run ``single_process_scraper`` over the patent numbers in a process
       pool, handing the workers a shared lock through ``init``.
    4. Read the scraped CSV back, pass it through ``clean_patent`` and
       ``count_patent``, and save it as ``cleaned_count_patents.csv``.

    All of this lives in a function, not at module top level, because with
    the ``spawn`` start method each pool worker re-imports the main module.
    Top-level I/O would then be re-executed by every worker, truncating the
    CSV that other workers are writing to.
    """
    input_csv = os.path.join(path_to_data, 'results (18).csv')
    raw_csv = os.path.join(path_to_data, 'edison_patents.csv')
    cleaned_csv = os.path.join(path_to_data, 'cleaned_count_patents.csv')

    df = pd.read_csv(input_csv)
    print(df.columns)
    patent_number = df['patent_number'].tolist()
    print(patent_number)

    ## Create csv file to store the data_patent_details from the patent runs.
    # Mode 'w' creates the file or truncates a previous one, so no separate
    # existence check / delete step is needed before writing the header.
    with open(raw_csv, 'w', newline='') as file:
        csv.writer(file).writerow(data_column_order)

    ########### Run pool process #############
    ## Create lock to prevent collisions when processes try to write on same file
    lock = mp.Lock()
    ## Use a pool of workers where the number of processes is equal to
    ## the number of cpus - 1, but never fewer than one: a pool size of 0
    ## is rejected by multiprocessing on a single-CPU machine.
    worker_count = max(1, mp.cpu_count() - 1)
    scrape_one = partial(single_process_scraper,
                         path_to_data_file=raw_csv,
                         data_column_order=data_column_order)
    with poolcontext(processes=worker_count, initializer=init, initargs=(lock,)) as pool:
        pool.map(scrape_one, patent_number)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ clean raw data_patent_details ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    ## Read the scraper's results back only after pool.map has returned.
    table = pd.read_csv(raw_csv)
    # clean raw patent results
    results = clean_patent(table)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ count number ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    results = count_patent(results)
    print(results.columns)
    results.to_csv(cleaned_csv, index=False)


if __name__ == "__main__":
    main()