import os
import time
import json
import joblib
import math
import itertools
import argparse
import multiprocessing as mp

import pandas as pd
from dotenv import load_dotenv
from serpapi import GoogleSearch
import tiktoken
from openai import OpenAI
from tqdm import tqdm

load_dotenv()
ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
SERP_API_KEY = os.getenv('SERP_APIKEY')
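# The two environment variables read above are expected in a local .env file
# (names taken directly from the os.getenv calls; the values are placeholders):
#   OPENAI_ORGANIZATION_ID=<your OpenAI organization id>
#   SERP_APIKEY=<your SerpAPI key>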
def get_leads(file_path: str, names: list = ['營業地址', '統一編號', '總機構統一編號', '營業人名稱', '資本額', '設立日期', '組織別名稱', '使用統一發票',
                                             '行業代號', '名稱', '行業代號1', '名稱1', '行業代號2', '名稱2', '行業代號3', '名稱3']):
    """
    Load the lead list (business registry records) from a headerless CSV.
    Argument
        file_path: str, path to the CSV file
        names: list, column names to assign to the CSV
    Return
        data: dataframe
    """
    assert os.path.exists(file_path)
    data = pd.read_csv(file_path, names=names)
    return data
def get_serp(query: str, google_domain: str, gl: str, lr: str) -> dict:
    """
    Query SerpAPI's Google engine and return the raw response dictionary.
    Argument
        query: str
        google_domain: str, e.g. 'google.com.tw'
        gl: str, country code, e.g. 'tw'
        lr: str, language restriction, e.g. 'lang_zh-TW'
    Return
        result: dict, raw SerpAPI response
    """
    search = GoogleSearch({
        "q": query,
        'google_domain': google_domain,
        'gl': gl,
        'lr': lr,
        "api_key": SERP_API_KEY
    })
    result = search.get_dict()
    # print(result['organic_results'][0])
    # return result['organic_results'][0]
    return result
def test_get_serp():
    # query = "原味商行"
    # query = "南投縣中寮鄉中寮村鄉林巷43號 和興商店"
    # query = "啓輝環管企業社"
    # query = "蘭陽客棧小吃店"
    # query = '韓笑味食品有限公司'
    # query = '小阿姨的店'
    query = '達米娜魚料理店'
    res = get_serp(query, google_domain='google.com.tw', gl='tw', lr='lang_zh-TW')
    print(res)
def get_condensed_result(result):
    """
    Condense a raw SerpAPI result into a JSON string of the fields we care about.
    Argument
        result: dict, raw SerpAPI response
    Return
        condensed_result: str, JSON-encoded list of condensed entries
    Example:
        result['knowledge_graph'].keys() # 'title', 'thumbnail', 'type', 'entity_type', 'kgmid', 'knowledge_graph_search_link', 'serpapi_knowledge_graph_search_link', 'tabs', 'place_id', 'directions', 'local_map', 'rating', 'review_count', '服務項目', '地址', '地址_links', 'raw_hours', 'hours', '電話號碼', '電話號碼_links', 'popular_times', 'user_reviews', 'reviews_from_the_web', 'unclaimed_listing', '個人資料', '其他人也搜尋了以下項目', '其他人也搜尋了以下項目_link', '其他人也搜尋了以下項目_stick'
    """
    filtered_results = [
        {"title": r.get('title', ""), 'snippet': r.get('snippet', "")} for r in result['organic_results']
    ]
    if 'knowledge_graph' in result:
        if 'user_reviews' in result['knowledge_graph']:
            filtered_results.append({'title': result['knowledge_graph']['title'], '顧客評價': "\t".join([_.get('summary', '') for _ in result['knowledge_graph']['user_reviews']])})
        if '其他人也搜尋了以下項目' in result['knowledge_graph']:
            filtered_results.append({'title': "類似的店", 'snippet': "\t".join([str(_.get('extensions', '')) for _ in result['knowledge_graph']['其他人也搜尋了以下項目']])})
        if '暫停營業' in result['knowledge_graph']:
            filtered_results.append({'status': '暫停營業' if result['knowledge_graph']['暫停營業'] else '營業中'})
        if '電話號碼' in result['knowledge_graph']:
            filtered_results.append({'telephone_number': result['knowledge_graph']['電話號碼']})
    condensed_result = json.dumps(filtered_results, ensure_ascii=False)
    # print( condensed_result )
    return condensed_result
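# The condensed evidence is a JSON string shaped like the following (illustrative values,
# matching the entries appended above): '[{"title": "...", "snippet": "..."}, ...,
# {"title": "類似的店", "snippet": "..."}, {"status": "營業中"}, {"telephone_number": "..."}]'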
def test_get_condensed_result():
    # query = "原味商行"
    # query = "南投縣中寮鄉中寮村鄉林巷43號 和興商店"
    # query = "啓輝環管企業社"
    # query = "蘭陽客棧小吃店"
    # query = '韓笑味食品有限公司'
    # query = '小阿姨的店'
    query = '達米娜魚料理店'
    res = get_serp(query, google_domain='google.com.tw', gl='tw', lr='lang_zh-TW')
    cond_res = get_condensed_result(res)
    print(cond_res)
def compose_analysis(client, query, search_results):
    """
    Ask the chat model to extract structured store information from the search results.
    Argument
        client: OpenAI client
        query: str
        search_results: str, condensed SERP evidence
    Return
        response: str, JSON-formatted analysis
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": '''
                    As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
                    your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)` or `西餐廳(含美式,義式,墨式)`.
                    It's very important to omit unrelated results. Do not make up any assumptions.
                    Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
                    If no relevant information has been found, simply output json with empty values.
                    I'll tip you and guarantee a place in heaven if you do a great job completely according to my instructions.
                '''
            },
            {
                "role": "user",
                "content": f'''
                    `query`: `{query}`,
                    `search_results`: {search_results}
                ''',
            }
        ],
        model="gpt-4-0125-preview",
        response_format={"type": "json_object"},
        temperature=0,
        # stream = True
    )
    # response = []
    # for chunk in chat_completion:
    #     text = chunk.choices[0].delta.content or ""
    #     response.append(text)
    #     print( text, end="")
    # return "".join(response)
    response = chat_completion.choices[0].message.content
    return response
def test_compose_analysis():
    # query = "原味商行"
    # query = "南投縣中寮鄉中寮村鄉林巷43號 和興商店"
    # query = "啓輝環管企業社"
    # query = "蘭陽客棧小吃店"
    # query = '韓笑味食品有限公司'
    # query = '小阿姨的店'
    query = '達米娜魚料理店'
    client = OpenAI(organization=ORGANIZATION_ID)
    res = get_serp(query, google_domain='google.com.tw', gl='tw', lr='lang_zh-TW')
    cond_res = get_condensed_result(res)
    resp = compose_analysis(client, query=query, search_results=cond_res)
    print(resp)
def compose_classication(
        client,
        evidence,
        classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
        backup_classes: list = ['中式', '西式'],
    ) -> str:
    """
    Ask the chat model to classify a piece of evidence into one of the given categories.
    Argument
        client: OpenAI client
        evidence: str
        classes: list
        backup_classes: list
    Return
        response: str, JSON-formatted classification
    """
    if isinstance(classes, list):
        classes = ", ".join([f"`{x}`" for x in classes])
    elif isinstance(classes, str):
        pass
    else:
        raise Exception(f"Incorrect classes type: {type(classes)}")
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": f'''
                    As a helpful and rigorous retail analyst, given the provided information about a store,
                    your task is two-fold. First, classify the provided evidence below into the most relevant category from the following: {classes}.
                    Second, if no relevant information has been found, classify the evidence into the most relevant supercategory from the following: {backup_classes}.
                    It's very important to omit unrelated pieces of evidence and not make up any assumptions.
                    Please think step by step, and output in json format. An example output json is like {{"category": "..."}}
                    If no relevant piece of information can be found at all, simply output json with an empty string "".
                    I'll tip you and guarantee a place in heaven if you do a great job completely according to my instructions.
                '''
            },
            {
                "role": "user",
                "content": f'''
                    `evidence`: `{evidence}`
                ''',
            }
        ],
        model="gpt-4-0125-preview",
        response_format={"type": "json_object"},
        temperature=0,
        # stream = True
    )
    response = chat_completion.choices[0].message.content
    return response
def test_compose_classification():
    """
    Classify a sample piece of condensed evidence.
    """
    client = OpenAI(organization=ORGANIZATION_ID)
    evidence = '[{"title": "年年有魚餐飲有限公司- 店家介紹", "snippet": "統一編號. 93769370 · 公司狀況. 營業中 · 公司名稱. 年年有魚餐飲有限公司 · 公司類型. 有限公司 · 資本總額. 6000000 · 所在地. 臺中市西區民龍里臺灣大道2段159號1樓."}, {"title": "年年有魚餐飲有限公司", "snippet": "營業地址, 臺中市西區民龍里臺灣大道2段159號1樓 ; 統編, 93769370 ; 營業名稱, 年年有魚餐飲有限公司 ; 資本額, 6,000,000 ; 設立日期, 1120713."}, {"title": "年年有魚餐飲有限公司", "snippet": "公司名稱, 年年有魚餐飲有限公司 ; 資本總額(元), 6,000,000 ; 負責人, 江敏 ; 登記地址, 看地圖 臺中市西區民龍里臺灣大道二段159號1樓 郵遞區號查詢 ; 設立 ..."}, {"title": "年年有魚餐飲有限公司", "snippet": "年年有魚餐飲有限公司 ; 負責人, 江敏 ; 登記地址, 台中市西區民龍里台灣大道二段159號1樓 ; 公司狀態, 核准設立 ; 資本額, 6,000,000元 ; 所在縣市, 台中市 西區 民龍里."}, {"title": "江_敏-年年有魚餐飲有限公司", "snippet": "負責人:江_敏·公司名:年年有魚餐飲有限公司·統一編號:93769370·公司地址:臺中市西區民龍里臺灣大道二段159號1樓·資本額:6000000·公司狀況:核准設立·核准設立 ..."}, {"title": "年年有魚餐飲有限公司/負責人:江_敏", "snippet": "公司名稱:年年有魚餐飲有限公司·代表人姓名:江_敏·公司所在地:臺中市西區民龍里臺灣大道二段159號1樓·統編:93769370資本總額:6000000·公司狀況:核准設立·核准設立 ..."}, {"title": "貓吃魚餐飲有限公司|工作徵才簡介", "snippet": "貓吃魚餐飲有限公司. 台中市西屯區. 時薪186元. 應徵人數:1 ~ 5人. 排休; 晚班; 工作經驗不拘; 學歷不拘. 1.佈置及清理餐桌2.為顧客帶位或安排座位3.上菜並提供有關用餐的 ..."}, {"title": "食力餐飲_食力國際有限公司|公司簡介", "snippet": "「食力國際有限公司」正式成立於2023年4月,目前短短時間已成立了四個品牌~ 一、【食力據點】 1:食力咖哩- 台中遠百店(台中市西屯區臺灣大道三段251號大遠百12樓大食 ..."}, {"title": "112 年臺中市優質餐飲店家分級評核獲獎名單", "snippet": "112 年臺中市優質餐飲店家分級評核獲獎名單-. 臺中市餐廳飲食店低碳認證書20 家. 1 築間幸福鍋物-臺中市政二店臺中市西屯區文心路二段213 號. 2 有之和牛-臺中文心店."}, {"title": "年年有魚水族館", "snippet": "營業地址, 臺中市西屯區何安里西屯路2段101-2號1樓 ; 統編, 21833774 ; 營業名稱, 年年有魚水族館 ; 資本額, 60,000 ; 設立日期, 0940502."}, {"title": "類似的店", "snippet": "[\'設計公司\']\\t[\'餐廳\']"}, {"telephone_number": "04 2376 6318"}]'
    x = compose_classication(client, evidence)
    print(x)
def classify_results(
        analysis_results: pd.DataFrame,
        input_column: str = 'evidence',
        output_column: str = 'classified_category',
        classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
        backup_classes: list = ['中式', '西式']
    ):
    """
    Classify each row's evidence with the chat model and append the label as a new column.
    Argument
        analysis_results: dataframe with an `index` column and `input_column`
        input_column: str
        output_column: str
        classes: list
        backup_classes: list
    Return
        dict with:
            classified_results: dataframe
            empty_indices: list of indices whose classification failed
    """
    client = OpenAI(organization=ORGANIZATION_ID)
    classified_results = analysis_results.copy()
    empty_indices = []
    labels = []
    for idx, evidence in zip(analysis_results['index'], analysis_results[input_column]):
        try:
            label = json.loads(compose_classication(client, evidence, classes=classes, backup_classes=backup_classes))['category']
            labels.append(label)
        except Exception as e:
            print(f"# CLASSIFICATION error: idx = {idx}, error = {e}")
            labels.append("")
            empty_indices.append(idx)
    classified_results[output_column] = labels
    return {
        "classified_results": classified_results,
        "empty_indices": empty_indices
    }
def classify_results_mp(extracted_results: pd.DataFrame, classified_file_path, classes, backup_classes, n_processes: int = 4):
    """
    Run classify_results over chunks of the dataframe in parallel and cache the merged output.
    Argument
        extracted_results: dataframe
        classified_file_path: str, joblib cache path
        classes: ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)']
        backup_classes: ['中式', '西式']
        n_processes: int
    Return
        classified_results: dict of dataframe and list (see classify_results)
    Reference
        200 records, 4 processes, 122.4695s
    """
    st = time.time()
    # classified_file_path = "data/classified_result.joblib"
    if not os.path.exists(classified_file_path):
        split_data = split_dataframe(extracted_results, n_processes)
        with mp.Pool(n_processes) as pool:
            classified_results = pool.starmap(
                classify_results,
                [(
                    d,
                    'evidence',
                    'classified_category',
                    classes,
                    backup_classes
                ) for d in split_data]
            )
        classified_results = merge_results(classified_results, dataframe_columns=['classified_results'], list_columns=['empty_indices'])
        with open(classified_file_path, "wb") as f:
            joblib.dump(classified_results, f)
    else:
        with open(classified_file_path, "rb") as f:
            classified_results = joblib.load(f)
    print(f"total time: {time.time() - st}")
    return classified_results
def test_get_evidence_classification():
    analysis_results = classify_results(analysis_results)
    patch_analysis_results = classify_results(patch_analysis_results)
def compose_query(address, name, with_index: bool = True):
    """
    Argument
        # d: series with d[1]: 地址, d[4]: 營業人名稱 #
        address: str
        name: str
        with_index: bool
    Return
        query: str, "`縣市` `營業人名稱`" (the first three characters of the address, i.e. the county/city, plus the store name)
    """
    # if with_index: # .itertuples()
    #     query = f"{d[1][:3]} {d[4]}"
    # else:
    #     query = f"{d[0][:3]} {d[3]}"
    query = f"{address[:3]} {name}"
    return query
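# For example (values taken from the sample evidence used elsewhere in this file):
#   compose_query("臺中市西區民龍里臺灣大道2段159號1樓", "年年有魚餐飲有限公司")
#   -> "臺中市 年年有魚餐飲有限公司"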
def crawl_results(data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
    """
    Crawl SERP results for every lead and condense them into evidence strings.
    Argument
        data: dataframe of leads (columns as in get_leads)
        google_domain: str
        gl: str
        lr: str
    Return
        dict with:
            crawled_results: dataframe
            empty_indices: list of positions that failed to crawl or condense
    Reference
        200 records, 4 processes, 171.36490321159363
    """
    serp_results = []
    condensed_results = []
    crawled_results = []
    empty_indices = []
    for i, d in tqdm(enumerate(data.itertuples())):
        idx = d[0]            # dataframe index
        address = d[1]        # 營業地址
        business_id = d[2]    # 統一編號
        business_name = d[4]  # 營業人名稱
        query = compose_query(address, business_name)
        try:
            res = get_serp(query, google_domain, gl, lr)
            serp_results.append(res)
        except Exception as e:
            print(f"# SERP error: i = {i}, idx = {idx}, query = {query}, error = {e}")
            empty_indices.append(i)
            continue
        try:
            cond_res = get_condensed_result(res)
            condensed_results.append(cond_res)
        except Exception as e:
            print(f"# CONDENSE error: i = {i}, idx = {idx}, res = {res}, error = {e}")
            empty_indices.append(i)
            continue
        crawled_results.append({
            "index": idx,
            "business_id": business_id,
            "business_name": business_name,
            "serp": res,
            "evidence": cond_res,
            "address": address,
        })
    crawled_results = pd.DataFrame(crawled_results)
    return {
        "crawled_results": crawled_results,
        "empty_indices": empty_indices
    }
def crawl_results_mp(data: pd.DataFrame, crawl_file_path: str, n_processes: int = 4):
    """
    Run crawl_results over chunks of the lead list in parallel and cache the merged output.
    """
    st = time.time()
    # crawl_file_path = "data/crawled_results.joblib"
    if not os.path.exists(crawl_file_path):
        split_data = split_dataframe(data, n_processes)
        with mp.Pool(n_processes) as pool:
            crawled_results = pool.map(crawl_results, split_data)
        crawled_results = merge_results(crawled_results, dataframe_columns=['crawled_results'], list_columns=['empty_indices'])
        with open(crawl_file_path, "wb") as f:
            joblib.dump(crawled_results, f)
    else:
        with open(crawl_file_path, "rb") as f:
            crawled_results = joblib.load(f)
    print(f"total time: {time.time() - st}")
    return crawled_results
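# Note: each *_mp helper caches its merged result to the given joblib path and simply
# reloads it on subsequent runs; delete the cache file to force a fresh crawl/extraction.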
def extract_results(data: pd.DataFrame):
    """
    Extract structured store information from each row's condensed evidence.
    Argument
        data: dataframe with columns `index`, `business_id`, `business_name`, `serp`, `evidence`, `address`
    Return
        dict with:
            extracted_results: dataframe of extracted fields (store_name, address, description, category, phone_number)
            empty_indices: list of positions whose analysis failed
    """
    client = OpenAI(organization=ORGANIZATION_ID)
    extracted_results = []
    empty_indices = []
    for i, d in tqdm(enumerate(data.itertuples())):
        idx = d[1]            # `index` column
        evidence = d.evidence
        business_id = d[2]    # `business_id` column
        business_name = d[3]  # `business_name` column
        address = d[6]        # `address` column
        query = compose_query(address, business_name)
        try:
            ana_res = compose_analysis(client, query=query, search_results=evidence)
            ana_res = json.loads(ana_res)
        except Exception as e:
            print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
            empty_indices.append(i)
            continue
        extracted_results.append({
            "index": idx,
            "business_id": business_id,
            "business_name": business_name,
            "evidence": evidence,
            **ana_res
        })
    extracted_results = pd.DataFrame(extracted_results)
    return {
        "extracted_results": extracted_results,
        "empty_indices": empty_indices
    }
def extract_results_mp(crawled_results, extracted_file_path, n_processes: int = 4):
    """
    Run extract_results over chunks of the crawled results in parallel and cache the merged output.
    Argument
        crawled_results: dataframe (the `crawled_results` frame from crawl_results_mp)
        extracted_file_path: str, joblib cache path
        n_processes: int
    Return
        extracted_results: dict of dataframe and list (see extract_results)
    Reference
        200 records, 4 processes, 502.26914715766907
    """
    st = time.time()
    # extracted_file_path = "data/extracted_results.joblib"
    if not os.path.exists(extracted_file_path):
        split_data = split_dataframe(crawled_results, n_processes)
        with mp.Pool(n_processes) as pool:
            extracted_results = pool.map(extract_results, split_data)
        extracted_results = merge_results(extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
        with open(extracted_file_path, "wb") as f:
            joblib.dump(extracted_results, f)
    else:
        with open(extracted_file_path, "rb") as f:
            extracted_results = joblib.load(f)
    print(f"total time: {time.time() - st}")
    return extracted_results
def test_get_analysis_results():
    data = pd.read_csv("data/餐廳類型分類.xlsx - 測試清單.csv")
    res = extract_results(data)
    analysis_results, empty_indices = res['extracted_results'], res['empty_indices']
def postprocess_result(results: pd.DataFrame, postprocessed_results_path, category_hierarchy: dict, column_name: str = 'category'):
    """
    Map each row's category to its supercategory and cache the result.
    Argument
        results: dataframe with a category column (named by `column_name`)
        postprocessed_results_path: str, joblib cache path
        category_hierarchy: dict, category -> supercategory
        column_name: str
    Return
        postprocessed_results: dataframe with an added `supercategory` column
    """
    # index = analysis_result['result']['index']
    # store_name = data.loc[index]['營業人名稱'] if len(analysis_result['result'].get('store_name',''))==0 else analysis_result['result']['store_name']
    # address = data.loc[index]['營業地址'] if len(analysis_result['result'].get('address',''))==0 else analysis_result['result']['address']
    # post_res = {
    #     "evidence": analysis_result['evidence'],
    #     "index": index,
    #     "begin_date": data.loc[index]['設立日期'],
    #     "store_name": store_name,
    #     "address": address,
    #     "description": analysis_result['result'].get('description', ""),
    #     "phone_number": analysis_result['result'].get('phone_number', ""),
    #     "category": analysis_result['result'].get('category', ""),
    #     "supercategory": category_hierarchy.get(analysis_result['result'].get('category', ""), analysis_result['result'].get('category',"")),
    # }
    if not os.path.exists(postprocessed_results_path):
        postprocessed_results = results.copy()
        postprocessed_results['supercategory'] = postprocessed_results[column_name].apply(lambda x: category_hierarchy.get(x, ''))
        with open(postprocessed_results_path, "wb") as f:
            joblib.dump(postprocessed_results, f)
    else:
        with open(postprocessed_results_path, "rb") as f:
            postprocessed_results = joblib.load(f)
    return postprocessed_results
def test_postprocess_result():
    analysis_result = ""
    pos_res = postprocess_result(analysis_result)
def combine_results(results: pd.DataFrame, combined_results_path: str, src_column: str = 'classified_category', tgt_column: str = 'category', strategy: str = 'replace'):
    """
    Merge the classified category column into the extracted category column and cache the result.
    Argument
        results: dataframe
        combined_results_path: str, joblib cache path
        src_column: str
        tgt_column: str
        strategy: str, 'replace' (overwrite whenever the two columns differ) or 'patch' (fill only empty targets)
    Return
        combined_results: dataframe
    """
    if not os.path.exists(combined_results_path):
        combined_results = results.copy()
        if strategy == 'replace':
            condition = (combined_results[tgt_column] == '') | (combined_results[src_column] != combined_results[tgt_column])
            combined_results.loc[condition, tgt_column] = combined_results.loc[condition, src_column].values
        elif strategy == 'patch':
            condition = (combined_results[tgt_column] == '')
            combined_results.loc[condition, tgt_column] = combined_results.loc[condition, src_column].values
        else:
            raise Exception(f"Strategy {strategy} not implemented")
        with open(combined_results_path, "wb") as f:
            joblib.dump(combined_results, f)
    else:
        with open(combined_results_path, "rb") as f:
            combined_results = joblib.load(f)
    return combined_results
def format_evidence(evidence):
    """
    Render a JSON evidence string as human-readable, numbered lines.
    """
    formatted = []
    evidence = json.loads(evidence)
    # print( len(evidence) )
    for i in range(len(evidence)):
        if 'title' in evidence[i] and '顧客評價' in evidence[i]:
            f = f"\n> 顧客評價: {evidence[i]['顧客評價']}"
        elif 'title' in evidence[i] and evidence[i]['title'] == '類似的店':
            f = f"\n> 類似的店: {evidence[i]['snippet']}"
        elif 'status' in evidence[i]:
            f = f"\n> 經營狀態: {evidence[i]['status']}"
        elif 'telephone_number' in evidence[i]:
            f = f"\n> 電話號碼: {evidence[i]['telephone_number']}"
        else:
            try:
                f = f"{i+1}. {evidence[i]['title']} ({evidence[i].get('snippet', '')})"
            except KeyError:
                print(evidence[i])
                raise
        formatted.append(f)
    return "\n".join(formatted)
def format_output(df: pd.DataFrame, input_column: str = 'evidence', output_column: str = 'formatted_evidence', format_func=format_evidence):
    """
    Argument
        df: dataframe with an `input_column` of JSON evidence strings
        input_column: str
        output_column: str
        format_func: callable used to format each evidence string
    Return
        formatted_df: dataframe with the added `output_column`
    """
    formatted_df = df.copy()
    formatted_df[output_column] = formatted_df[input_column].apply(format_func)
    return formatted_df
def merge_results(results: list, dataframe_columns: list, list_columns: list):
    """
    Argument
        results: a list of result dicts (one per process)
        dataframe_columns: list of keys whose values are dataframes to concatenate
        list_columns: list of keys whose values are lists to chain together
    Return
        merged_results: dict
    """
    assert len(results) > 0, "No results to merge"
    merged_results = {}
    for key in dataframe_columns:
        merged_results[key] = pd.concat([r[key] for r in results], ignore_index=True)
    for key in list_columns:
        merged_results[key] = list(itertools.chain(*[r[key] for r in results]))
    return merged_results
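# Usage sketch (mirrors the *_mp helpers above): each worker returns a dict such as
# {"crawled_results": <DataFrame>, "empty_indices": [..]}; merge_results concatenates the
# dataframes and chains the lists across workers into a single dict of the same shape.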
def split_dataframe(df: pd.DataFrame, n_processes: int = 4) -> list:
    """
    Split a dataframe into chunks of ceil(len(df) / n_processes) rows for multiprocessing.
    """
    n = df.shape[0]
    n_per_process = math.ceil(n / n_processes)
    return [df.iloc[i:i + n_per_process] for i in range(0, n, n_per_process)]
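# For example, a 10-row dataframe with n_processes=4 is split into chunks of
# ceil(10 / 4) = 3 rows each: sizes [3, 3, 3, 1].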
def main(args):
    """
    Argument
        args: argparse.Namespace
    """
    ## Load the lead list ##
    data = get_leads(args.data_path)
    ## Crawl and condense SERP results ##
    # crawled_results = crawl_results(data)
    crawled_results = crawl_results_mp(data, args.crawled_file_path, n_processes=args.n_processes)
    ## Approach 1: extract key information and classify ##
    # extracted_results = extract_results(
    #     crawled_results['crawled_results']
    # )
    extracted_results = extract_results_mp(
        crawled_results=crawled_results['crawled_results'],
        extracted_file_path=args.extracted_file_path,
        n_processes=args.n_processes
    )
    ## Approach 2: classify the crawled evidence directly ##
    # classified_results = classify_results(
    #     extracted_results['extracted_results'],
    #     input_column = 'evidence',
    #     output_column = 'classified_category',
    #     classes = ['中式', '西式'],
    #     backup_classes = [ '中式', '西式']
    # )
    classified_results = classify_results_mp(
        extracted_results['extracted_results'],
        args.classified_file_path,
        classes=args.classes,
        backup_classes=args.backup_classes,
        n_processes=args.n_processes
    )
    ## Combine the analysis results ##
    combined_results = combine_results(
        classified_results['classified_results'],
        args.combined_file_path,
        src_column='classified_category',
        tgt_column='category',
        strategy='replace'
    )
    ## Post-process the analysis results ##
    postprocessed_results = postprocess_result(
        combined_results,
        args.postprocessed_results,
        category2supercategory
    )
    formatted_results = format_output(postprocessed_results, input_column='evidence', output_column='formatted_evidence', format_func=format_evidence)
    formatted_results.to_csv("data/formatted_results.csv", index=False)
category2supercategory = {
    "小吃店": "中式",
    "日式料理(含居酒屋,串燒)": "中式",
    "火(鍋/爐)": "中式",
    "東南亞料理(不含日韓)": "中式",
    "海鮮熱炒": "中式",
    "特色餐廳(含雞、鵝、牛、羊肉)": "中式",
    "傳統餐廳": "中式",
    "燒烤": "中式",
    "韓式料理(含火鍋,烤肉)": "中式",
    "西餐廳(含美式,義式,墨式)": "西式",
    "中式": "中式",
    "西式": "西式"
}

supercategory2category = {
    "中式": [
        "小吃店",
        "日式料理(含居酒屋,串燒)",
        "火(鍋/爐)",
        "東南亞料理(不含日韓)",
        "海鮮熱炒",
        "特色餐廳(含雞、鵝、牛、羊肉)",
        "傳統餐廳",
        "燒烤",
        "韓式料理(含火鍋,烤肉)"
    ],
    "西式": ["西餐廳(含美式,義式,墨式)"]
}
if __name__ == '__main__':
    base = "https://serpapi.com/search.json"
    engine = 'google'
    # query = "Coffee"
    google_domain = 'google.com.tw'
    gl = 'tw'
    lr = 'lang_zh-TW'
    # url = f"{base}?engine={engine}&q={query}&google_domain={google_domain}&gl={gl}&lr={lr}"
    n_processes = 4
    client = OpenAI(organization=ORGANIZATION_ID)
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
    parser.add_argument("--classified_file_path", type=str, default="data/classified_results.joblib")
    parser.add_argument("--extracted_file_path", type=str, default="data/extracted_results.joblib")
    parser.add_argument("--crawled_file_path", type=str, default="data/crawled_results.joblib")
    parser.add_argument("--combined_file_path", type=str, default="data/combined_results.joblib")
    parser.add_argument("--postprocessed_results", type=str, default="data/postprocessed_results.joblib")
    # nargs='+' (rather than type=list) so the category lists can actually be overridden from the CLI
    parser.add_argument("--classes", type=str, nargs='+', default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
    parser.add_argument("--backup_classes", type=str, nargs='+', default=['中式', '西式'])
    parser.add_argument("--n_processes", type=int, default=4)
    args = parser.parse_args()
    main(args)
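# Example invocation (the script filename below is a placeholder for whatever this file is saved as):
#   python classify_leads.py --data_path "data/餐廳類型分類.xlsx - 測試清單.csv" --n_processes 4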