linpershey committed on
Commit
4925baf
1 Parent(s): d8e228f

remove from lfs and add back

Files changed (2)
  1. app.py +112 -3
  2. sheet.py +671 -3
app.py CHANGED
@@ -1,3 +1,112 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1361c1e04f9071fadb5137915387ace77713c1d919db55da04e338b4d69eee35
- size 3816
+ import os
+ import logging
+
+ import gradio as gr
+ import pandas as pd
+ from dotenv import load_dotenv
+ import jieba
+ jieba.cut('你好')  # warm up jieba's dictionary at import time so the first request is fast
+ from wordcloud import WordCloud
+ from PIL import Image
+ import matplotlib.pyplot as plt
+
+ from sheet import compose_query, get_serp, get_condensed_result, extract_results, postprocess_result, format_output, category2supercategory
+
+ load_dotenv()
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+
+
+ def plot_wordcloud(text):
+     """Generate a word-cloud image from text, using the font at FONT_PATH (needed for CJK glyphs) if configured."""
+     if os.getenv("FONT_PATH", None) is not None:
+         wc_generator = WordCloud(font_path=os.getenv("FONT_PATH"))
+     else:
+         wc_generator = WordCloud()
+     img = wc_generator.generate(" ".join(jieba.cut(text)))
+     # fig, ax = plt.subplots()
+     # ax.imshow(wordcloud, interpolation='bilinear')
+     # ax.axis("off")
+     return img.to_image()
+
+ def format_category(formatted_results):
+     """Render the classified fields of the first result row as Markdown quote lines."""
+     return "\n\n".join([
+         f"> 大類別:{formatted_results['supercategory'].values[0]}",
+         f"> 小類別:{formatted_results['category'].values[0]}",
+         f"> 商家名稱:{formatted_results['store_name'].values[0]}",
+         f"> 電話:{formatted_results['phone_number'].values[0]}",
+         f"> 描述:{formatted_results['description'].values[0]}"
+     ])
+
+ def do(business_id, business_name, address):
+     """Run the crawl -> extract -> post-process -> format pipeline for a single store."""
+     crawled_results = []
+
+     google_domain = "google.com.tw"
+     gl = 'tw'
+     lr = 'lang_zh-TW'
+
+     query = compose_query(address, business_name)
+     try:
+         res = get_serp(query, google_domain, gl, lr)
+     except Exception as e:
+         return f"Error: {e}"
+
+     cond_res = get_condensed_result(res)
+
+     crawled_results.append({
+         "index": 0,
+         "business_id": business_id,
+         "business_name": business_name,
+         "serp": res,
+         "evidence": cond_res,
+         "address": address
+     })
+
+     crawled_results = pd.DataFrame(crawled_results)
+     # logger.debug(crawled_results)
+     extracted_results = extract_results(crawled_results)
+     # logger.error(extracted_results['extracted_results'].columns)
+     extracted_results = extracted_results['extracted_results'][['business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name']]
+
+     postprocessed_results = postprocess_result(extracted_results, postprocessed_results_path="/tmp/postprocessed_results.joblib", category_hierarchy=category2supercategory)
+     os.remove("/tmp/postprocessed_results.joblib")  # drop the cache so the next request is not served stale results
+
+     formatted_results = format_output(postprocessed_results)
+     # logger.error(formatted_results.columns)
+
+     formatted_output = format_category(formatted_results)
+
+     img = plot_wordcloud(formatted_results['formatted_evidence'].values[0])
+     return formatted_results['formatted_evidence'].values[0], img, formatted_output
+
+ ## --- interface --- ##
+ # outputs = [gr.Dataframe(row_count=(1, "dynamic"), col_count=(6, "dynamic"), label="output data", interactive=1)]
+ # demo = gr.Interface(
+ #     fn=do,
+ #     inputs=["text", "text", "text"],
+ #     outputs=outputs,
+ # )
+
+ ## --- block --- ##
+ with gr.Blocks() as demo:
+     gr.Markdown("🌟 自動分類餐廳型態 🌟")
+     with gr.Row():
+         inputs = [gr.Textbox(label="統一編號", placeholder="統一編號"), gr.Textbox(placeholder="商家名稱"), gr.Textbox(placeholder="地址")]
+     with gr.Row():
+         # outputs = [gr.Dataframe(row_count=(1, "dynamic"), col_count=(6, "dynamic"), label="output data", interactive=1)]
+         outputs = [gr.Markdown(label="參考資料(google search)"), gr.Image(label="文字雲"), gr.Markdown(label="類別")]
+     btn = gr.Button("Submit")
+     btn.click(fn=do, inputs=inputs, outputs=outputs)
+
+
+ if __name__ == "__main__":
+     demo.launch(share=True, auth=("kota", "kota"))
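For reference, `format_category` only ever reads the first row of the DataFrame it receives, so the contract between the pipeline and the UI can be exercised offline without SerpAPI or OpenAI credentials. A minimal sketch, assuming the dependencies are installed (importing `app` builds the Blocks UI and warms up jieba, but does not launch the server); the sample values below are made up:

```python
import pandas as pd

from app import format_category

# Hypothetical single-row result, mimicking the columns the pipeline produces.
row = pd.DataFrame([{
    "supercategory": "中式",
    "category": "小吃店",
    "store_name": "達米娜魚料理店",
    "phone_number": "04 2376 6318",
    "description": "魚料理與在地小吃",
}])

print(format_category(row))  # five "> ..." quote lines, blank-line separated
```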
sheet.py CHANGED
@@ -1,3 +1,671 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:01d56594e5b1014193942ccac5bed55f04a0927aece2617172fabff1794745ad
- size 30077
+ import os
+ import time
+ import json
+ import joblib
+ import math
+ import itertools
+ import argparse
+ import multiprocessing as mp
+
+ import pandas as pd
+ from dotenv import load_dotenv
+ from serpapi import GoogleSearch
+ import tiktoken
+ from openai import OpenAI
+ from tqdm import tqdm
+
+ load_dotenv()
+ ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
+ SERP_API_KEY = os.getenv('SERP_APIKEY')
+
+
+ def get_leads(file_path: str, names: list = ['營業地址', '統一編號', '總機構統一編號', '營業人名稱', '資本額', '設立日期', '組織別名稱', '使用統一發票',
+                                              '行業代號', '名稱', '行業代號1', '名稱1', '行業代號2', '名稱2', '行業代號3', '名稱3']):
+     """Load the lead-list CSV and attach the expected column names."""
+     assert os.path.exists(file_path)
+     data = pd.read_csv(file_path, names=names)
+     return data
+
+ def get_serp(query: str, google_domain: str, gl: str, lr: str) -> dict:
+     """Query SerpAPI with the given query and locale settings; return the raw result dict."""
+     search = GoogleSearch({
+         "q": query,
+         'google_domain': google_domain,
+         'gl': gl,
+         'lr': lr,
+         "api_key": SERP_API_KEY
+     })
+     result = search.get_dict()
+     # print(result['organic_results'][0])
+     # return result['organic_results'][0]
+     return result
+
+ def test_get_serp():
+     # query = "原味商行"
+     # query = "南投縣中寮鄉中寮村鄉林巷43號 和興商店"
+     # query = "啓輝環管企業社"
+     # query = "蘭陽客棧小吃店"
+     # query = '韓笑味食品有限公司'
+     # query = '小阿姨的店'
+     query = '達米娜魚料理店'
+     res = get_serp(query, google_domain='google.com.tw', gl='tw', lr='lang_zh-TW')  # gl/lr were missing in the original call
+     print(res)
+
+ def get_condensed_result(result):
+     """
+     Argument
+         result: raw SerpAPI result dict
+     Return
+         condensed_result: JSON string keeping only organic titles/snippets plus selected knowledge-graph fields
+     Example:
+         result['knowledge_graph'].keys() # 'title', 'thumbnail', 'type', 'entity_type', 'kgmid', 'knowledge_graph_search_link', 'serpapi_knowledge_graph_search_link', 'tabs', 'place_id', 'directions', 'local_map', 'rating', 'review_count', '服務項目', '地址', '地址_links', 'raw_hours', 'hours', '電話號碼', '電話號碼_links', 'popular_times', 'user_reviews', 'reviews_from_the_web', 'unclaimed_listing', '個人資料', '其他人也搜尋了以下項目', '其他人也搜尋了以下項目_link', '其他人也搜尋了以下項目_stick'
+     """
+     filtered_results = [
+         {"title": r.get('title', ""), 'snippet': r.get('snippet', "")} for r in result['organic_results']
+     ]
+     if 'knowledge_graph' in result:
+         if 'user_reviews' in result['knowledge_graph']:
+             filtered_results.append({'title': result['knowledge_graph']['title'], '顧客評價': "\t".join([_.get('summary', '') for _ in result['knowledge_graph']['user_reviews']])})
+         if '其他人也搜尋了以下項目' in result['knowledge_graph']:
+             filtered_results.append({'title': "類似的店", 'snippet': "\t".join([str(_.get('extensions', '')) for _ in result['knowledge_graph']['其他人也搜尋了以下項目']])})
+         if '暫停營業' in result['knowledge_graph']:
+             filtered_results.append({'status': '暫停營業' if result['knowledge_graph']['暫停營業'] else '營業中'})
+         if '電話號碼' in result['knowledge_graph']:
+             filtered_results.append({'telephone_number': result['knowledge_graph']['電話號碼']})
+     condensed_result = json.dumps(filtered_results, ensure_ascii=False)
+     # print(condensed_result)
+     return condensed_result
+
+ def test_get_condensed_result():
+     # query = "原味商行"
+     # query = "南投縣中寮鄉中寮村鄉林巷43號 和興商店"
+     # query = "啓輝環管企業社"
+     # query = "蘭陽客棧小吃店"
+     # query = '韓笑味食品有限公司'
+     # query = '小阿姨的店'
+     query = '達米娜魚料理店'
+     res = get_serp(query, google_domain='google.com.tw', gl='tw', lr='lang_zh-TW')  # required arguments were missing in the original call
+     cond_res = get_condensed_result(res)
+     print(cond_res)
+
+ def compose_analysis(client, query, search_results):
+     """
+     Argument
+         client: OpenAI client
+         query: str
+         search_results: str
+     Return
+         response: str, JSON with `store_name`, `address`, `description`, `category` and `phone_number`
+     """
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {
+                 "role": "system",
+                 "content": '''
+                     As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
+                     your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)` or `西餐廳(含美式,義式,墨式)`.
+                     It's very important to omit unrelated results. Do not make up any assumption.
+                     Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
+                     If no relevant information has been found, simply output json with empty values.
+                     I'll tip you and guarantee a place in heaven if you do a great job completely according to my instruction.
+                 '''
+             },
+             {
+                 "role": "user",
+                 "content": f'''
+                     `query`: `{query}`,
+                     `search_results`: {search_results}
+                 ''',
+             }
+         ],
+         model="gpt-4-0125-preview",
+         response_format={"type": "json_object"},
+         temperature=0,
+         # stream=True
+     )
+     # response = []
+     # for chunk in chat_completion:
+     #     text = chunk.choices[0].delta.content or ""
+     #     response.append(text)
+     #     print(text, end="")
+     # return "".join(response)
+     response = chat_completion.choices[0].message.content
+     return response
+
+ def test_compose_analysis():
+     # query = "原味商行"
+     # query = "南投縣中寮鄉中寮村鄉林巷43號 和興商店"
+     # query = "啓輝環管企業社"
+     # query = "蘭陽客棧小吃店"
+     # query = '韓笑味食品有限公司'
+     # query = '小阿姨的店'
+     query = '達米娜魚料理店'
+     client = OpenAI(organization=ORGANIZATION_ID)  # the original relied on a `client` global defined under __main__
+     res = get_serp(query, google_domain='google.com.tw', gl='tw', lr='lang_zh-TW')
+     cond_res = get_condensed_result(res)
+     resp = compose_analysis(client, query=query, search_results=cond_res)
+     print(resp)
+
+ def compose_classication(
+         client,
+         evidence,
+         classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
+         backup_classes: list = ['中式', '西式'],
+ ) -> str:
+     """
+     Argument
+         client: OpenAI client
+         evidence: str
+         classes: list
+         backup_classes: list
+     Return
+         response: str, JSON with a single `category` field
+     """
+     if isinstance(classes, list):
+         classes = ", ".join([f"`{x}`" for x in classes])
+     elif isinstance(classes, str):
+         pass
+     else:
+         raise Exception(f"Incorrect classes type: {type(classes)}")
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {
+                 "role": "system",
+                 "content": f'''
+                     As a helpful and rigorous retail analyst, given the provided information about a store,
+                     your task is two-fold. First, classify the evidence below into the most relevant category from the following: {classes}.
+                     Second, if no relevant information has been found, classify the evidence into the most relevant supercategory from the following: {backup_classes}.
+                     It's very important to omit unrelated pieces of evidence and not to make up any assumption.
+                     Please think step by step, and output in json format. An example output json is like {{"category": "..."}}
+                     If no relevant piece of information can ever be found at all, simply output json with an empty string "".
+                     I'll tip you and guarantee a place in heaven if you do a great job completely according to my instruction.
+                 '''
+             },
+             {
+                 "role": "user",
+                 "content": f'''
+                     `evidence`: `{evidence}`
+                 ''',
+             }
+         ],
+         model="gpt-4-0125-preview",
+         response_format={"type": "json_object"},
+         temperature=0,
+         # stream=True
+     )
+     response = chat_completion.choices[0].message.content
+     return response
+
+ def test_compose_classification():
+     evidence = '[{"title": "年年有魚餐飲有限公司- 店家介紹", "snippet": "統一編號. 93769370 · 公司狀況. 營業中 · 公司名稱. 年年有魚餐飲有限公司 · 公司類型. 有限公司 · 資本總額. 6000000 · 所在地. 臺中市西區民龍里臺灣大道2段159號1樓."}, {"title": "年年有魚餐飲有限公司", "snippet": "營業地址, 臺中市西區民龍里臺灣大道2段159號1樓 ; 統編, 93769370 ; 營業名稱, 年年有魚餐飲有限公司 ; 資本額, 6,000,000 ; 設立日期, 1120713."}, {"title": "年年有魚餐飲有限公司", "snippet": "公司名稱, 年年有魚餐飲有限公司 ; 資本總額(元), 6,000,000 ; 負責人, 江敏 ; 登記地址, 看地圖 臺中市西區民龍里臺灣大道二段159號1樓 郵遞區號查詢 ; 設立 ..."}, {"title": "年年有魚餐飲有限公司", "snippet": "年年有魚餐飲有限公司 ; 負責人, 江敏 ; 登記地址, 台中市西區民龍里台灣大道二段159號1樓 ; 公司狀態, 核准設立 ; 資本額, 6,000,000元 ; 所在縣市, 台中市 西區 民龍里."}, {"title": "江_敏-年年有魚餐飲有限公司", "snippet": "負責人:江_敏·公司名:年年有魚餐飲有限公司·統一編號:93769370·公司地址:臺中市西區民龍里臺灣大道二段159號1樓·資本額:6000000·公司狀況:核准設立·核准設立 ..."}, {"title": "年年有魚餐飲有限公司/負責人:江_敏", "snippet": "公司名稱:年年有魚餐飲有限公司·代表人姓名:江_敏·公司所在地:臺中市西區民龍里臺灣大道二段159號1樓·統編:93769370資本總額:6000000·公司狀況:核准設立·核准設立 ..."}, {"title": "貓吃魚餐飲有限公司|工作徵才簡介", "snippet": "貓吃魚餐飲有限公司. 台中市西屯區. 時薪186元. 應徵人數:1 ~ 5人. 排休; 晚班; 工作經驗不拘; 學歷不拘. 1.佈置及清理餐桌2.為顧客帶位或安排座位3.上菜並提供有關用餐的 ..."}, {"title": "食力餐飲_食力國際有限公司|公司簡介", "snippet": "「食力國際有限公司」正式成立於2023年4月,目前短短時間已成立了四個品牌~ 一、【食力據點】 1:食力咖哩- 台中遠百店(台中市西屯區臺灣大道三段251號大遠百12樓大食 ..."}, {"title": "112 年臺中市優質餐飲店家分級評核獲獎名單", "snippet": "112 年臺中市優質餐飲店家分級評核獲獎名單-. 臺中市餐廳飲食店低碳認證書20 家. 1 築間幸福鍋物-臺中市政二店臺中市西屯區文心路二段213 號. 2 有之和牛-臺中文心店."}, {"title": "年年有魚水族館", "snippet": "營業地址, 臺中市西屯區何安里西屯路2段101-2號1樓 ; 統編, 21833774 ; 營業名稱, 年年有魚水族館 ; 資本額, 60,000 ; 設立日期, 0940502."}, {"title": "類似的店", "snippet": "[\'設計公司\']\\t[\'餐廳\']"}, {"telephone_number": "04 2376 6318"}]'
+     client = OpenAI(organization=ORGANIZATION_ID)  # the original called compose_classication without a client
+     x = compose_classication(client, evidence)
+     print(x)
+
+ def classify_results(
+         analysis_results: pd.DataFrame,
+         input_column: str = 'evidence',
+         output_column: str = 'classified_category',
+         classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
+         backup_classes: list = ['中式', '西式']
+ ):
+     """
+     Argument
+         analysis_results: dataframe
+         input_column: str
+         output_column: str
+         classes: list
+     Return
+         dict with a `classified_results` dataframe and an `empty_indices` list
+     """
+     client = OpenAI(organization=ORGANIZATION_ID)
+     classified_results = analysis_results.copy()
+     empty_indices = []
+     labels = []
+     for idx, evidence in zip(analysis_results['index'], analysis_results[input_column]):
+         try:
+             label = json.loads(compose_classication(client, evidence, classes=classes, backup_classes=backup_classes))['category']
+             labels.append(label)
+         except Exception as e:
+             print(f"# CLASSIFICATION error: {e}")
+             labels.append("")
+             empty_indices.append(idx)
+
+     classified_results[output_column] = labels
+     return {
+         "classified_results": classified_results,
+         "empty_indices": empty_indices
+     }
+
+ def classify_results_mp(extracted_results: pd.DataFrame, classified_file_path, classes, backup_classes, n_processes: int = 4):
+     """
+     Argument
+         extracted_results: dataframe
+         classified_file_path: str, cache path
+         classes: e.g. ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)']
+         backup_classes: e.g. ['中式', '西式']
+         n_processes: int
+     Return
+         classified_results: dict with a `classified_results` dataframe and an `empty_indices` list
+     Reference
+         200 records, 4 processes, 122.4695s
+     """
+     st = time.time()
+     # classified_file_path = "data/classified_result.joblib"
+     if not os.path.exists(classified_file_path):
+         split_data = split_dataframe(extracted_results)
+         with mp.Pool(n_processes) as pool:  # was `args.n_processes`, which only worked when run as a script
+             classified_results = pool.starmap(
+                 classify_results,
+                 [(
+                     d,
+                     'evidence',
+                     'classified_category',
+                     classes,
+                     backup_classes
+                 ) for d in split_data]
+             )
+         classified_results = merge_results(classified_results, dataframe_columns=['classified_results'], list_columns=['empty_indices'])
+         with open(classified_file_path, "wb") as f:
+             joblib.dump(classified_results, f)
+     else:
+         with open(classified_file_path, "rb") as f:
+             classified_results = joblib.load(f)
+     print(f"total time: {time.time() - st}")
+     return classified_results
+
+ def test_get_evidence_classification():
+     # assumes `analysis_results` / `patch_analysis_results` dataframes have been prepared beforehand
+     analysis_results = classify_results(analysis_results)
+     patch_analysis_results = classify_results(patch_analysis_results)
+
+ def compose_query(address, name, with_index: bool = True):
+     """
+     Argument
+         # d: series with d[1]: 地址, d[4]: 營業人名稱 #
+         address: str
+         name: str
+         with_index: bool
+     Return
+         query: `縣市` (the first three characters of the address) followed by `營業人名稱`
+     """
+     # if with_index:  # .itertuples()
+     #     query = f"{d[1][:3]} {d[4]}"
+     # else:
+     #     query = f"{d[0][:3]} {d[3]}"
+     query = f"{address[:3]} {name}"
+     return query
+
+ def crawl_results(data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
+     """
+     Argument
+         data: dataframe of leads (營業地址, 統一編號, ..., 營業人名稱)
+         google_domain: str
+         gl: str
+         lr: str
+     Return
+         dict with a `crawled_results` dataframe and an `empty_indices` list
+     Reference
+         200 records, 4 processes, 171.36490321159363s
+     """
+     serp_results = []
+     condensed_results = []
+     crawled_results = []
+     empty_indices = []
+     for i, d in tqdm(enumerate(data.itertuples())):
+         idx = d[0]
+         address = d[1]
+         business_id = d[2]
+         business_name = d[4]
+         query = compose_query(address, business_name)
+         try:
+             res = get_serp(query, google_domain, gl, lr)
+             serp_results.append(res)
+         except Exception as e:  # was a bare except; keep the cause in the log
+             print(f"# SERP error ({e}): i = {i}, idx = {idx}, query = {query}")
+             empty_indices.append(i)
+             continue
+         try:
+             cond_res = get_condensed_result(res)
+             condensed_results.append(cond_res)
+         except Exception as e:
+             print(f"# CONDENSE error ({e}): i = {i}, idx = {idx}, res = {res}")
+             empty_indices.append(i)
+             continue
+
+         crawled_results.append({
+             "index": idx,
+             "business_id": business_id,
+             "business_name": business_name,
+             "serp": res,
+             "evidence": cond_res,
+             "address": address,
+         })
+     crawled_results = pd.DataFrame(crawled_results)
+
+     return {
+         "crawled_results": crawled_results,
+         "empty_indices": empty_indices
+     }
+
+ def crawl_results_mp(data: pd.DataFrame, crawl_file_path: str, n_processes: int = 4):
+     """Crawl in parallel with `n_processes` workers, caching the merged results at `crawl_file_path`."""
+     st = time.time()
+     # crawl_file_path = "data/crawled_results.joblib"
+     if not os.path.exists(crawl_file_path):
+         split_data = split_dataframe(data)
+         with mp.Pool(n_processes) as pool:
+             crawled_results = pool.map(crawl_results, split_data)
+         crawled_results = merge_results(crawled_results, dataframe_columns=['crawled_results'], list_columns=['empty_indices'])
+         with open(crawl_file_path, "wb") as f:
+             joblib.dump(crawled_results, f)
+     else:
+         with open(crawl_file_path, "rb") as f:
+             crawled_results = joblib.load(f)
+     print(f"total time: {time.time() - st}")
+     return crawled_results
+
+ def extract_results(data: pd.DataFrame):
+     """
+     Argument
+         data: dataframe with `index`, `business_id`, `business_name`, `evidence` and `address` columns
+     Return
+         dict with an `extracted_results` dataframe (extracted fields) and an `empty_indices` list
+     """
+     client = OpenAI(organization=ORGANIZATION_ID)
+     extracted_results = []
+     empty_indices = []
+     for i, d in tqdm(enumerate(data.itertuples())):
+         idx = d[1]
+         evidence = d.evidence
+         business_id = d[2]
+         business_name = d[3]
+         address = d[6]
+         query = compose_query(address, business_name)
+         try:
+             ana_res = compose_analysis(client, query=query, search_results=evidence)
+             ana_res = json.loads(ana_res)
+         except Exception as e:
+             print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
+             empty_indices.append(i)
+             continue
+
+         extracted_results.append({
+             "index": idx,
+             "business_id": business_id,
+             "business_name": business_name,
+             "evidence": evidence,
+             **ana_res
+         })
+     extracted_results = pd.DataFrame(extracted_results)
+
+     return {
+         "extracted_results": extracted_results,
+         "empty_indices": empty_indices
+     }
+
+ def extract_results_mp(crawled_results, extracted_file_path, n_processes: int = 4):
+     """
+     Argument
+         crawled_results: dataframe
+         extracted_file_path: str, cache path
+         n_processes: int
+     Return
+         extracted_results: dict with an `extracted_results` dataframe and an `empty_indices` list
+     Reference
+         200 records, 4 processes, 502.26914715766907s
+     """
+     st = time.time()
+     # extracted_file_path = "data/extracted_results.joblib"
+     if not os.path.exists(extracted_file_path):
+         split_data = split_dataframe(crawled_results)
+         with mp.Pool(n_processes) as pool:  # was `args.n_processes`, which relied on a global
+             extracted_results = pool.map(extract_results, split_data)
+         extracted_results = merge_results(extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
+         with open(extracted_file_path, "wb") as f:
+             joblib.dump(extracted_results, f)
+     else:
+         with open(extracted_file_path, "rb") as f:
+             extracted_results = joblib.load(f)
+     print(f"total time: {time.time() - st}")
+     return extracted_results
+
+ def test_get_analysis_results():
+     data = pd.read_csv("data/餐廳類型分類.xlsx - 測試清單.csv")
+     res = extract_results(data)  # returns a dict, not a tuple
+     analysis_results, empty_indices = res['extracted_results'], res['empty_indices']
+
+ def postprocess_result(results: pd.DataFrame, postprocessed_results_path, category_hierarchy: dict, column_name: str = 'category'):
+     """
+     Argument
+         results: dataframe with the extracted fields, including `category`
+         postprocessed_results_path: str, cache path
+         category_hierarchy: dict mapping category -> supercategory
+     Return
+         postprocessed_results: dataframe with an added `supercategory` column
+     """
+     # index = analysis_result['result']['index']
+     # store_name = data.loc[index]['營業人名稱'] if len(analysis_result['result'].get('store_name',''))==0 else analysis_result['result']['store_name']
+     # address = data.loc[index]['營業地址'] if len(analysis_result['result'].get('address',''))==0 else analysis_result['result']['address']
+     # post_res = {
+     #     "evidence": analysis_result['evidence'],
+     #     "index": index,
+     #     "begin_date": data.loc[index]['設立日期'],
+     #     "store_name": store_name,
+     #     "address": address,
+     #     "description": analysis_result['result'].get('description', ""),
+     #     "phone_number": analysis_result['result'].get('phone_number', ""),
+     #     "category": analysis_result['result'].get('category', ""),
+     #     "supercategory": category_hierarchy.get(analysis_result['result'].get('category', ""), analysis_result['result'].get('category',"")),
+     # }
+     if not os.path.exists(postprocessed_results_path):
+         postprocessed_results = results.copy()
+         postprocessed_results['supercategory'] = postprocessed_results[column_name].apply(lambda x: category_hierarchy.get(x, ''))
+         with open(postprocessed_results_path, "wb") as f:
+             joblib.dump(postprocessed_results, f)
+     else:
+         with open(postprocessed_results_path, "rb") as f:
+             postprocessed_results = joblib.load(f)
+     return postprocessed_results
+
+ def test_postprocess_result():
+     # placeholder: a real dataframe of extracted results is required here
+     analysis_result = ""
+     pos_res = postprocess_result(analysis_result, "/tmp/postprocessed_results.joblib", category2supercategory)
+
+ def combine_results(results: pd.DataFrame, combined_results_path: str, src_column: str = 'classified_category', tgt_column: str = 'category', strategy: str = 'replace'):
+     """
+     Argument
+         results: dataframe
+         combined_results_path: str, cache path
+         src_column: str
+         tgt_column: str
+         strategy: str, 'replace' (overwrite whenever the columns differ) or 'patch' (fill only empty values)
+     Return
+         combined_results: dataframe
+     """
+     if not os.path.exists(combined_results_path):
+         combined_results = results.copy()
+         if strategy == 'replace':
+             condition = (combined_results[tgt_column] == '') | (combined_results[src_column] != combined_results[tgt_column])
+             combined_results.loc[condition, tgt_column] = combined_results[condition][src_column].values
+         elif strategy == 'patch':
+             condition = (combined_results[tgt_column] == '')
+             combined_results.loc[condition, tgt_column] = combined_results[condition][src_column].values
+         else:
+             raise Exception(f"Strategy {strategy} not implemented")
+         with open(combined_results_path, "wb") as f:
+             joblib.dump(combined_results, f)
+     else:
+         with open(combined_results_path, "rb") as f:
+             combined_results = joblib.load(f)
+     return combined_results
+
+ def format_evidence(evidence):
+     """Render one condensed-evidence JSON string as numbered/annotated Markdown lines."""
+     formatted = []
+     evidence = json.loads(evidence)
+     # print(len(evidence))
+     for i in range(len(evidence)):
+         if 'title' in evidence[i] and '顧客評價' in evidence[i]:
+             f = f"\n> 顧客評價: {evidence[i]['顧客評價']}"
+         elif 'title' in evidence[i] and evidence[i]['title'] == '類似的店':
+             f = f"\n> 類似的店: {evidence[i]['snippet']}"
+         elif 'status' in evidence[i]:
+             f = f"\n> 經營狀態: {evidence[i]['status']}"
+         elif 'telephone_number' in evidence[i]:
+             f = f"\n> 電話號碼: {evidence[i]['telephone_number']}"
+         else:
+             try:
+                 f = f"{i+1}. {evidence[i]['title']} ({evidence[i].get('snippet', '')})"
+             except KeyError:
+                 print(evidence[i])
+                 raise  # re-raise the original KeyError with its context
+         formatted.append(f)
+     return "\n".join(formatted)
+
+ def format_output(df: pd.DataFrame, input_column: str = 'evidence', output_column: str = 'formatted_evidence', format_func=format_evidence):
+     """
+     Argument
+         df: dataframe with an `evidence` column
+         input_column: str
+         output_column: str
+         format_func: callable applied to each row of `input_column`
+     Return
+         formatted_df: dataframe with an added `formatted_evidence` column
+     """
+     formatted_df = df.copy()
+     formatted_df[output_column] = formatted_df[input_column].apply(format_func)  # was hard-coded to format_evidence, ignoring format_func
+     return formatted_df
+
+ def merge_results(results: list, dataframe_columns: list, list_columns: list):
+     """
+     Argument
+         results: a list of dicts as returned by the per-chunk workers
+         dataframe_columns: keys whose values are dataframes to concatenate
+         list_columns: keys whose values are lists to chain
+     """
+     assert len(results) > 0, "No results to merge"
+     merged_results = {}
+     for key in dataframe_columns:  # the original wrapped this in a redundant outer loop over `results`
+         merged_results[key] = pd.concat([r[key] for r in results], ignore_index=True)
+
+     for key in list_columns:
+         merged_results[key] = list(itertools.chain(*[r[key] for r in results]))
+
+     return merged_results
+
+
+ def split_dataframe(df: pd.DataFrame, n_processes: int = 4) -> list:
+     """Split `df` into `n_processes` consecutive chunks of roughly equal size."""
+     n = df.shape[0]
+     n_per_process = math.ceil(n / n_processes)
+     return [df.iloc[i:i+n_per_process] for i in range(0, n, n_per_process)]
+
+ def main(args):
+     """
+     Argument
+         args: argparse.Namespace
+     """
+     ## Load the lead list ##
+     data = get_leads(args.data_path)
+
+     ## Crawl and condense the search results ##
+     # crawled_results = crawl_results(data)
+     crawled_results = crawl_results_mp(data, args.crawled_file_path, n_processes=args.n_processes)
+
+     ## Method 1: extract key fields, then classify ##
+     # extracted_results = extract_results(
+     #     crawled_results['crawled_results']
+     # )
+     extracted_results = extract_results_mp(
+         crawled_results=crawled_results['crawled_results'],
+         extracted_file_path=args.extracted_file_path,
+         n_processes=args.n_processes
+     )
+
+     ## Method 2: classify the crawled evidence directly ##
+     # classified_results = classify_results(
+     #     extracted_results['extracted_results'],
+     #     input_column='evidence',
+     #     output_column='classified_category',
+     #     classes=['中式', '西式'],
+     #     backup_classes=['中式', '西式']
+     # )
+     classified_results = classify_results_mp(
+         extracted_results['extracted_results'],
+         args.classified_file_path,
+         classes=args.classes,
+         backup_classes=args.backup_classes,
+         n_processes=args.n_processes
+     )
+
+     ## Combine the results of both methods ##
+     combined_results = combine_results(
+         classified_results['classified_results'],
+         args.combined_file_path,
+         src_column='classified_category',
+         tgt_column='category',
+         strategy='replace'
+     )
+
+     ## Post-process the combined results ##
+     postprocessed_results = postprocess_result(
+         combined_results,
+         args.postprocessed_results,
+         category2supercategory
+     )
+
+     formatted_results = format_output(postprocessed_results, input_column='evidence', output_column='formatted_evidence', format_func=format_evidence)
+     formatted_results.to_csv("data/formatted_results.csv", index=False)
+
+
+ category2supercategory = {
+     "小吃店": "中式",
+     "日式料理(含居酒屋,串燒)": "中式",
+     "火(鍋/爐)": "中式",
+     "東南亞料理(不含日韓)": "中式",
+     "海鮮熱炒": "中式",
+     "特色餐廳(含雞、鵝、牛、羊肉)": "中式",
+     "傳統餐廳": "中式",
+     "燒烤": "中式",
+     "韓式料理(含火鍋,烤肉)": "中式",
+     "西餐廳(含美式,義式,墨式)": "西式",
+     "中式": "中式",
+     "西式": "西式"
+ }
+
+ supercategory2category = {
+     "中式": [
+         "小吃店",
+         "日式料理(含居酒屋,串燒)",
+         "火(鍋/爐)",
+         "東南亞料理(不含日韓)",
+         "海鮮熱炒",
+         "特色餐廳(含雞、鵝、牛、羊肉)",
+         "傳統餐廳",
+         "燒烤",
+         "韓式料理(含火鍋,烤肉)"
+     ],
+     "西式": ["西餐廳(含美式,義式,墨式)"]
+ }
+
+ if __name__ == '__main__':
+
+     base = "https://serpapi.com/search.json"
+     engine = 'google'
+     # query = "Coffee"
+     google_domain = 'google.com.tw'
+     gl = 'tw'
+     lr = 'lang_zh-TW'
+     # url = f"{base}?engine={engine}&q={query}&google_domain={google_domain}&gl={gl}&lr={lr}"
+     n_processes = 4
+     client = OpenAI(organization=ORGANIZATION_ID)
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
+     parser.add_argument("--classified_file_path", type=str, default="data/classified_results.joblib")
+     parser.add_argument("--extracted_file_path", type=str, default="data/extracted_results.joblib")
+     parser.add_argument("--crawled_file_path", type=str, default="data/crawled_results.joblib")
+     parser.add_argument("--combined_file_path", type=str, default="data/combined_results.joblib")
+     parser.add_argument("--postprocessed_results", type=str, default="data/postprocessed_results.joblib")
+     # `type=list` would split a command-line value into single characters; accept space-separated strings instead
+     parser.add_argument("--classes", type=str, nargs='+', default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
+     parser.add_argument("--backup_classes", type=str, nargs='+', default=['中式', '西式'])
+     parser.add_argument("--n_processes", type=int, default=4)
+     args = parser.parse_args()
+
+     main(args)
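All three `*_mp` helpers follow the same split → `Pool.map`/`starmap` → `merge_results` pattern, so the plumbing can be sanity-checked offline without touching SerpAPI or OpenAI. A minimal sketch with synthetic data, assuming sheet.py's dependencies are importable; only `split_dataframe` and `merge_results` are exercised, and the column values are made up:

```python
import pandas as pd

from sheet import split_dataframe, merge_results

df = pd.DataFrame({"index": range(10), "evidence": [f"e{i}" for i in range(10)]})

# 4 consecutive slices of at most ceil(10 / 4) = 3 rows each
chunks = split_dataframe(df, n_processes=4)
assert len(chunks) == 4

# Wrap each chunk the way crawl_results shapes its output...
worker_outputs = [{"crawled_results": c, "empty_indices": []} for c in chunks]

# ...and merge exactly as crawl_results_mp does.
merged = merge_results(worker_outputs, dataframe_columns=["crawled_results"], list_columns=["empty_indices"])
assert len(merged["crawled_results"]) == len(df)
```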