Spaces:
Runtime error
Runtime error
Commit
·
c1ead4a
1
Parent(s):
6023585
fix prompt mistakes
Browse files
sheet.py
CHANGED
@@ -90,7 +90,7 @@ def test_get_condensed_result():
|
|
90 |
res = get_serp(query)
|
91 |
cond_res = get_condensed_result(res)
|
92 |
|
93 |
-
def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo-0125'):
|
94 |
"""
|
95 |
Argument
|
96 |
query: str
|
@@ -103,9 +103,9 @@ def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo
|
|
103 |
messages=[
|
104 |
{
|
105 |
"role": "system",
|
106 |
-
"content": '''
|
107 |
As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
|
108 |
-
your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be
|
109 |
It's very important to omit unrelated results. Do not make up any assumption.
|
110 |
Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
|
111 |
If no relevant information has been found, simply output json with empty values.
|
@@ -366,7 +366,7 @@ def crawl_results_mp( data: pd.DataFrame, crawl_file_path: str, n_processes: int
|
|
366 |
print( f"total time: {time.time() - st}")
|
367 |
return crawled_results
|
368 |
|
369 |
-
def extract_results( data: pd.DataFrame ):
|
370 |
"""
|
371 |
Argument
|
372 |
data: `evidence`, `result`
|
@@ -384,7 +384,7 @@ def extract_results( data: pd.DataFrame ):
|
|
384 |
address = d[6]
|
385 |
query = compose_query( address, business_name)
|
386 |
try:
|
387 |
-
ana_res = compose_analysis( client, query = query, search_results = evidence)
|
388 |
ana_res = json.loads(ana_res)
|
389 |
except Exception as e:
|
390 |
print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
|
@@ -405,7 +405,7 @@ def extract_results( data: pd.DataFrame ):
|
|
405 |
"empty_indices": empty_indices
|
406 |
}
|
407 |
|
408 |
-
def extract_results_mp( crawled_results, extracted_file_path):
|
409 |
"""
|
410 |
Argument
|
411 |
Return
|
@@ -417,7 +417,7 @@ def extract_results_mp( crawled_results, extracted_file_path):
|
|
417 |
if not os.path.exists(extracted_file_path):
|
418 |
split_data = split_dataframe( crawled_results)
|
419 |
with mp.Pool(args.n_processes) as pool:
|
420 |
-
extracted_results = pool.
|
421 |
extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
|
422 |
with open( extracted_file_path, "wb") as f:
|
423 |
joblib.dump( extracted_results, f)
|
@@ -630,7 +630,7 @@ category2supercategory = {
|
|
630 |
"西餐廳(含美式,義式,墨式)": "西式",
|
631 |
"中式": "中式",
|
632 |
"西式": "西式",
|
633 |
-
"
|
634 |
"西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
|
635 |
"早餐": ""
|
636 |
}
|
|
|
90 |
res = get_serp(query)
|
91 |
cond_res = get_condensed_result(res)
|
92 |
|
93 |
+
def compose_analysis( client, query, search_results, classes: list, model: str = 'gpt-3.5-turbo-0125'):
|
94 |
"""
|
95 |
Argument
|
96 |
query: str
|
|
|
103 |
messages=[
|
104 |
{
|
105 |
"role": "system",
|
106 |
+
"content": f'''
|
107 |
As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
|
108 |
+
your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be {",".join("`"+x+"`" for x in classes)}.
|
109 |
It's very important to omit unrelated results. Do not make up any assumption.
|
110 |
Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
|
111 |
If no relevant information has been found, simply output json with empty values.
|
|
|
366 |
print( f"total time: {time.time() - st}")
|
367 |
return crawled_results
|
368 |
|
369 |
+
def extract_results( data: pd.DataFrame, classes: list ):
|
370 |
"""
|
371 |
Argument
|
372 |
data: `evidence`, `result`
|
|
|
384 |
address = d[6]
|
385 |
query = compose_query( address, business_name)
|
386 |
try:
|
387 |
+
ana_res = compose_analysis( client, query = query, search_results = evidence, classes = classes)
|
388 |
ana_res = json.loads(ana_res)
|
389 |
except Exception as e:
|
390 |
print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
|
|
|
405 |
"empty_indices": empty_indices
|
406 |
}
|
407 |
|
408 |
+
def extract_results_mp( crawled_results, extracted_file_path, classes: list):
|
409 |
"""
|
410 |
Argument
|
411 |
Return
|
|
|
417 |
if not os.path.exists(extracted_file_path):
|
418 |
split_data = split_dataframe( crawled_results)
|
419 |
with mp.Pool(args.n_processes) as pool:
|
420 |
+
extracted_results = pool.starmap( extract_results, [ (x, classes) for x in split_data])
|
421 |
extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
|
422 |
with open( extracted_file_path, "wb") as f:
|
423 |
joblib.dump( extracted_results, f)
|
|
|
630 |
"西餐廳(含美式,義式,墨式)": "西式",
|
631 |
"中式": "中式",
|
632 |
"西式": "西式",
|
633 |
+
"西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)": "西式",
|
634 |
"西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
|
635 |
"早餐": ""
|
636 |
}
|