linpershey commited on
Commit
c1ead4a
1 Parent(s): 6023585

fix prompt mistakes

Browse files
Files changed (1) hide show
  1. sheet.py +8 -8
sheet.py CHANGED
@@ -90,7 +90,7 @@ def test_get_condensed_result():
90
  res = get_serp(query)
91
  cond_res = get_condensed_result(res)
92
 
93
- def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo-0125'):
94
  """
95
  Argument
96
  query: str
@@ -103,9 +103,9 @@ def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo
103
  messages=[
104
  {
105
  "role": "system",
106
- "content": '''
107
  As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
108
- your task is to first identify relevant information of the identical store based on store name and proxmity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)` or `西餐廳(含美式,義式,墨式)`.
109
  It's very important to omit unrelated results. Do not make up any assumption.
110
  Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
111
  If no relevant information has been found, simply output json with empty values.
@@ -366,7 +366,7 @@ def crawl_results_mp( data: pd.DataFrame, crawl_file_path: str, n_processes: int
366
  print( f"total time: {time.time() - st}")
367
  return crawled_results
368
 
369
- def extract_results( data: pd.DataFrame ):
370
  """
371
  Argument
372
  data: `evidence`, `result`
@@ -384,7 +384,7 @@ def extract_results( data: pd.DataFrame ):
384
  address = d[6]
385
  query = compose_query( address, business_name)
386
  try:
387
- ana_res = compose_analysis( client, query = query, search_results = evidence)
388
  ana_res = json.loads(ana_res)
389
  except Exception as e:
390
  print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
@@ -405,7 +405,7 @@ def extract_results( data: pd.DataFrame ):
405
  "empty_indices": empty_indices
406
  }
407
 
408
- def extract_results_mp( crawled_results, extracted_file_path):
409
  """
410
  Argument
411
  Return
@@ -417,7 +417,7 @@ def extract_results_mp( crawled_results, extracted_file_path):
417
  if not os.path.exists(extracted_file_path):
418
  split_data = split_dataframe( crawled_results)
419
  with mp.Pool(args.n_processes) as pool:
420
- extracted_results = pool.map( extract_results, split_data)
421
  extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
422
  with open( extracted_file_path, "wb") as f:
423
  joblib.dump( extracted_results, f)
@@ -630,7 +630,7 @@ category2supercategory = {
630
  "西餐廳(含美式,義式,墨式)": "西式",
631
  "中式": "中式",
632
  "西式": "西式",
633
- "西餐廳(餐酒館、酒吧、標吧、pub、lounge bar)": "西式",
634
  "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
635
  "早餐": ""
636
  }
 
90
  res = get_serp(query)
91
  cond_res = get_condensed_result(res)
92
 
93
+ def compose_analysis( client, query, search_results, classes: list, model: str = 'gpt-3.5-turbo-0125'):
94
  """
95
  Argument
96
  query: str
 
103
  messages=[
104
  {
105
  "role": "system",
106
+ "content": f'''
107
  As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
108
+ your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be {",".join("`"+x+"`" for x in classes)}.
109
  It's very important to omit unrelated results. Do not make up any assumption.
110
  Please think step by step, and output in json format. An example output json is like {{"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}}
111
  If no relevant information has been found, simply output json with empty values.
 
366
  print( f"total time: {time.time() - st}")
367
  return crawled_results
368
 
369
+ def extract_results( data: pd.DataFrame, classes: list ):
370
  """
371
  Argument
372
  data: `evidence`, `result`
 
384
  address = d[6]
385
  query = compose_query( address, business_name)
386
  try:
387
+ ana_res = compose_analysis( client, query = query, search_results = evidence, classes = classes)
388
  ana_res = json.loads(ana_res)
389
  except Exception as e:
390
  print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
 
405
  "empty_indices": empty_indices
406
  }
407
 
408
+ def extract_results_mp( crawled_results, extracted_file_path, classes: list):
409
  """
410
  Argument
411
  Return
 
417
  if not os.path.exists(extracted_file_path):
418
  split_data = split_dataframe( crawled_results)
419
  with mp.Pool(args.n_processes) as pool:
420
+ extracted_results = pool.starmap( extract_results, [ (x, classes) for x in split_data])
421
  extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
422
  with open( extracted_file_path, "wb") as f:
423
  joblib.dump( extracted_results, f)
 
630
  "西餐廳(含美式,義式,墨式)": "西式",
631
  "中式": "中式",
632
  "西式": "西式",
633
+ "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)": "西式",
634
  "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
635
  "早餐": ""
636
  }