linpershey committed on
Commit
7cfd43a
1 Parent(s): c1ead4a

fix wrong prompt format

Browse files
Files changed (2) hide show
  1. app.py +2 -1
  2. sheet.py +17 -12
app.py CHANGED
@@ -17,6 +17,7 @@ load_dotenv()
17
  logger = logging.getLogger(__name__)
18
  logger.setLevel(logging.DEBUG)
19
 
 
20
 
21
  def plot_wordcloud( text):
22
  """
@@ -71,7 +72,7 @@ def do( business_id, business_name, address):
71
 
72
  crawled_results = pd.DataFrame(crawled_results)
73
  # logger.debug(crawled_results)
74
- extracted_results = extract_results( crawled_results)
75
  # logger.error(extracted_results['extracted_results'].columns)
76
  extracted_results = extracted_results['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name'] ]
77
 
 
17
  logger = logging.getLogger(__name__)
18
  logger.setLevel(logging.DEBUG)
19
 
20
+ classes = list([ x for x in category2supercategory.keys() if len(x)>0])
21
 
22
  def plot_wordcloud( text):
23
  """
 
72
 
73
  crawled_results = pd.DataFrame(crawled_results)
74
  # logger.debug(crawled_results)
75
+ extracted_results = extract_results( crawled_results, classes=classes)
76
  # logger.error(extracted_results['extracted_results'].columns)
77
  extracted_results = extracted_results['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name'] ]
78
 
sheet.py CHANGED
@@ -99,18 +99,22 @@ def compose_analysis( client, query, search_results, classes: list, model: str =
99
  Return
100
  response: str
101
  """
102
- chat_completion = client.chat.completions.create(
103
- messages=[
104
- {
105
- "role": "system",
106
- "content": f'''
107
  As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
108
- your task is to first identify relevant information of the identical store based on store name and proxmity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be {",".join("`"+x+"`" for x in classes)}.
109
  It's very important to omit unrelated results. Do not make up any assumption.
110
  Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
111
  If no relevant information has been found, simply output json with empty values.
112
  I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
113
  '''
 
 
 
 
 
 
114
  },
115
  {
116
  "role": "user",
@@ -150,7 +154,7 @@ def test_compose_analysis():
150
  def compose_classication(
151
  client,
152
  evidence,
153
- classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
154
  backup_classes: list = [ '中式', '西式'],
155
  model: str = 'gpt-3.5-turbo-0125'
156
  ) -> str:
@@ -382,12 +386,13 @@ def extract_results( data: pd.DataFrame, classes: list ):
382
  business_id = d[2]
383
  business_name = d[3]
384
  address = d[6]
 
385
  query = compose_query( address, business_name)
386
  try:
387
  ana_res = compose_analysis( client, query = query, search_results = evidence, classes = classes)
388
  ana_res = json.loads(ana_res)
389
  except Exception as e:
390
- print(f"# ANALYSIS error {e}: i = {i}, evidence = {evidence}")
391
  empty_indices.append(i)
392
  continue
393
 
@@ -630,8 +635,8 @@ category2supercategory = {
630
  "西餐廳(含美式,義式,墨式)": "西式",
631
  "中式": "中式",
632
  "西式": "西式",
633
- "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar": "西式",
634
- "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
635
  "早餐": ""
636
  }
637
 
@@ -647,7 +652,7 @@ supercategory2category = {
647
  "燒烤",
648
  "韓式料理(含火鍋,烤肉)"
649
  ],
650
- "西式": ["西餐廳(含美式,義式,墨式)", "西餐廳(餐酒館、酒吧、標吧、pub、lounge bar", "西餐廳(土耳其、漢堡、法式、歐式、印度)"],
651
  "": ["早餐"]
652
  }
653
 
@@ -671,7 +676,7 @@ if __name__=='__main__':
671
  parser.add_argument("--combined_file_path", type=str, default="data/gpt3.5/combined_results.joblib")
672
  parser.add_argument("--postprocessed_results", type=str, default="data/gpt3.5/postprocessed_results.joblib")
673
  parser.add_argument("--formatted_results", type=str, default="data/gpt3.5/formatted_results.csv")
674
- parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
675
  parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
676
  parser.add_argument("--strategy", type=str, default='replace', choices=['replace', 'patch'])
677
  parser.add_argument("--n_processes", type=int, default=4)
 
99
  Return
100
  response: str
101
  """
102
+ categories = ", ".join([ "`"+x+"`" for x in classes if x!='早餐' ])+ " or " + "`早餐`"
103
+ # print(f"categoreis: {categories}")
104
+ system_prompt = '''
 
 
105
  As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
106
+ your task is to first identify relevant information of the identical store based on store name and proxmity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)`, `西餐廳(含美式,義式,墨式)`, `西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`, `西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)` or `早餐`.
107
  It's very important to omit unrelated results. Do not make up any assumption.
108
  Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
109
  If no relevant information has been found, simply output json with empty values.
110
  I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
111
  '''
112
+ # print(f"system prompt = {system_prompt}")
113
+ chat_completion = client.chat.completions.create(
114
+ messages=[
115
+ {
116
+ "role": "system",
117
+ "content": system_prompt
118
  },
119
  {
120
  "role": "user",
 
154
  def compose_classication(
155
  client,
156
  evidence,
157
+ classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', ],
158
  backup_classes: list = [ '中式', '西式'],
159
  model: str = 'gpt-3.5-turbo-0125'
160
  ) -> str:
 
386
  business_id = d[2]
387
  business_name = d[3]
388
  address = d[6]
389
+ ana_res = None
390
  query = compose_query( address, business_name)
391
  try:
392
  ana_res = compose_analysis( client, query = query, search_results = evidence, classes = classes)
393
  ana_res = json.loads(ana_res)
394
  except Exception as e:
395
+ print(f"# ANALYSIS error {e}: i = {i}, ana_res = {ana_res}")
396
  empty_indices.append(i)
397
  continue
398
 
 
635
  "西餐廳(含美式,義式,墨式)": "西式",
636
  "中式": "中式",
637
  "西式": "西式",
638
+ "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)": "西式",
639
+ "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
640
  "早餐": ""
641
  }
642
 
 
652
  "燒烤",
653
  "韓式料理(含火鍋,烤肉)"
654
  ],
655
+ "西式": ["西餐廳(含美式,義式,墨式)", "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)", "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)"],
656
  "": ["早餐"]
657
  }
658
 
 
676
  parser.add_argument("--combined_file_path", type=str, default="data/gpt3.5/combined_results.joblib")
677
  parser.add_argument("--postprocessed_results", type=str, default="data/gpt3.5/postprocessed_results.joblib")
678
  parser.add_argument("--formatted_results", type=str, default="data/gpt3.5/formatted_results.csv")
679
+ parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', '西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)', '西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)', '早餐'])
680
  parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
681
  parser.add_argument("--strategy", type=str, default='replace', choices=['replace', 'patch'])
682
  parser.add_argument("--n_processes", type=int, default=4)