linpershey committed on
Commit
7c9c311
1 Parent(s): a5003e3

add model param, -inurl to exclude unhelpful websites

Browse files
Files changed (1) hide show
  1. sheet.py +17 -12
sheet.py CHANGED
@@ -90,11 +90,12 @@ def test_get_condensed_result():
90
  res = get_serp(query)
91
  cond_res = get_condensed_result(res)
92
 
93
- def compose_analysis( client, query, search_results):
94
  """
95
  Argument
96
  query: str
97
  search_results: str
 
98
  Return
99
  response: str
100
  """
@@ -119,7 +120,7 @@ def compose_analysis( client, query, search_results):
119
  ''',
120
  }
121
  ],
122
- model = "gpt-4-0125-preview",
123
  response_format = {"type": "json_object"},
124
  temperature = 0,
125
  # stream = True
@@ -151,12 +152,14 @@ def compose_classication(
151
  evidence,
152
  classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
153
  backup_classes: list = [ '中式', '西式'],
 
154
  ) -> str:
155
  """
156
  Argument
157
  client:
158
  evidence: str
159
  classes: list
 
160
  Return
161
  response: str
162
  """
@@ -187,7 +190,7 @@ def compose_classication(
187
  ''',
188
  }
189
  ],
190
- model = "gpt-4-0125-preview",
191
  response_format = {"type": "json_object"},
192
  temperature = 0,
193
  # stream = True
@@ -278,7 +281,7 @@ def test_get_evidence_classification():
278
  analysis_results = classify_results( analysis_results)
279
  patch_analysis_results = classify_results( patch_analysis_results)
280
 
281
- def compose_query( address, name, with_index: bool = True):
282
  """
283
  Argumemnt
284
  # d: series with d[1]: 地址, d[4]: 營業人名稱 #
@@ -292,7 +295,7 @@ def compose_query( address, name, with_index: bool = True):
292
  # query = f"{d[1][:3]} {d[4]}"
293
  # else:
294
  # query = f"{d[0][:3]} {d[3]}"
295
- query = f"{address[:3]} {name}"
296
  return query
297
 
298
  def crawl_results( data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
@@ -563,7 +566,7 @@ def main(args):
563
  """
564
 
565
  ## 讀取資料名單 ##
566
- data = get_leads(args.data_path)
567
 
568
  ## 進行爬蟲與分析 ##
569
  # crawled_results = crawl_results(data)
@@ -611,7 +614,7 @@ def main(args):
611
  )
612
 
613
  formatted_results = format_output( postprossed_results, input_column = 'evidence', output_column = 'formatted_evidence', format_func = format_evidence)
614
- formatted_results.to_csv("data/formatted_results.csv", index=False)
615
 
616
 
617
  category2supercategory = {
@@ -658,13 +661,15 @@ if __name__=='__main__':
658
 
659
  parser = argparse.ArgumentParser()
660
  parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
661
- parser.add_argument("--classified_file_path", type=str, default="data/classified_results.joblib")
662
- parser.add_argument("--extracted_file_path", type=str, default="data/extracted_results.joblib")
663
- parser.add_argument("--crawled_file_path", type=str, default="data/crawled_results.joblib")
664
- parser.add_argument("--combined_file_path", type=str, default="data/combined_results.joblib")
665
- parser.add_argument("--postprocessed_results", type=str, default="data/postprocessed_results.joblib")
 
666
  parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
667
  parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
 
668
  parser.add_argument("--n_processes", type=int, default=4)
669
  args = parser.parse_args()
670
 
 
90
  res = get_serp(query)
91
  cond_res = get_condensed_result(res)
92
 
93
+ def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo-0125'):
94
  """
95
  Argument
96
  query: str
97
  search_results: str
98
+ model: "gpt-4-0125-preview" or 'gpt-3.5-turbo-0125'
99
  Return
100
  response: str
101
  """
 
120
  ''',
121
  }
122
  ],
123
+ model = model,
124
  response_format = {"type": "json_object"},
125
  temperature = 0,
126
  # stream = True
 
152
  evidence,
153
  classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
154
  backup_classes: list = [ '中式', '西式'],
155
+ model: str = 'gpt-3.5-turbo-0125'
156
  ) -> str:
157
  """
158
  Argument
159
  client:
160
  evidence: str
161
  classes: list
162
+ model: 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview'
163
  Return
164
  response: str
165
  """
 
190
  ''',
191
  }
192
  ],
193
+ model = model,
194
  response_format = {"type": "json_object"},
195
  temperature = 0,
196
  # stream = True
 
281
  analysis_results = classify_results( analysis_results)
282
  patch_analysis_results = classify_results( patch_analysis_results)
283
 
284
+ def compose_query( address, name, with_index: bool = True, exclude: str = "-inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw"):
285
  """
286
  Argumemnt
287
  # d: series with d[1]: 地址, d[4]: 營業人名稱 #
 
295
  # query = f"{d[1][:3]} {d[4]}"
296
  # else:
297
  # query = f"{d[0][:3]} {d[3]}"
298
+ query = f"{address[:3]} {name} {exclude}"
299
  return query
300
 
301
  def crawl_results( data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
 
566
  """
567
 
568
  ## 讀取資料名單 ##
569
+ data = get_leads(args.data_path).head(20)
570
 
571
  ## 進行爬蟲與分析 ##
572
  # crawled_results = crawl_results(data)
 
614
  )
615
 
616
  formatted_results = format_output( postprossed_results, input_column = 'evidence', output_column = 'formatted_evidence', format_func = format_evidence)
617
+ formatted_results.to_csv( args.formatted_results, index=False)
618
 
619
 
620
  category2supercategory = {
 
661
 
662
  parser = argparse.ArgumentParser()
663
  parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
664
+ parser.add_argument("--classified_file_path", type=str, default="data/gpt3.5/classified_results.joblib")
665
+ parser.add_argument("--extracted_file_path", type=str, default="data/gpt3.5/extracted_results.joblib")
666
+ parser.add_argument("--crawled_file_path", type=str, default="data/gpt3.5/crawled_results.joblib")
667
+ parser.add_argument("--combined_file_path", type=str, default="data/gpt3.5/combined_results.joblib")
668
+ parser.add_argument("--postprocessed_results", type=str, default="data/gpt3.5/postprocessed_results.joblib")
669
+ parser.add_argument("--formatted_results", type=str, default="data/gpt3.5/formatted_results.csv")
670
  parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
671
  parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
672
+ parser.add_argument("--strategy", type=str, default='replace', choices=['replace', 'patch'])
673
  parser.add_argument("--n_processes", type=int, default=4)
674
  args = parser.parse_args()
675