Hansimov committed on
Commit
eb0ce75
1 Parent(s): d3368e0

:boom: [Fix] SearchAPIApp: incorrect order of extracted contents to urls

Browse files
Files changed (1) hide show
  1. apis/search_api.py +15 -13
apis/search_api.py CHANGED
@@ -93,33 +93,35 @@ class SearchAPIApp:
93
  overwrite=overwrite_webpage_html,
94
  output_parent=query_search_results["query"],
95
  )
 
 
96
  html_paths = [
97
  str(url_and_html_path["html_path"])
98
  for url_and_html_path in url_and_html_path_list
99
  ]
100
-
101
- # Extract webpage contents from htmls
102
  batch_webpage_content_extractor = BatchWebpageContentExtractor()
103
  html_path_and_extracted_content_list = (
104
  batch_webpage_content_extractor.extract(html_paths)
105
  )
106
 
107
- # Write extracted contents (as 'text' field) to query_search_results
108
- url_and_extracted_content_dict = {}
109
-
110
- for item in url_and_html_path_list:
111
- url = item["url"]
112
- html_path = str(item["html_path"])
113
- extracted_content = html_path_and_extracted_content_list[
114
- html_paths.index(html_path)
115
- ]["extracted_content"]
116
- url_and_extracted_content_dict[url] = extracted_content
 
117
 
 
118
  for query_result_idx, query_result in enumerate(
119
  query_search_results["query_results"]
120
  ):
121
  url = query_result["url"]
122
- extracted_content = url_and_extracted_content_dict[url]
123
  queries_search_results[query_idx]["query_results"][query_result_idx][
124
  "text"
125
  ] = extracted_content
 
93
  overwrite=overwrite_webpage_html,
94
  output_parent=query_search_results["query"],
95
  )
96
+
97
+ # Extract webpage contents from htmls
98
  html_paths = [
99
  str(url_and_html_path["html_path"])
100
  for url_and_html_path in url_and_html_path_list
101
  ]
 
 
102
  batch_webpage_content_extractor = BatchWebpageContentExtractor()
103
  html_path_and_extracted_content_list = (
104
  batch_webpage_content_extractor.extract(html_paths)
105
  )
106
 
107
+ # Build the map of url to extracted_content
108
+ html_path_to_url_dict = {
109
+ str(url_and_html_path["html_path"]): url_and_html_path["url"]
110
+ for url_and_html_path in url_and_html_path_list
111
+ }
112
+ url_to_extracted_content_dict = {
113
+ html_path_to_url_dict[
114
+ html_path_and_extracted_content["html_path"]
115
+ ]: html_path_and_extracted_content["extracted_content"]
116
+ for html_path_and_extracted_content in html_path_and_extracted_content_list
117
+ }
118
 
119
+ # Write extracted contents (as 'text' field) to query_search_results
120
  for query_result_idx, query_result in enumerate(
121
  query_search_results["query_results"]
122
  ):
123
  url = query_result["url"]
124
+ extracted_content = url_to_extracted_content_dict[url]
125
  queries_search_results[query_idx]["query_results"][query_result_idx][
126
  "text"
127
  ] = extracted_content