Liyan06 committed
Commit 98d958b
1 Parent(s): e9622b9

generator is not JSON serializable; go back to previous version
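For context on the revert: Python's json module cannot encode generator objects, so a handler that returns a generator (as the streaming generate_sse() version below did) fails when the endpoint tries to serialize the response. A minimal illustrative snippet, not part of the commit:

import json

def progress_generator():
    # Stands in for the removed generate_sse(): a generator yielding progress strings
    yield 'Searching webpages...'
    yield 'Scoring webpages...'

try:
    json.dumps(progress_generator())
except TypeError as err:
    print(err)  # Object of type generator is not JSON serializable

# A plain dict, as in the restored handler, serializes without issue:
print(json.dumps({'ranked_docs': ['some document text'], 'scores': [0.97]}))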

Files changed (1)
handler.py +16 -24
handler.py CHANGED
@@ -1,6 +1,5 @@
 from minicheck_web.minicheck import MiniCheck
 from web_retrieval import *
-import json
 
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
@@ -32,47 +31,40 @@ class EndpointHandler():
             ranked_docs, scores = sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk)
 
             outputs = {
-                'ranked_docs': ranked_docs,
-                'scores': scores
-            }
-
-            return outputs
+                'ranked_docs': ranked_docs,
+                'scores': scores
+            }
 
         else:
             assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
 
             claim = data['inputs']['claims'][0]
-            progress_generator = self.search_relevant_docs(claim)
-
-            def generate_sse():
-                for progress in progress_generator:
-                    yield f"data: {progress}\n"
-                ranked_docs, scores, ranked_urls = progress_generator.send(None)
-                outputs = {
-                    'ranked_docs': ranked_docs,
-                    'scores': scores,
-                    'ranked_urls': ranked_urls
-                }
-                yield f"data: {json.dumps(outputs)}\n"
-
-            return generate_sse()
+            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
 
+            outputs = {
+                'ranked_docs': ranked_docs,
+                'scores': scores,
+                'ranked_urls': ranked_urls
+            }
+
+        return outputs
+
 
     def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
 
         search_results = search_google(claim, timeout=timeout)
 
-        yield 'Searching webpages...'
+        print('Searching webpages...')
         start = time()
         with concurrent.futures.ThreadPoolExecutor() as e:
             scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
         end = time()
-        yield f"Finished searching in {round((end - start), 1)} seconds."
+        print(f"Finished searching in {round((end - start), 1)} seconds.\n")
         scraped_results = [(r[0][:50000], r[1]) for r in scraped_results if r[0] and '��' not in r[0] and ".pdf" not in r[1]]
 
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
 
-        yield 'Scoring webpages...'
+        print('Scoring webpages...')
         start = time()
         retrieved_data = {
             'inputs': {
@@ -83,7 +75,7 @@ class EndpointHandler():
         _, _, used_chunk, support_prob_per_chunk = self.scorer.score(data=retrieved_data)
         end = time()
         num_chunks = len([item for items in used_chunk for item in items])
-        yield f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).'
+        print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
 
         ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
 
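A rough usage sketch of the restored, non-streaming interface. The payload and output keys ('claims', 'ranked_docs', 'scores', 'ranked_urls') come from the diff above; constructing EndpointHandler with no arguments and the example claim are assumptions, since __init__ is not shown in this commit.

from handler import EndpointHandler

handler = EndpointHandler()  # assumed default construction; __init__ is not part of this diff

data = {
    'inputs': {
        'claims': ['The Eiffel Tower is located in Paris.']  # exactly one claim is allowed for web retrieval
    }
}

outputs = handler(data)  # plain dict: JSON-serializable, unlike the reverted generator
for url, score in zip(outputs['ranked_urls'], outputs['scores']):
    print(round(score, 3), url)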