Chenxi Whitehouse committed on
Commit
17da9c4
1 Parent(s): 2b35800

update file

Browse files
README.md CHANGED
@@ -41,5 +41,5 @@ python -m src.reranking.bm25_sentences
41
  ### Generate questions for each evidence sentence
42
  We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
43
  ```
44
- python -m retrieval_reranking.question_generation_top_sentences
45
  ```
 
41
  ### Generate questions for each evidence sentence
42
  We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
43
  ```
44
+ python -m src.reranking.question_generation_top_sentences
45
  ```
src/reranking/bm25_sentences.py CHANGED
@@ -115,3 +115,4 @@ if __name__ == "__main__":
115
  }
116
  output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
117
  done += 1
 
 
115
  }
116
  output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
117
  done += 1
118
+ output_file.flush()
src/reranking/question_generation_top_sentences.py CHANGED
@@ -113,16 +113,12 @@ if __name__ == "__main__":
113
  prompt_lookup_str = sentences_urls["sentence"]
114
  url = sentences_urls["url"]
115
 
116
- st = time.time()
117
  prompt_s = prompt_bm25.get_scores(
118
  nltk.word_tokenize(prompt_lookup_str)
119
  )
120
  prompt_n = 10
121
  prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
122
  prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
123
- print(
124
- f"Got top 100 prompt for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
125
- )
126
 
127
  claim_prompt = (
128
  "Evidence: "
@@ -135,7 +131,7 @@ if __name__ == "__main__":
135
  inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
136
  model.device
137
  )
138
-
139
  outputs = model.generate(
140
  inputs["input_ids"],
141
  max_length=5000,
@@ -143,6 +139,9 @@ if __name__ == "__main__":
143
  no_repeat_ngram_size=2,
144
  early_stopping=True,
145
  )
 
 
 
146
 
147
  tgt_text = tokenizer.batch_decode(
148
  outputs[:, inputs["input_ids"].shape[-1] :],
@@ -165,7 +164,5 @@ if __name__ == "__main__":
165
  "claim": claim,
166
  "bm25_qau": bm25_qau,
167
  }
168
- output_file.write(
169
- json.dumps(json_data, ensure_ascii=False, indent=4) + "\n"
170
- )
171
  output_file.flush()
 
113
  prompt_lookup_str = sentences_urls["sentence"]
114
  url = sentences_urls["url"]
115
 
 
116
  prompt_s = prompt_bm25.get_scores(
117
  nltk.word_tokenize(prompt_lookup_str)
118
  )
119
  prompt_n = 10
120
  prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
121
  prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
 
 
 
122
 
123
  claim_prompt = (
124
  "Evidence: "
 
131
  inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
132
  model.device
133
  )
134
+ st = time.time()
135
  outputs = model.generate(
136
  inputs["input_ids"],
137
  max_length=5000,
 
139
  no_repeat_ngram_size=2,
140
  early_stopping=True,
141
  )
142
+ print(
143
+ f"Generated QA for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
144
+ )
145
 
146
  tgt_text = tokenizer.batch_decode(
147
  outputs[:, inputs["input_ids"].shape[-1] :],
 
164
  "claim": claim,
165
  "bm25_qau": bm25_qau,
166
  }
167
+ output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
 
 
168
  output_file.flush()