chenxwh
/

AVeriTeC

Model card Files Files and versions Community

Chenxi Whitehouse committed on Apr 8

Commit

17da9c4

•

1 Parent(s): 2b35800

update file

Browse files

Files changed (3) hide show

README.md +1 -1
src/reranking/bm25_sentences.py +1 -0
src/reranking/question_generation_top_sentences.py +5 -8

README.md CHANGED Viewed

@@ -41,5 +41,5 @@ python -m src.reranking.bm25_sentences
 ### Generate questions for each evidence sentence
 We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
 ```
-python -m retrieval_reranking.question_generation_top_sentences
 ```

 ### Generate questions for each evidence sentence
 We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
 ```
+python -m src.reranking.question_generation_top_sentences
 ```

src/reranking/bm25_sentences.py CHANGED Viewed

@@ -115,3 +115,4 @@ if __name__ == "__main__":
                 }
                 output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                 done += 1

                 }
                 output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                 done += 1
+                output_file.flush()

src/reranking/question_generation_top_sentences.py CHANGED Viewed

@@ -113,16 +113,12 @@ if __name__ == "__main__":
                     prompt_lookup_str = sentences_urls["sentence"]
                     url = sentences_urls["url"]
-                    st = time.time()
                     prompt_s = prompt_bm25.get_scores(
                         nltk.word_tokenize(prompt_lookup_str)
                     )
                     prompt_n = 10
                     prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
                     prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
-                    print(
-                        f"Got top 100 prompt for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
-                    )
                     claim_prompt = (
                         "Evidence: "
@@ -135,7 +131,7 @@ if __name__ == "__main__":
                     inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
                         model.device
                     )
                     outputs = model.generate(
                         inputs["input_ids"],
                         max_length=5000,
@@ -143,6 +139,9 @@ if __name__ == "__main__":
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                     )
                     tgt_text = tokenizer.batch_decode(
                         outputs[:, inputs["input_ids"].shape[-1] :],
@@ -165,7 +164,5 @@ if __name__ == "__main__":
                     "claim": claim,
                     "bm25_qau": bm25_qau,
                 }
-                output_file.write(
-                    json.dumps(json_data, ensure_ascii=False, indent=4) + "\n"
-                )
                 output_file.flush()

                     prompt_lookup_str = sentences_urls["sentence"]
                     url = sentences_urls["url"]
                     prompt_s = prompt_bm25.get_scores(
                         nltk.word_tokenize(prompt_lookup_str)
                     )
                     prompt_n = 10
                     prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
                     prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
                     claim_prompt = (
                         "Evidence: "
                     inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
                         model.device
                     )
+                    st = time.time()
                     outputs = model.generate(
                         inputs["input_ids"],
                         max_length=5000,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                     )
+                    print(
+                        f"Generated QA for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
+                    )
                     tgt_text = tokenizer.batch_decode(
                         outputs[:, inputs["input_ids"].shape[-1] :],
                     "claim": claim,
                     "bm25_qau": bm25_qau,
                 }
+                output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                 output_file.flush()