Chenxi Whitehouse
committed on
Commit
•
17da9c4
1
Parent(s):
2b35800
update file
Browse files
README.md
CHANGED
@@ -41,5 +41,5 @@ python -m src.reranking.bm25_sentences
|
|
41 |
### Generate questions for each evidence sentence
|
42 |
We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
|
43 |
```
|
44 |
-
python -m
|
45 |
```
|
|
|
41 |
### Generate questions for each evidence sentence
|
42 |
We use [BLOOM](https://huggingface.co/bigscience/bloom-7b1) to generate questions for each evidence sentence using the closest examples from the training set. See [question_generation_top_sentences.py](https://huggingface.co/chenxwh/AVeriTeC/blob/main/src/reranking/question_generation_top_sentences.py) for more argument options.
|
43 |
```
|
44 |
+
python -m src.reranking.question_generation_top_sentences
|
45 |
```
|
src/reranking/bm25_sentences.py
CHANGED
@@ -115,3 +115,4 @@ if __name__ == "__main__":
|
|
115 |
}
|
116 |
output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
|
117 |
done += 1
|
|
|
|
115 |
}
|
116 |
output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
|
117 |
done += 1
|
118 |
+
output_file.flush()
|
src/reranking/question_generation_top_sentences.py
CHANGED
@@ -113,16 +113,12 @@ if __name__ == "__main__":
|
|
113 |
prompt_lookup_str = sentences_urls["sentence"]
|
114 |
url = sentences_urls["url"]
|
115 |
|
116 |
-
st = time.time()
|
117 |
prompt_s = prompt_bm25.get_scores(
|
118 |
nltk.word_tokenize(prompt_lookup_str)
|
119 |
)
|
120 |
prompt_n = 10
|
121 |
prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
|
122 |
prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
|
123 |
-
print(
|
124 |
-
f"Got top 100 prompt for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
|
125 |
-
)
|
126 |
|
127 |
claim_prompt = (
|
128 |
"Evidence: "
|
@@ -135,7 +131,7 @@ if __name__ == "__main__":
|
|
135 |
inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
|
136 |
model.device
|
137 |
)
|
138 |
-
|
139 |
outputs = model.generate(
|
140 |
inputs["input_ids"],
|
141 |
max_length=5000,
|
@@ -143,6 +139,9 @@ if __name__ == "__main__":
|
|
143 |
no_repeat_ngram_size=2,
|
144 |
early_stopping=True,
|
145 |
)
|
|
|
|
|
|
|
146 |
|
147 |
tgt_text = tokenizer.batch_decode(
|
148 |
outputs[:, inputs["input_ids"].shape[-1] :],
|
@@ -165,7 +164,5 @@ if __name__ == "__main__":
|
|
165 |
"claim": claim,
|
166 |
"bm25_qau": bm25_qau,
|
167 |
}
|
168 |
-
output_file.write(
|
169 |
-
json.dumps(json_data, ensure_ascii=False, indent=4) + "\n"
|
170 |
-
)
|
171 |
output_file.flush()
|
|
|
113 |
prompt_lookup_str = sentences_urls["sentence"]
|
114 |
url = sentences_urls["url"]
|
115 |
|
|
|
116 |
prompt_s = prompt_bm25.get_scores(
|
117 |
nltk.word_tokenize(prompt_lookup_str)
|
118 |
)
|
119 |
prompt_n = 10
|
120 |
prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
|
121 |
prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
|
|
|
|
|
|
|
122 |
|
123 |
claim_prompt = (
|
124 |
"Evidence: "
|
|
|
131 |
inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
|
132 |
model.device
|
133 |
)
|
134 |
+
st = time.time()
|
135 |
outputs = model.generate(
|
136 |
inputs["input_ids"],
|
137 |
max_length=5000,
|
|
|
139 |
no_repeat_ngram_size=2,
|
140 |
early_stopping=True,
|
141 |
)
|
142 |
+
print(
|
143 |
+
f"Generated QA for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}"
|
144 |
+
)
|
145 |
|
146 |
tgt_text = tokenizer.batch_decode(
|
147 |
outputs[:, inputs["input_ids"].shape[-1] :],
|
|
|
164 |
"claim": claim,
|
165 |
"bm25_qau": bm25_qau,
|
166 |
}
|
167 |
+
output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
|
|
|
|
|
168 |
output_file.flush()
|