Spaces:
Sleeping
Sleeping
| import json | |
| from src.rag.retrieval import DocDB | |
| from src.utils.string_utils import extract_tag_content | |
| from src.data_processor.raw_data_processor import DatasetProcessor | |
class HotpotQAProcessor(DatasetProcessor):
    """Dataset processor for HotpotQA-style query files.

    Converts raw JSON-lines records into a trimmed query structure and
    collects the documents referenced by each query's provenance from a
    ``DocDB``.
    """

    def process_queries(self, input_file: str) -> list:
        """Parse a JSON-lines file into a list of query dicts.

        Each non-empty line of ``input_file`` must be a JSON object with an
        ``"input"`` field and an ``"output"`` list; only the first element
        of ``"output"`` is used.

        Args:
            input_file: Path to the JSON-lines file (read as UTF-8).

        Returns:
            A list of dicts shaped as
            ``{"input": ..., "output": {"answer": ..., "provenance": [...]}}``
            where every provenance entry holds ``"title"`` and an integer
            ``"wikipedia_id"``.

        Raises:
            json.JSONDecodeError: If a non-empty line is not valid JSON.
            KeyError, IndexError: If a record lacks the expected fields.
            ValueError: If a ``wikipedia_id`` cannot be converted to int.
        """
        queries = []
        with open(input_file, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no record; json.loads would raise on them.
                if not line:
                    continue
                data = json.loads(line)
                first_output = data["output"][0]
                queries.append(
                    {
                        "input": data["input"],
                        "output": {
                            "answer": first_output["answer"],
                            "provenance": [
                                {
                                    "title": item["title"],
                                    "wikipedia_id": int(item["wikipedia_id"]),
                                }
                                for item in first_output["provenance"]
                            ],
                        },
                    }
                )
        return queries

    def process_documents(
        self, query_file: str, db: DocDB, queries: list = None, **kwargs
    ) -> dict:
        """Collect the documents referenced by the queries' provenance.

        Args:
            query_file: Path to a JSON file holding the queries; only read
                when ``queries`` is ``None``.
            db: Document store used to look up text by title.
            queries: Optional pre-loaded (e.g. sampled) queries, in the
                shape produced by ``process_queries``. When given, they are
                used instead of the contents of ``query_file``.
            **kwargs: Accepted but not used by this method.

        Returns:
            A dict mapping each distinct provenance title to the value
            returned by ``_get_documents_per_query`` for that title.
        """
        documents = {}
        # If sampled queries are provided, use them instead of the queries in
        # the query_file.
        # NOTE(review): the original comment here said the query file is
        # mandatory "for medlfqa"; that looks carried over from another
        # processor -- confirm whether it applies to HotpotQA.
        if queries is None:
            with open(query_file, "r", encoding="utf-8") as jsonfile:
                queries = json.load(jsonfile)

        for query in queries:
            for provenance in query["output"]["provenance"]:
                title = provenance["title"]
                # Look each title up only once, even if several queries
                # reference it.
                if title not in documents:
                    documents[title] = self._get_documents_per_query(title, db)
        return documents

    def _get_documents_per_query(self, title: str, db: DocDB) -> list:
        """Return the documents stored in ``db`` under ``title``.

        The ``"text"`` fields of all entries returned by
        ``db.get_text_from_title(title)`` are concatenated and passed to
        ``extract_tag_content``, whose result is returned as-is.

        Any exception raised during lookup or extraction is reported on
        stdout and an empty list is returned instead (best-effort lookup).
        """
        try:
            docs = db.get_text_from_title(title)
            # Join once instead of growing a string with += in a loop.
            contents = "".join(data["text"] for data in docs)
            return extract_tag_content(contents)
        except Exception as e:
            print(f"Error retrieving documents for title {title}: {e}")
            return []