# RobustRAG / scripts / dataset.py
# AaronCIH's picture
# Upload folder using huggingface_hub
# a8c50ce verified
# datasets
"""
# load datasets(train)的方法:
from datasets import load_dataset
db = load_dataset(...)["train"]
for x in db:
# x 是一個 set{}, , e.g.
# {"corpus-id": "6519.png", "image": <PIL.PngImagePlugin.PngImageFile\
# image mode=RGBA size=1263x700 at 0x7F0303CD6AD0>}
...
## load datasets(test)的方法:
from datasets import load_dataset
dbcorpus = load_dataset(..., "corpus")["train"]
dbqrels = load_dataset(..., "qrels")["train"]
dbqueries = load_dataset(..., "queries")["train"]
## 如果是圖片集合
for x in dbcorpus:
# x 是一個 set{}, , e.g.
# {"corpus-id": "圖片的id", "image": <PIL.PngImagePlugin.PngImageFile\
# image mode=RGBA size=1263x700 at 0x7F0303CD6AD0>}
...
for x in dbqrels:
# x 是一個 set{}, , e.g.
# {"query-id": "問題的id", "corpus-id": "圖片的id",}
...
for x in dbqueries:
# x 是一個 set{}, , e.g.
# {"query-id": "問題的id", "query": "問題", "answer":"問題的答案"}
...
## 如果是OCR資料集
for x in dbcorpus:
# x 是一個 set{}, , e.g.
# {"corpus-id": "6519.png", "text": "string to describe a photo"}
...
for x in dbqrels:
# x 是一個 set{}, , e.g.
# {"query-id": "問題的id", "corpus-id": "圖片的id",}
...
for x in dbqueries:
# x 是一個 set{}, , e.g.
# {"query-id": "問題的id", "query": "問題", "answer":"問題的答案"}
...
"""
from datasets import load_dataset
# Directory passed as `cache_dir` to every `load_dataset` call in this script.
save_root = r"/group-volume/Human-Action-Analysis/users/hsiang.chen/Robust/datasets/"

# Train datasets (only the "train" split is accessed):
#   1. In-domain data (arxiv, plotqa, ...), about 122k samples.
#   2. Synthetic data, about 239k samples.
for _train_repo in (
    "openbmb/VisRAG-Ret-Train-In-domain-data",
    "openbmb/VisRAG-Ret-Train-Synthetic-data",
):
    load_dataset(_train_repo, cache_dir=save_root)["train"]
# Test datasets: every test dataset is divided into three configs ("corpus",
# "qrels", "queries") and exists in an image version and an OCR version.
#
# Image version -- clean benchmarks.
for _clean_repo in (
    "openbmb/VisRAG-Ret-Test-PlotQA",     # clean PlotQA
    "openbmb/VisRAG-Ret-Test-SlideVQA",   # clean SlideVQA
    "openbmb/VisRAG-Ret-Test-InfoVQA",    # clean InfoVQA
    "openbmb/VisRAG-Ret-Test-ArxivQA",    # clean ArxivQA
    "openbmb/VisRAG-Ret-Test-ChartQA",    # clean ChartQA
    "openbmb/VisRAG-Ret-Test-MP-DocVQA",  # clean MP-DocVQA
):
    # Fetch the three configs in a fixed order: corpus, qrels, queries.
    for _config in ("corpus", "qrels", "queries"):
        load_dataset(_clean_repo, _config, cache_dir=save_root)["train"]
# Image version -- synthetically degraded benchmarks.
# Only the corpus comes from the degraded repo; "qrels" and "queries" are
# shared with the corresponding clean benchmark.
for _deg_corpus_repo, _shared_repo in (
    ("rweics5cs7/exo3-original-PlotQA-deg", "openbmb/VisRAG-Ret-Test-PlotQA"),
    ("rweics5cs7/exo3-original-SlideVQA-deg", "openbmb/VisRAG-Ret-Test-SlideVQA"),
    ("rweics5cs7/exo3-original-InfoVQA-deg", "openbmb/VisRAG-Ret-Test-InfoVQA"),
    ("rweics5cs7/exo3-original-ArxivQA-deg", "openbmb/VisRAG-Ret-Test-ArxivQA"),
    ("rweics5cs7/exo3-original-ChartQA-deg", "openbmb/VisRAG-Ret-Test-ChartQA"),
    ("rweics5cs7/exo3-original-MP-DocVQA-deg", "openbmb/VisRAG-Ret-Test-MP-DocVQA"),
):
    load_dataset(_deg_corpus_repo, "corpus", cache_dir=save_root)["train"]
    for _shared_config in ("qrels", "queries"):
        load_dataset(_shared_repo, _shared_config, cache_dir=save_root)["train"]
# Image version -- real-world sets. Each repo provides its own "corpus",
# "qrels" and "queries" configs.
for _realworld_repo in (
    "rweics5cs7/exo7-realworld-db-combined",      # rvl cdip (3k), clean
    "rweics5cs7/exo7-realworld-db-combined-deg",  # rvl cdip (REALWORLD) (3k), degraded realworld
    "rweics5cs7/exo9-realworld-db-combined",      # MP-DocVQA (REALWORLD) (741), degraded realworld
    "rweics5cs7/exo10-realworld-db-combined",     # ArxivQA (REALWORLD) (3000), degraded realworld
):
    for _realworld_config in ("corpus", "qrels", "queries"):
        load_dataset(_realworld_repo, _realworld_config, cache_dir=save_root)["train"]
# # OCR version (PPOCR-v5) -- currently disabled
# ## Clean PlotQA
# load_dataset("rweics5cs7/exo3-original-PlotQA-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-PlotQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-PlotQA", "queries", cache_dir=save_root)["train"]
# ## Clean SlideVQA
# load_dataset("rweics5cs7/exo3-original-SlideVQA-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-SlideVQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-SlideVQA", "queries", cache_dir=save_root)["train"]
# ## Clean InfoVQA
# load_dataset("rweics5cs7/exo3-original-InfoVQA-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-InfoVQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-InfoVQA", "queries", cache_dir=save_root)["train"]
# ## Clean ArxivQA
# load_dataset("rweics5cs7/exo3-original-ArxivQA-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ArxivQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ArxivQA", "queries", cache_dir=save_root)["train"]
# ## Clean ChartQA
# load_dataset("rweics5cs7/exo3-original-ChartQA-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ChartQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ChartQA", "queries", cache_dir=save_root)["train"]
# ## Clean MP-DocVQA
# load_dataset("rweics5cs7/exo3-original-MP-DocVQA-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-MP-DocVQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-MP-DocVQA", "queries", cache_dir=save_root)["train"]
# ## PlotQA (degraded (synthetic)); shares "qrels" and "queries" with the clean set
# load_dataset("rweics5cs7/exo3-original-PlotQA-text-deg", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-PlotQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-PlotQA", "queries", cache_dir=save_root)["train"]
# ## SlideVQA (degraded (synthetic)); shares "qrels" and "queries" with the clean set
# load_dataset("rweics5cs7/exo3-original-SlideVQA-text-deg", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-SlideVQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-SlideVQA", "queries", cache_dir=save_root)["train"]
# ## InfoVQA (degraded (synthetic)); shares "qrels" and "queries" with the clean set
# load_dataset("rweics5cs7/exo3-original-InfoVQA-text-deg", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-InfoVQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-InfoVQA", "queries", cache_dir=save_root)["train"]
# ## ArxivQA (degraded (synthetic)); shares "qrels" and "queries" with the clean set
# load_dataset("rweics5cs7/exo3-original-ArxivQA-text-deg", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ArxivQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ArxivQA", "queries", cache_dir=save_root)["train"]
# ## ChartQA (degraded (synthetic)); shares "qrels" and "queries" with the clean set
# load_dataset("rweics5cs7/exo3-original-ChartQA-text-deg", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ChartQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-ChartQA", "queries", cache_dir=save_root)["train"]
# ## MP-DocVQA (degraded (synthetic)); shares "qrels" and "queries" with the clean set
# load_dataset("rweics5cs7/exo3-original-MP-DocVQA-text-deg", "corpus", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-MP-DocVQA", "qrels", cache_dir=save_root)["train"]
# load_dataset("openbmb/VisRAG-Ret-Test-MP-DocVQA", "queries", cache_dir=save_root)["train"]
# ## rvl cdip (3k), clean
# load_dataset("rweics5cs7/exo8-realworld-db-combined-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo8-realworld-db-combined-text", "qrels", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo8-realworld-db-combined-text", "queries", cache_dir=save_root)["train"]
# ## rvl cdip (REALWORLD) (3k) degraded realworld
# load_dataset("rweics5cs7/exo8-realworld-db-combined-text-deg", "corpus", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo8-realworld-db-combined-text-deg", "qrels", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo8-realworld-db-combined-text-deg", "queries", cache_dir=save_root)["train"]
# ## MP-DocVQA (REALWORLD) (741) degraded realworld
# load_dataset("rweics5cs7/exo9-realworld-db-combined-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo9-realworld-db-combined-text", "qrels", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo9-realworld-db-combined-text", "queries", cache_dir=save_root)["train"]
# ## ArxivQA (REALWORLD) (3000) degraded realworld
# load_dataset("rweics5cs7/exo10-realworld-db-combined-text", "corpus", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo10-realworld-db-combined-text", "qrels", cache_dir=save_root)["train"]
# load_dataset("rweics5cs7/exo10-realworld-db-combined-text", "queries", cache_dir=save_root)["train"]
# OCR version (PPOCR-v3) -- clean benchmarks.
# The text corpus comes from the "-text-v3" repo; "qrels" and "queries" are
# loaded from the corresponding openbmb benchmark repo.
for _ocr_corpus_repo, _labels_repo in (
    ("rweics5cs7/exo3-original-PlotQA-text-v3", "openbmb/VisRAG-Ret-Test-PlotQA"),
    ("rweics5cs7/exo3-original-SlideVQA-text-v3", "openbmb/VisRAG-Ret-Test-SlideVQA"),
    ("rweics5cs7/exo3-original-InfoVQA-text-v3", "openbmb/VisRAG-Ret-Test-InfoVQA"),
    ("rweics5cs7/exo3-original-ArxivQA-text-v3", "openbmb/VisRAG-Ret-Test-ArxivQA"),
    ("rweics5cs7/exo3-original-ChartQA-text-v3", "openbmb/VisRAG-Ret-Test-ChartQA"),
    ("rweics5cs7/exo3-original-MP-DocVQA-text-v3", "openbmb/VisRAG-Ret-Test-MP-DocVQA"),
):
    # Fixed order per benchmark: corpus, qrels, queries.
    load_dataset(_ocr_corpus_repo, "corpus", cache_dir=save_root)["train"]
    load_dataset(_labels_repo, "qrels", cache_dir=save_root)["train"]
    load_dataset(_labels_repo, "queries", cache_dir=save_root)["train"]
# OCR version (PPOCR-v3) -- synthetically degraded benchmarks.
# Only the corpus comes from the "-text-deg-v3" repo; "qrels" and "queries"
# are shared with the corresponding clean benchmark.
for _deg_text_repo, _clean_labels_repo in (
    ("rweics5cs7/exo3-original-PlotQA-text-deg-v3", "openbmb/VisRAG-Ret-Test-PlotQA"),
    ("rweics5cs7/exo3-original-SlideVQA-text-deg-v3", "openbmb/VisRAG-Ret-Test-SlideVQA"),
    ("rweics5cs7/exo3-original-InfoVQA-text-deg-v3", "openbmb/VisRAG-Ret-Test-InfoVQA"),
    ("rweics5cs7/exo3-original-ArxivQA-text-deg-v3", "openbmb/VisRAG-Ret-Test-ArxivQA"),
    ("rweics5cs7/exo3-original-ChartQA-text-deg-v3", "openbmb/VisRAG-Ret-Test-ChartQA"),
    ("rweics5cs7/exo3-original-MP-DocVQA-text-deg-v3", "openbmb/VisRAG-Ret-Test-MP-DocVQA"),
):
    load_dataset(_deg_text_repo, "corpus", cache_dir=save_root)["train"]
    for _labels_config in ("qrels", "queries"):
        load_dataset(_clean_labels_repo, _labels_config, cache_dir=save_root)["train"]
# OCR version (PPOCR-v3) -- real-world sets. Each repo provides its own
# "corpus", "qrels" and "queries" configs.
for _realworld_text_repo in (
    "rweics5cs7/exo8-realworld-db-combined-text-v3",      # rvl cdip (3k), clean
    "rweics5cs7/exo8-realworld-db-combined-text-deg-v3",  # rvl cdip (REALWORLD) (3k), degraded realworld
    "rweics5cs7/exo9-realworld-db-combined-text-v3",      # MP-DocVQA (REALWORLD) (741), degraded realworld
    "rweics5cs7/exo10-realworld-db-combined-text-v3",     # ArxivQA (REALWORLD) (3000), degraded realworld
):
    for _text_config in ("corpus", "qrels", "queries"):
        load_dataset(_realworld_text_repo, _text_config, cache_dir=save_root)["train"]