| | |
| | import csv, sys, time, requests |
| |
|
# Root URL of the cBioPortal REST API; endpoint paths are appended to it.
BASE = "https://www.cbioportal.org/api"
# HTTP headers passed along with API requests; asks for a JSON response body.
HEADERS = {"Accept": "application/json"}
| |
|
def get_all_studies(page_size=500):
    """Download every record from the ``/studies`` endpoint, page by page.

    Pages are requested in order starting at page 0 until the server
    returns an empty page.

    Args:
        page_size: Number of records requested per page (``pageSize``).

    Returns:
        A list with the decoded JSON items of all pages, in the order
        they were received.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    url = f"{BASE}/studies"
    collected = []
    page_number = 0
    while True:
        response = requests.get(
            url,
            headers=HEADERS,
            params={"pageSize": page_size, "pageNumber": page_number},
            timeout=60,
        )
        response.raise_for_status()
        chunk = response.json()
        if not chunk:
            # An empty page means we have walked past the last record.
            return collected
        collected += chunk
        page_number += 1
        # Short pause between pages so we do not hammer the server.
        time.sleep(0.2)
| |
|
def to_list(x):
    """Normalize a scalar, comma-separated string, or list into a list of strings.

    Every path yields the same shape of result: a new list of stripped,
    non-empty strings. Previously a list argument was handed back untouched,
    so its elements could be non-strings, padded, or empty, unlike the
    string path.

    Args:
        x: ``None``, a list of values, or any other value whose ``str()``
            form is treated as a comma-separated sequence.

    Returns:
        A list of stripped, non-empty strings; ``[]`` when ``x`` is ``None``
        or contains no usable entries.
    """
    if x is None:
        return []
    # Lists are taken element-wise; anything else is split on commas.
    items = x if isinstance(x, list) else str(x).split(",")
    cleaned = []
    for item in items:
        if item is None:
            # Skip missing entries instead of emitting the text "None".
            continue
        text = str(item).strip()
        if text:
            cleaned.append(text)
    return cleaned
| |
|
def main(out_csv="cbioportal_study_pmids.csv"):
    """Write one ``(studyId, pmid)`` CSV row per PMID attached to each study.

    Studies come from ``get_all_studies()``; each study's ``"pmid"`` field is
    expanded with ``to_list()``, so a study with several PMIDs yields several
    rows and a study with none yields no row.

    Args:
        out_csv: Path of the CSV file to create (overwritten if it exists).
    """
    # The header must match the keys of the row dicts exactly: DictWriter
    # raises ValueError on keys missing from ``fieldnames``. The previous
    # header used "pmids" while rows used "pmid", so writing always failed.
    fieldnames = ["studyId", "pmid"]

    rows = []
    for study in get_all_studies():
        study_id = study.get("studyId")
        for pmid in to_list(study.get("pmid")):
            rows.append({"studyId": study_id, "pmid": pmid})

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"wrote {len(rows)} rows to {out_csv}")
| |
|
if __name__ == "__main__":
    # The first command-line argument, when given, overrides the default
    # output path.
    cli_args = sys.argv[1:]
    if cli_args:
        main(cli_args[0])
    else:
        main("cbioportal_study_pmids.csv")
| |
|