"""Cross-check extracted gene/rsID pairs against GWAS Catalog associations.

Importing this module loads the catalog (from ``gwas_path`` when that file
exists, otherwise by downloading ``raw_url``) into the module-level ``gwas``
and ``ground_truth`` objects.
"""

import os
import re
import time
from collections import defaultdict
from io import StringIO

import pandas as pd
import requests

# Groups of characters that are commonly mistaken for one another.  To
# maintain the table, add a character to an existing group or append a new
# group; ``mistakes`` is derived from these groups.
_CONFUSABLE_GROUPS = ('17ILT', '0DOV', '4AX', '5S', 'FH', '9P')

# Maps each character to the other members of its group, in group order.
mistakes = {
    char: [other for other in group if other != char]
    for group in _CONFUSABLE_GROUPS
    for char in group
}

raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
gwas_path = "resources/gwas_catalog.tsv"

# Catalog columns kept by generate_raw_files().
_GWAS_COLUMNS = ['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']

# Separators between several genes / SNPs inside one catalog cell: commas and
# semicolons anywhere, but "x" and "-" only when surrounded by whitespace, so
# hyphenated symbols and identifiers that merely contain the letter "x" stay
# in one piece.
# NOTE(review): assumes the catalog writes multi-value cells as "A, B",
# "A; B", "A - B" and "A x B" -- confirm against the download's column docs.
_SEPARATOR = re.compile(r"\s*[,;]\s*|\s+[xX]\s+|\s+-\s+")


def permutate(word):
    """Return ``word`` and every variant made by swapping confusable characters.

    Each character that is a key of ``mistakes`` is either kept or replaced by
    one of its listed look-alikes; all combinations are produced, with the
    unmodified word first.  An empty ``word`` yields ``['']``.

    The number of results is the product of the number of choices per
    character, so it grows quickly for long words.
    """
    variants = ['']
    # Build right-to-left so each choice for a character is combined with
    # every already-built suffix, in the same order as a head/tail recursion.
    for char in reversed(word):
        choices = [char] + mistakes.get(char, [])
        variants = [choice + tail for choice in choices for tail in variants]
    return variants


def call(url, timeout=60, retry_delay=1, max_retries=None):
    """GET ``url`` and return the response, retrying on request failures.

    Args:
        url: Address to fetch.
        timeout: Passed to ``requests.get`` so a stalled connection cannot
            block forever.
        retry_delay: Seconds to wait after a failed attempt before retrying.
        max_retries: Maximum number of retries after the first failure;
            ``None`` (the default) retries indefinitely.

    Returns:
        The ``requests`` response.  The HTTP status is not checked here.

    Raises:
        requests.RequestException: The last failure, once ``max_retries`` is
            exhausted.
    """
    failures = 0
    while True:
        try:
            res = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Back off instead of hammering the server in a tight loop.
            time.sleep(retry_delay)
        else:
            # Pause after every successful request to space out calls.
            time.sleep(1)
            return res


def _split_cell(cell):
    """Split one catalog cell into stripped, non-empty tokens."""
    tokens = (token.strip() for token in _SEPARATOR.split(str(cell)))
    return [token for token in tokens if token]


def generate_raw_files():
    """Load the GWAS table and build the gene <-> SNP lookup.

    The table is read from ``gwas_path`` when that file exists; otherwise it
    is downloaded from ``raw_url`` (the download is not written to disk).

    Returns:
        A ``(gwas, ground_truth)`` tuple.  ``gwas`` is the DataFrame restricted
        to the trait, chromosome, mapped-gene, SNP, p-value and OR/beta
        columns.  ``ground_truth`` is a ``defaultdict(list)`` mapping every
        gene (upper-cased, spaces removed) to the SNPs listed with it and
        every SNP to those genes; repeated pairs are kept.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    # Load Raw GWAS files
    if os.path.exists(gwas_path):
        source = gwas_path
    else:
        response = requests.get(raw_url, timeout=300)
        # Fail here rather than with a confusing KeyError when an error page
        # is parsed as TSV below.
        response.raise_for_status()
        source = StringIO(response.content.decode('utf-8'))
    gwas = pd.read_csv(source, delimiter='\t')[_GWAS_COLUMNS]

    # Load Genes and SNPs from GWAS; dropna() returns a new frame, so the
    # catalog itself is left untouched.
    pairs = gwas[['MAPPED_GENE', 'SNPS']].dropna()

    # Generate Genes and SNPs mapping
    ground_truth = defaultdict(list)
    for gene_cell, snp_cell in zip(pairs['MAPPED_GENE'], pairs['SNPS']):
        # Split first, normalise afterwards: removing spaces or upper-casing
        # before splitting would destroy the " x " / " - " separators.
        genes = [gene.replace(' ', '').upper() for gene in _split_cell(gene_cell)]
        snps = _split_cell(snp_cell)
        for gene in genes:
            for snp in snps:
                ground_truth[gene].append(snp)
                ground_truth[snp].append(gene)

    return gwas, ground_truth


# Loaded once at import time; integrate() reads the module-level ``gwas``.
gwas, ground_truth = generate_raw_files()


def integrate(df):
    """Append GWAS rows that match the extracted gene/rsID pairs to ``df``.

    For every row of ``df``, catalog rows whose ``MAPPED_GENE`` contains the
    row's ``Genes`` text and whose ``SNPS`` contains its ``rsID`` text are
    collected.  Matching is a literal, case-sensitive substring test.
    NOTE(review): a short rsID therefore also matches longer IDs that contain
    it -- confirm whether token-level matching is wanted.

    Args:
        df: Extractor result with at least ``Genes`` and ``rsID`` columns.
            It is not modified.

    Returns:
        A new DataFrame with the rows of ``df`` followed by the matched
        catalog rows, renamed to ``Traits`` / ``Genes`` / ``rsID`` /
        ``P Value`` / ``OR Value``, with ``Beta Value`` copied from
        ``OR Value`` and ``Source`` set to ``'Database'``.  Only the columns
        of ``df`` are kept; columns the catalog cannot supply are left empty.
        The index is renumbered from 0.
    """
    # Loop through extractor result
    matches = []
    for gene, snp in zip(df['Genes'], df['rsID']):
        # Missing values cannot be searched for, and an empty string would
        # match every catalog row.
        if not (isinstance(gene, str) and isinstance(snp, str)):
            continue
        if not gene.strip() or not snp.strip():
            continue
        # regex=False: extractor text is data, not a pattern.
        hit = (gwas['MAPPED_GENE'].str.contains(gene, na=False, regex=False)
               & gwas['SNPS'].str.contains(snp, na=False, regex=False))
        matches.append(gwas[hit])
    # Concatenate once instead of growing a frame inside the loop.
    df_db = pd.concat(matches) if matches else gwas.iloc[0:0].copy()

    # Adjust new column
    df_db = df_db.rename(columns={
        'DISEASE/TRAIT': 'Traits',
        'MAPPED_GENE': 'Genes',
        'SNPS': 'rsID',
        'P-VALUE': 'P Value',
        'OR or BETA': 'OR Value'
    })
    df_db = df_db.drop(columns=['CHR_ID'], errors='ignore')
    df_db['Beta Value'] = df_db['OR Value']
    df_db['Source'] = 'Database'

    # Combine raw and database.  reindex() keeps the database rows even when
    # ``df`` has columns the catalog lacks, instead of dropping them all.
    df_db = df_db.reindex(columns=df.columns)
    return pd.concat([df, df_db], ignore_index=True)