|
|
|
"""
|
|
Created on Wed Jun 8 09:26:57 2022
|
|
|
|
@author: luol2
|
|
|
|
Pipeline: first gene NER, then species assignment
|
|
input: species NER bioc xml file
|
|
output: gene ner and species assignment results bioc xml file
|
|
"""
|
|
import argparse
|
|
import os
|
|
import io
|
|
import time
|
|
import sys
|
|
import re
|
|
import shutil
|
|
from src_python.GeneNER import model_ner,ner_tag
|
|
from src_python.SpeAss import model_sa,sa_tag
|
|
|
|
import tensorflow as tf
|
|
|
|
import bioc
|
|
import stanza
|
|
# Shared stanza tokenizer (spaCy tokenize processor), loaded once at import time
# and reused by both the gene-NER and species-assignment taggers below.
# NOTE(review): package='None' passes the *string* 'None', not the None object --
# confirm this is the intended way to disable package selection for this stanza version.
nlp_token = stanza.Pipeline(model_dir='gnorm_trained_models/stanza', lang='en', processors={'tokenize': 'spacy'},package='None', download_method=None)
|
|
|
|
def NER_BioC(infolder,infile,outpath,nn_model):
    """Run gene NER over one BioC XML file.

    Loads <infolder>/<infile>, tags every non-empty, non-'ref' passage with
    ner_tag.ML_Tag, appends the recognized mentions as new BioCAnnotations
    (with document-level offsets), renumbers every annotation id so ids are
    unique within each document, and dumps the collection to <outpath>/<infile>.

    Args:
        infolder: folder containing the input BioC file.
        infile:   file name (the output uses the same name).
        outpath:  folder for the tagged output file.
        nn_model: loaded GeneNER model passed to ner_tag.ML_Tag().
    """

    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        with open(outpath+"/"+infile,'w', encoding='utf8') as fout:
            collection = bioc.load(fin)

            Total_n=len(collection.documents)
            print('Total number of sub-documents:', Total_n)
            pmid_n=0
            for document in collection.documents:
                print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
                pmid_n+=1

                # Running counter used to renumber ALL annotations (pre-existing
                # species mentions and newly added gene mentions) so that ids
                # are unique across the whole document.
                mention_num_new=0
                for passage in document.passages:
                    # Skip empty/whitespace passages and reference sections.
                    if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref':
                        passage_offset=passage.offset
                        tag_result=ner_tag.ML_Tag(passage.text,nn_model,nlp_token)
                        mention_num=0
                        for ele in tag_result:
                            # ele is (start, end, type) with passage-relative offsets.
                            bioc_note = bioc.BioCAnnotation()
                            bioc_note.id = str(mention_num)  # provisional id; renumbered below
                            mention_num+=1
                            bioc_note.infons['type'] = ele[2]
                            start = int(ele[0])
                            last = int(ele[1])
                            # BioC locations are document-level, hence +passage_offset.
                            loc = bioc.BioCLocation(offset=str(passage_offset+start), length= str(last-start))
                            bioc_note.locations.append(loc)
                            bioc_note.text = passage.text[start:last]
                            passage.annotations.append(bioc_note)

                    for temp_annotation in passage.annotations:
                        temp_annotation.id=str(mention_num_new)
                        mention_num_new+=1
            bioc.dump(collection, fout, pretty_print=True)
|
|
|
|
def NER_PubTator(infolder,infile,outpath,nn_model):
    """Run gene NER over one PubTator-format file.

    Each blank-line-separated sub-document ("pmid|t|title" then optionally
    "pmid|a|abstract" and annotation lines) is tagged over the concatenated
    title + ' ' + abstract text.  The original document is echoed to the output
    followed by one tab-separated line per recognized mention:
    pmid<TAB>start<TAB>end<TAB>mention<TAB>type

    Args:
        infolder: folder containing the input file.
        infile:   file name (the output uses the same name).
        outpath:  folder for the tagged output file.
        nn_model: loaded GeneNER model passed to ner_tag.ML_Tag().
    """
    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        with open(outpath+"/"+infile,'w', encoding='utf-8') as fout:
            all_text=fin.read().strip().split('\n\n')
            Total_n=len(all_text)
            print('Total number of sub-documents:', Total_n)
            pmid_n=0
            for doc in all_text:
                print("Processing:{0}%".format(round(pmid_n * 100 / Total_n)), end="\r")
                pmid_n+=1
                lines = doc.split('\n')
                seg=lines[0].split('|t|')
                pmid=seg[0]
                title=""
                if len(seg)>1:
                    title=seg[1]
                abstract=""
                if len(lines)>1:
                    seg=lines[1].split('|a|')
                    # Fixed: only index seg[1] after confirming the '|a|'
                    # separator is present.  The old code assigned seg[1]
                    # unconditionally before the guard, raising IndexError on
                    # documents whose second line has no '|a|'.
                    if len(seg)>1:
                        abstract=seg[1]

                # Offsets from ML_Tag are relative to title+' '+abstract, which
                # matches the PubTator title/abstract offset convention.
                intext=title+' '+abstract
                tag_result=ner_tag.ML_Tag(intext,nn_model,nlp_token)
                fout.write(doc+'\n')
                for ele in tag_result:
                    ent_start = ele[0]
                    ent_last = ele[1]
                    ent_mention = intext[int(ele[0]):int(ele[1])]
                    ent_type=ele[2]
                    fout.write(pmid+"\t"+ent_start+"\t"+ent_last+"\t"+ent_mention+"\t"+ent_type+"\n")
                fout.write('\n')
|
|
|
|
def geneNER(infolder, outpath, modelfile):
    """Gene NER driver: load the model, then tag every file in *infolder*.

    The format of each input file (BioC XML vs PubTator) is sniffed from its
    first matching line.  Files whose output already exists are skipped so an
    interrupted run can be resumed.

    Args:
        infolder:  folder of input files (species-NER results).
        outpath:   folder for tagged output (same file names).
        modelfile: trained NER model weights; a name containing 'bioformer'
                   selects the bioformer encoder, anything else PubMedBERT.
    """

    print('loading NER models........')

    if modelfile.lower().find('bioformer')>=0:
        vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/',
                    'lowercase':False,
                    }
    else:
        vocabfiles={'labelfile':'./vocab/GeneNER_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
                    'lowercase':True,
                    }

    nn_model=model_ner.HUGFACE_NER(vocabfiles)
    nn_model.build_encoder()
    nn_model.build_softmax_decoder()
    nn_model.load_model(modelfile)

    # Compile the format-sniffing patterns once (the old code recompiled them
    # for every line of every file).  BioC files contain a <collection>
    # element; PubTator files start with "pmid|t|title"-style lines.
    pattern_bioc = re.compile(r'.*<collection>.*')
    pattern_pubtator = re.compile(r'^([^\|]+)\|[^\|]+\|(.*)')

    print("begin GeneNER tagging........")
    start_time=time.time()

    for infile in os.listdir(infolder):
        if os.path.isfile(outpath+"/"+infile):
            print(infile+' has existed.')
        else:
            print('processing:',infile)
            input_format=""
            # 'with' guarantees the handle is closed even if sniffing fails.
            with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
                for line in fin:
                    if pattern_bioc.search(line):
                        input_format="BioC"
                        break
                    elif pattern_pubtator.search(line):
                        input_format="PubTator"
                        break
            if(input_format == "PubTator"):
                NER_PubTator(infolder,infile,outpath,nn_model)
            elif(input_format == "BioC"):
                NER_BioC(infolder,infile,outpath,nn_model)

    print('tag done:',time.time()-start_time)
|
|
|
|
|
|
|
|
def SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
    """Species assignment (SA) for one gene-NER BioC XML file.

    Strategy: each document is serialized into PubTator text in one of three
    in-memory buffers depending on how many candidate ("*"-marked) species it
    contains -- fin_pubtator2 (>=2 species: run the ML model),
    fin_pubtator1 (exactly one: assign it directly), fin_pubtator0 (none:
    fall back to per-gene majority / document majority / human 9606).
    The resolved species per annotation id is collected in final_sa_results;
    the input file is then re-read and written to <outpath>/<infile> with an
    'Identifier' infon ("Focus:<taxid>") added to every annotation lacking one.

    Args:
        infolder/infile/outpath: input folder, file name, output folder.
        nn_model: loaded SpeAss model (passed to sa_tag.ml_tag_main).
        virus_set: taxonomy ids of human-infecting viruses; assignments to one
            of these get ',9606' (human host) appended.
        prefix_dict: species id -> list of 2-char gene-symbol prefixes used to
            override the ML prediction for species-specific gene names.
    """

    fin = open(infolder+"/"+infile, 'r',encoding='utf-8')

    fin_pubtator0=io.StringIO()   # docs with genes but no candidate species
    fin_pubtator1=io.StringIO()   # docs with exactly one candidate species
    fin_pubtator2=io.StringIO()   # docs with >=2 candidate species (need ML)
    collection = bioc.load(fin)
    fin.close()
    ori_ann_index={}              # pmid -> {annotation id: 'start-end'}; collected but not read below
    species_count={}              # pmid -> {'*taxid': occurrence count}
    gene_set=['Gene','FamilyName']
    final_sa_results={}           # pmid -> {annotation id: 'Focus:taxid'}
    for document in collection.documents:
        doc_pmid=document.id
        doc_title=''
        doc_abstract=''
        doc_annotation=[]
        _ann_index={}
        _species_num={}           # candidate species counts for this document
        _gene_num=0
        _passage_num=0
        # <=2 passages: title/abstract article, handled at document level;
        # otherwise: full text, handled passage by passage.
        if len(document.passages)<=2:
            for passage in document.passages:
                passage_offset=passage.offset
                _passage_num+=1

                if _passage_num==1:
                    # First passage is the title.
                    doc_title=passage.text
                    for temp_annotation in passage.annotations:
                        if temp_annotation.infons['type'] in gene_set:
                            _gene_num+=1
                        # Convert document-level BioC offsets to text-relative ones.
                        ent_start=temp_annotation.locations[0].offset-passage_offset
                        ent_end=ent_start+temp_annotation.locations[0].length

                        _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)

                        if 'Identifier' in temp_annotation.infons.keys():

                            species_ID=temp_annotation.infons['Identifier']
                            # '*' marks a candidate focus species.
                            if species_ID.find('*')>=0:
                                if species_ID not in _species_num.keys():
                                    _species_num[species_ID]=1
                                else:
                                    _species_num[species_ID]+=1
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
                        else:
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])

                else:
                    # Second passage is the abstract; offsets are shifted by
                    # len(title)+1 to address into title+' '+abstract.
                    doc_abstract=passage.text
                    for temp_annotation in passage.annotations:
                        if temp_annotation.infons['type'] in gene_set:
                            _gene_num+=1
                        ent_start=len(doc_title)+1+temp_annotation.locations[0].offset-passage_offset
                        ent_end=ent_start+temp_annotation.locations[0].length

                        _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)
                        if 'Identifier' in temp_annotation.infons.keys():

                            species_ID=temp_annotation.infons['Identifier']
                            if species_ID.find('*')>=0:
                                if species_ID not in _species_num.keys():
                                    _species_num[species_ID]=1
                                else:
                                    _species_num[species_ID]+=1
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
                        else:
                            doc_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])

            # Route the whole document to one buffer by candidate-species count;
            # documents without any gene mention are skipped entirely.
            if len(_species_num)>=2 and _gene_num>0:
                fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
                fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
                for ele in doc_annotation:
                    fin_pubtator2.write(ele+'\n')
                fin_pubtator2.write('\n')
            elif len(_species_num)==1 and _gene_num>0:
                fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
                fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
                # Single-element unpack of the one candidate species key.
                major_speicesid,=_species_num
                # [1:] strips the leading '*'; becomes line 3 of the doc.
                fin_pubtator1.write(major_speicesid[1:]+'\n')
                for ele in doc_annotation:
                    fin_pubtator1.write(ele+'\n')
                fin_pubtator1.write('\n')
            elif len(_species_num)==0 and _gene_num>0:
                fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
                fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
                for ele in doc_annotation:
                    fin_pubtator0.write(ele+'\n')
                fin_pubtator0.write('\n')

        else:
            # Full-text document: each passage is routed independently, using
            # the passage text as the "title" and an empty abstract.
            for passage in document.passages:
                passage_annotation=[]
                _species_num_passage={}   # candidate species within this passage only
                _gene_num_passage=0
                passage_offset=passage.offset

                if passage.text!='' and (not passage.text.isspace()) and passage.infons['type']!='ref':
                    doc_title=passage.text
                    for temp_annotation in passage.annotations:
                        if temp_annotation.infons['type'] in gene_set:
                            _gene_num_passage+=1
                        ent_start=temp_annotation.locations[0].offset-passage_offset
                        ent_end=ent_start+temp_annotation.locations[0].length

                        _ann_index[temp_annotation.id]=str(ent_start)+'-'+str(ent_end)

                        if 'Identifier' in temp_annotation.infons.keys():

                            species_ID=temp_annotation.infons['Identifier']
                            if species_ID.find('*')>=0:
                                # Count at both document level (for fallbacks)
                                # and passage level (for routing).
                                if species_ID not in _species_num.keys():
                                    _species_num[species_ID]=1
                                else:
                                    _species_num[species_ID]+=1
                                if species_ID not in _species_num_passage.keys():
                                    _species_num_passage[species_ID]=1
                                else:
                                    _species_num_passage[species_ID]+=1
                            passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type']+'\t'+species_ID)
                        else:
                            passage_annotation.append(doc_pmid+'\t'+temp_annotation.id+'\t'+str(ent_start)+'\t'+str(ent_end)+'\t'+temp_annotation.text+'\t'+temp_annotation.infons['type'])

                    if len(_species_num_passage)>=2 and _gene_num_passage>0:
                        fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
                        fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
                        for ele in passage_annotation:
                            fin_pubtator2.write(ele+'\n')
                        fin_pubtator2.write('\n')
                    elif len(_species_num_passage)==1 and _gene_num_passage>0:
                        fin_pubtator1.write(doc_pmid+'|t|'+doc_title+'\n')
                        fin_pubtator1.write(doc_pmid+'|a|'+doc_abstract+'\n')
                        major_speicesid,=_species_num_passage
                        fin_pubtator1.write(major_speicesid[1:]+'\n')
                        for ele in passage_annotation:
                            fin_pubtator1.write(ele+'\n')
                        fin_pubtator1.write('\n')
                    elif len(_species_num_passage)==0 and _gene_num_passage>0:
                        fin_pubtator0.write(doc_pmid+'|t|'+doc_title+'\n')
                        fin_pubtator0.write(doc_pmid+'|a|'+doc_abstract+'\n')
                        for ele in passage_annotation:
                            fin_pubtator0.write(ele+'\n')
                        fin_pubtator0.write('\n')

        ori_ann_index[doc_pmid]=_ann_index
        species_count[doc_pmid]=_species_num

    # pmid -> {gene mention text: {'Focus:taxid': count}}; reused below to
    # resolve documents/mentions the ML model never saw.
    cache_geneid={}

    # --- Buffer 2: ambiguous documents, resolved by the ML model. ---
    if fin_pubtator2.getvalue()!='':

        ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)

        fin_result=io.StringIO(ml_out.getvalue())
        all_in=fin_result.read().strip().split('\n\n')

        fin_result.close()

        prefix_speid_allset=set(prefix_dict.keys())

        for doc in all_in:
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            # Map 2-char gene-symbol prefixes to species ids, but only for
            # species actually mentioned in this document.
            _prefix_str2id_dict={}
            doc_species=list(species_count[pmid].keys())
            for _spe_ele in doc_species:
                if _spe_ele[1:] in prefix_speid_allset:
                    for ele in prefix_dict[_spe_ele[1:]]:
                        _prefix_str2id_dict[ele]=_spe_ele[1:]

            for i in range(2,len(lines)):
                # segs: pmid, ann id, start, end, mention, type, species id.
                segs=lines[i].split('\t')
                if pmid not in final_sa_results.keys():
                    final_sa_results[pmid]={segs[1]:'Focus:'+segs[-1]}
                else:
                    final_sa_results[pmid][segs[1]]='Focus:'+segs[-1]

                if segs[5] in gene_set:
                    # Prefix rule overrides the ML prediction for genes whose
                    # first two characters are a known species-specific prefix.
                    if segs[4][0:2] in _prefix_str2id_dict:

                        if pmid not in final_sa_results.keys():
                            final_sa_results[pmid]={segs[1]:'Focus:'+_prefix_str2id_dict[segs[4][0:2]]}
                        else:
                            final_sa_results[pmid][segs[1]]='Focus:'+_prefix_str2id_dict[segs[4][0:2]]
                    # Remember how often each gene text was assigned each
                    # species (majority vote used for buffer-0 documents).
                    if pmid not in cache_geneid.keys():
                        cache_geneid[pmid]={segs[4]:{'Focus:'+segs[-1]:1}}
                    else:
                        if segs[4] not in cache_geneid[pmid].keys():
                            cache_geneid[pmid][segs[4]]={'Focus:'+segs[-1]:1}
                        else:
                            # NOTE(review): the membership test uses the bare
                            # segs[-1] while the stored keys carry a 'Focus:'
                            # prefix, so this test is always True and the count
                            # is reset to 1 instead of incremented -- confirm
                            # whether segs[-1] should be 'Focus:'+segs[-1] here.
                            if segs[-1] not in cache_geneid[pmid][segs[4]].keys():
                                cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]=1
                            else:
                                cache_geneid[pmid][segs[4]]['Focus:'+segs[-1]]+=1

    # --- Buffer 1: exactly one candidate species -> assign it to all genes. ---
    if fin_pubtator1.getvalue()!='':
        fin_result=io.StringIO(fin_pubtator1.getvalue())
        all_in=fin_result.read().strip().split('\n\n')
        fin_result.close()

        for doc in all_in:
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            # Line 3 holds the single candidate species id (written above).
            major_speicesid=lines[2]
            for i in range(3,len(lines)):
                segs=lines[i].split('\t')
                if len(segs)>=7:
                    # 7 columns: annotation already carries a species id.
                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:segs[-1]}
                    else:
                        final_sa_results[pmid][segs[1]]=segs[-1]
                else:
                    marjor_species='Focus:'+major_speicesid
                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:marjor_species}
                    else:
                        final_sa_results[pmid][segs[1]]=marjor_species
                    if pmid not in cache_geneid.keys():
                        cache_geneid[pmid]={segs[4]:{marjor_species:1}}
                    else:
                        if segs[4] not in cache_geneid[pmid].keys():
                            cache_geneid[pmid][segs[4]]={marjor_species:1}
                        else:
                            # NOTE(review): same bare-segs[-1] vs prefixed-key
                            # mismatch as in the buffer-2 branch above.
                            if segs[-1] not in cache_geneid[pmid][segs[4]].keys():
                                cache_geneid[pmid][segs[4]][marjor_species]=1
                            else:
                                cache_geneid[pmid][segs[4]][marjor_species]+=1

    # --- Buffer 0: no candidate species -> per-gene majority from the cache,
    # then document-level species majority, then human (9606). ---
    fin_result=io.StringIO(fin_pubtator0.getvalue())
    all_in=fin_result.read().strip().split('\n\n')
    fin_result.close()

    for doc in all_in:
        lines=doc.split('\n')
        pmid=lines[0].split('|t|')[0]

        for i in range(2,len(lines)):
            segs=lines[i].split('\t')
            if (pmid in cache_geneid.keys()) and (segs[4] in cache_geneid[pmid].keys()):
                # max over (count, 'Focus:taxid') pairs -> most frequent species.
                marjor_species = max(zip(cache_geneid[pmid][segs[4]].values(), cache_geneid[pmid][segs[4]].keys()))
                if pmid not in final_sa_results.keys():
                    final_sa_results[pmid]={segs[1]:marjor_species[1]}
                else:
                    final_sa_results[pmid][segs[1]]=marjor_species[1]
            else:
                if (pmid in species_count.keys()) and len(species_count[pmid])>0:
                    marjor_species = max(zip(species_count[pmid].values(), species_count[pmid].keys()))

                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:'Focus:'+marjor_species[1][1:]}
                    else:
                        final_sa_results[pmid][segs[1]]='Focus:'+marjor_species[1][1:]
                else:
                    # Last-resort default: human.
                    if pmid not in final_sa_results.keys():
                        final_sa_results[pmid]={segs[1]:'Focus:9606'}
                    else:
                        final_sa_results[pmid][segs[1]]='Focus:9606'

    # --- Write the results back into a fresh copy of the input BioC file. ---
    fin = open(infolder+"/"+infile, 'r',encoding='utf-8')
    fout_xml=open(outpath+"/"+infile,'w', encoding='utf8')
    collection = bioc.load(fin)
    for document in collection.documents:
        doc_pmid=document.id

        for passage in document.passages:
            for temp_annotation in passage.annotations:
                # Only annotations without an Identifier (i.e. genes) get one.
                if 'Identifier' not in temp_annotation.infons.keys():
                    # NOTE(review): assumes doc_pmid is present in
                    # final_sa_results whenever an un-identified annotation
                    # exists -- verify for documents whose genes all sit in
                    # skipped (e.g. 'ref') passages.
                    if temp_annotation.id in final_sa_results[doc_pmid].keys():
                        # [6:] strips 'Focus:'; virus hosts also get human.
                        if final_sa_results[doc_pmid][temp_annotation.id][6:] in virus_set:
                            temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]+',9606'

                        else:
                            temp_annotation.infons['Identifier']=final_sa_results[doc_pmid][temp_annotation.id]
                    else:
                        # Fall back to the majority species for this mention text.
                        if (doc_pmid in cache_geneid.keys()) and (temp_annotation.text in cache_geneid[doc_pmid].keys()):
                            marjor_species = max(zip(cache_geneid[doc_pmid][temp_annotation.text].values(), cache_geneid[doc_pmid][temp_annotation.text].keys()))
                            temp_annotation.infons['Identifier']=marjor_species[1]
                        else:

                            temp_annotation.infons['Identifier']='Focus:9606'
    bioc.dump(collection, fout_xml, pretty_print=True)
    fin.close()
    fout_xml.close()
|
|
|
|
|
|
|
|
def SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict):
    """Species assignment for one PubTator-format file.

    Pass 1: per document, collect candidate ("*"-marked) species, resolve
    unassigned gene mentions by the species-specific gene-symbol-prefix rule
    where possible, and stage documents that still need the ML model
    (>=2 candidate species and at least one unresolved gene) in an in-memory
    PubTator buffer.  The ML model is run once over that buffer.
    Pass 2: rewrite the file, appending a "Focus:<taxid>" column to each
    unassigned gene line using (in order) the ML/prefix result, the single
    candidate species, or human (9606) as the no-species fallback.  Species in
    *virus_set* additionally get ',9606' (human host) appended.

    Args:
        infolder/infile/outpath: input folder, file name, output folder.
        nn_model: loaded SpeAss model (passed to sa_tag.ml_tag_main).
        virus_set: taxonomy ids of human-infecting viruses.
        prefix_dict: species id -> list of 2-char gene-symbol prefixes.
    """
    with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
        all_in_ori=fin.read().strip().split('\n\n')

    fin_pubtator2=io.StringIO()   # staging buffer for documents that need the ML model
    species_gene_count={}         # pmid -> {'spec': set of '*taxid', 'gene': gene-mention count}
    gene_set=['Gene','FamilyName']
    ML_results={}                 # pmid -> {'start-end': assigned species id}

    prefix_speid_allset=set(prefix_dict.keys())

    for document in all_in_ori:
        lines=document.split('\n')
        doc_pmid=lines[0].split('|t|')[0]
        doc_title=lines[0].split('|t|')[1]
        doc_abstract=lines[1].split('|a|')[1]
        doc_annotation=[]
        _species_num=set()
        _gene_num=0
        _ML_gene_num=0            # genes still unassigned after the prefix rule
        _entity_num=0             # running entity index inserted as column 2
        _prefix_str2id_dict={}    # 2-char gene prefix -> species id (this document)
        for i in range(2,len(lines)):
            # annotation line: pmid, start, end, mention, type[, species id]
            segs=lines[i].split('\t')
            if segs[4] in gene_set:
                _gene_num+=1
            if len(segs)>=6:
                # Line already carries an id column (species annotation).
                doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
                species_ID=segs[-1]
                if species_ID.find('*')>=0:   # '*' marks a candidate focus species
                    _species_num.add(species_ID)
                    if species_ID[1:] in prefix_speid_allset:
                        for ele in prefix_dict[species_ID[1:]]:
                            _prefix_str2id_dict[ele]=species_ID[1:]
            else:
                # Unassigned mention: try the gene-symbol-prefix rule first.
                if segs[3][0:2] in _prefix_str2id_dict:
                    if _prefix_str2id_dict[segs[3][0:2]] in virus_set:
                        doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]]+',9606')
                        if doc_pmid not in ML_results.keys():
                            ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]+',9606'}
                        else:
                            ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]+',9606'
                    else:
                        doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:])+'\tFocus:'+_prefix_str2id_dict[segs[3][0:2]])
                        if doc_pmid not in ML_results.keys():
                            ML_results[doc_pmid]={segs[1]+'-'+segs[2]:_prefix_str2id_dict[segs[3][0:2]]}
                        else:
                            ML_results[doc_pmid][segs[1]+'-'+segs[2]]=_prefix_str2id_dict[segs[3][0:2]]
                else:
                    doc_annotation.append(segs[0]+'\t'+str(_entity_num)+'\t'+'\t'.join(segs[1:]))
                    if segs[4] in gene_set:
                        _ML_gene_num+=1
            _entity_num+=1

        # Only genuinely ambiguous documents are sent to the ML model.
        if len(_species_num)>=2 and _ML_gene_num>0:
            fin_pubtator2.write(doc_pmid+'|t|'+doc_title+'\n')
            fin_pubtator2.write(doc_pmid+'|a|'+doc_abstract+'\n')
            for ele in doc_annotation:
                fin_pubtator2.write(ele+'\n')
            fin_pubtator2.write('\n')

        species_gene_count[doc_pmid]={'spec':_species_num,'gene':_gene_num}

    if fin_pubtator2.getvalue()!='':

        ml_out= sa_tag.ml_tag_main(fin_pubtator2,nlp_token, nn_model)

        fin_result=io.StringIO(ml_out.getvalue())
        all_in=fin_result.read().strip().split('\n\n')
        fin_result.close()
        for doc in all_in:
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            for i in range(2,len(lines)):
                # Staged lines carry the extra entity-number column, so start/end
                # are segs[2]/segs[3] here (vs segs[1]/segs[2] in the original file).
                segs=lines[i].split('\t')
                if pmid not in ML_results.keys():
                    ML_results[pmid]={segs[2]+'-'+segs[3]:segs[-1]}
                else:
                    ML_results[pmid][segs[2]+'-'+segs[3]]=segs[-1]

    # Pass 2: rewrite the original documents with the assignments appended.
    with open(outpath+"/"+infile,'w', encoding='utf8') as fout_pubtator:
        for doc in all_in_ori:
            lines=doc.split('\n')
            pmid=lines[0].split('|t|')[0]
            fout_pubtator.write(lines[0]+'\n'+lines[1]+'\n')
            if len(species_gene_count[pmid]['spec'])>1 and species_gene_count[pmid]['gene']>0:
                for i in range(2,len(lines)):
                    segs=lines[i].split('\t')
                    if len(segs)>=6:
                        fout_pubtator.write(lines[i]+'\n')
                    else:
                        if ML_results[pmid][segs[1]+'-'+segs[2]] in virus_set:
                            fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+',9606'+'\n')
                        else:
                            fout_pubtator.write(lines[i]+'\tFocus:'+ML_results[pmid][segs[1]+'-'+segs[2]]+'\n')
                fout_pubtator.write('\n')

            elif len(species_gene_count[pmid]['spec'])==1 and species_gene_count[pmid]['gene']>0:
                for i in range(2,len(lines)):
                    segs=lines[i].split('\t')
                    if len(segs)>=6:
                        fout_pubtator.write(lines[i]+'\n')
                    else:
                        # Single-element unpack of the one candidate species.
                        major_species,=species_gene_count[pmid]['spec']
                        if major_species[1:] in virus_set:
                            fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+',9606'+'\n')
                        else:
                            # Fixed: this write was previously unconditional
                            # (missing 'else:'), so virus-species documents
                            # emitted each unassigned gene line twice.
                            fout_pubtator.write(lines[i]+'\tFocus:'+major_species[1:]+'\n')
                fout_pubtator.write('\n')

            elif len(species_gene_count[pmid]['spec'])==0 and species_gene_count[pmid]['gene']>0:
                for i in range(2,len(lines)):
                    segs=lines[i].split('\t')
                    if len(segs)>=6:
                        fout_pubtator.write(lines[i]+'\n')
                    else:
                        # No candidate species at all: default to human.
                        fout_pubtator.write(lines[i]+'\tFocus:9606'+'\n')
                fout_pubtator.write('\n')

            else:
                # No genes to assign: copy the document through unchanged.
                for i in range(2,len(lines)):
                    fout_pubtator.write(lines[i]+'\n')
                fout_pubtator.write('\n')
|
|
|
|
|
|
|
|
def speciesAss(infolder,outpath, modelfile):
    """Species-assignment driver: load the SA model and dictionaries, then
    process every file in *infolder* (BioC or PubTator, sniffed per file).

    Files whose output already exists are skipped so an interrupted run can
    be resumed.

    Args:
        infolder:  folder of gene-NER output files.
        outpath:   folder for species-assigned output (same file names).
        modelfile: trained SA model weights; a name containing 'bioformer'
                   selects the bioformer encoder, anything else PubMedBERT.
    """

    if modelfile.lower().find('bioformer')>=0:
        model_type='bioformer'
    else:
        model_type='pubmedbert'

    print('loading SA models........')
    if model_type=='bioformer':
        vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/bioformer-cased-v1.0/',
                    'lowercase':False,
                    }
    else:
        vocabfiles={'labelfile':'./vocab/SpeAss_IO_label.vocab',
                    'checkpoint_path':'./gnorm_trained_models/BiomedNLP-PubMedBERT-base-uncased-abstract/',
                    'lowercase':True,
                    }

    nn_model=model_sa.HUGFACE_NER(vocabfiles)
    nn_model.build_encoder()
    nn_model.build_softmax_decoder()
    nn_model.load_model(modelfile)

    dict_filename={'prefix':'./Dictionary/SPPrefix.txt',
                   'virus':'./Dictionary/SP_Virus2HumanList.txt'}
    # Virus taxonomy ids that imply a human (9606) host.
    with open(dict_filename['virus'],'r',encoding='utf-8') as fin:
        virus_set=set(fin.read().strip().split('\n'))

    # species id -> list of gene-symbol prefixes (TAB file: id<TAB>p1|p2|...).
    prefix_dict={}
    with open(dict_filename['prefix'],'r',encoding='utf-8') as fin:
        for line in fin:
            seg= line.strip().split('\t')
            if seg[0] not in prefix_dict.keys():
                prefix_dict[seg[0]]=seg[1].split('|')
            else:
                prefix_dict[seg[0]].extend(seg[1].split('|'))

    # Compile the format-sniffing patterns once (the old code recompiled them
    # for every line of every file).
    pattern_bioc = re.compile(r'.*<collection>.*')
    pattern_pubtator = re.compile(r'^([^\|]+)\|[^\|]+\|(.*)')

    print("begin species assignment........")
    start_time=time.time()

    for infile in os.listdir(infolder):
        if os.path.isfile(outpath+"/"+infile):
            print(infile+' has existed.')
        else:
            print('Processing:',infile)
            file_format=""
            # 'with' guarantees the handle is closed even if sniffing fails.
            with open(infolder+"/"+infile, 'r',encoding='utf-8') as fin:
                for line in fin:
                    if pattern_bioc.search(line):
                        file_format="BioC"
                        break
                    elif pattern_pubtator.search(line):
                        file_format="PubTator"
                        break
            if(file_format == "PubTator"):
                SA_PubTator(infolder,infile,outpath,nn_model,virus_set,prefix_dict)
            elif(file_format == "BioC"):
                SA_BioC(infolder,infile,outpath,nn_model,virus_set,prefix_dict)

    print('species assignment done:',time.time()-start_time)
|
|
|
|
if __name__=='__main__':

    parser = argparse.ArgumentParser(description='run GeneNER and species assignment, python GeneNER_SpeAss_run.py -i input -n NERmodel -s SAmodel -r neroutput -a saoutput')
    parser.add_argument('--infolder', '-i', help="input folder",default='./example/input/')
    parser.add_argument('--NERmodel', '-n', help="trained deep learning NER model file",default='')
    parser.add_argument('--SAmodel', '-s', help="trained deep learning species assignment model file",default='')
    parser.add_argument('--NERoutpath', '-r', help="output folder to save the NER tagged results",default='./example/ner_output/')
    parser.add_argument('--SAoutpath', '-a', help="output folder to save the SA tagged results",default='./example/sa_output/')
    parser.add_argument('--NUM_THREADS', '-t', help="Number of threads",default='3')
    args = parser.parse_args()

    # Fall back to 3 threads on non-numeric input.
    if not args.NUM_THREADS.isdigit():
        args.NUM_THREADS='3'

    tf.config.threading.set_inter_op_parallelism_threads(int(args.NUM_THREADS))
    tf.config.threading.set_intra_op_parallelism_threads(int(args.NUM_THREADS))

    def _norm_folder(path):
        """Ensure *path* ends with '/' and exists on disk; return it."""
        if path[-1]!='/':
            path+='/'
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    # Which stages run depends on which model files were supplied.
    if args.NERmodel!='' and args.SAmodel!='':

        print('==============\n| GeneNER and SpeAss |\n==============')

        args.infolder=_norm_folder(args.infolder)
        args.NERoutpath=_norm_folder(args.NERoutpath)
        args.SAoutpath=_norm_folder(args.SAoutpath)

        geneNER(args.infolder,args.NERoutpath, args.NERmodel)
        speciesAss(args.NERoutpath,args.SAoutpath, args.SAmodel)

    elif args.NERmodel!='' and args.SAmodel=='':
        args.infolder=_norm_folder(args.infolder)
        args.NERoutpath=_norm_folder(args.NERoutpath)

        print('==============\n| GeneNER |\n==============')
        geneNER(args.infolder,args.NERoutpath,args.NERmodel)

    elif args.NERmodel=='' and args.SAmodel!='':
        args.SAoutpath=_norm_folder(args.SAoutpath)

        print('==============\n| SpeAss |\n==============')
        speciesAss(args.infolder,args.SAoutpath,args.SAmodel)
    else:
        print('Please provide models!')
|