# tracer/pharmap_url.py
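# Streamlit helpers: given a pharma pipeline URL (an HTML page or a PDF), extract
# disease mentions with a stanza NER pipeline and map them to therapeutic areas.
# (Summary inferred from the functions below; batutils/stanzautils are assumed to
# provide non_url_flow and call_nlp_pipeline.)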
import streamlit as st
import pandas as pd
# import utils.pharmap_utils.layout as lt
from batutils import *
# import stanza
import requests
# import os.path
import io
# import PyPDF2
from pypdf import PdfReader  # modern pypdf API (PdfFileReader is the legacy name)
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
# from bs4.element import Comment  # unused
# from utils.pharmap_utils.dtxutils import *
# from utils.pharmap_utils.dictutils import *
from stanzautils import *
# @st.cache(show_spinner=True)
def get_ner(contents):
    """Run the stanza NER pipeline over raw text and return unique DISEASE mentions."""
    print('inside get ner')
    content_list = []
    st.write('Reading the page...')
    nlp = call_nlp_pipeline()
    doc = nlp(contents.strip())
    st.write('Getting disease names...')
    for ent in doc.entities:
        if ent.type == 'DISEASE':
            content_list.append(ent.text.replace('\n', ''))
    content_list = list(set(content_list))  # de-duplicate mentions
    print('got the disease names', content_list)
    st.write('Got the disease names...')
    return content_list
def get_ta_mapped_url(content_list):
    """Map each extracted condition to MeSH / therapeutic-area names and de-duplicate."""
    print('inside get_ta_mapped')
    st.write(content_list)
    st.write('Trying to get Mesh Name..')
    print('Trying to get Mesh Name..')
    ta_list = []
    for condition_text in content_list:
        ta = non_url_flow(condition_text)
        ta_list.append(ta)
    # flatten the per-condition lists, then de-duplicate
    flat_list = [item for sublist in ta_list for item in sublist]
    ta = list(set(flat_list))
    print("Outside the loop", ta)
    return ta
def check_pdf_html(url):
    """Return 'pdf' or 'html' based on the response Content-Type, '' if unknown."""
    r = requests.get(url)
    content_type = r.headers.get('content-type', '')  # default to '' so the checks never see None
    print(content_type)
    if 'application/pdf' in content_type:
        ext = 'pdf'
    elif 'text/html' in content_type:
        ext = 'html'
    else:
        ext = ''
        print('Unknown type: {}'.format(content_type))
    print(ext)
    return ext
# @st.cache
def get_disease_html(u):
    """Fetch a pipeline page, strip boilerplate markup, and run NER over the visible text."""
    print('inside get disease html')
    # example inputs:
    # u = "https://www.exelixis.com/pipeline/"
    # u = "https://www.roche.com/dam/jcr:22160102-e04d-4484-ae3b-0f474105647e/en/diaq321.pdf"
    url = Request(u, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    for header in soup.find_all('header'):
        header.decompose()
    for footer in soup.find_all('footer'):
        footer.decompose()
    text = soup.get_text()
    # collapse whitespace: keep one non-empty chunk per line
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    result = get_ner(text)
    return result
# @st.cache(persist=True,show_spinner=True)
def get_disease_pdf(url):
    """Download a PDF, run NER page by page, and return a page -> conditions DataFrame."""
    st.write('get pdf disease')
    r = requests.get(url)
    f = io.BytesIO(r.content)
    reader = PdfReader(f)
    data = []
    pnum = 2  # only the first two pages are scanned; use len(reader.pages) for the whole document
    for p in range(pnum):
        contents = reader.pages[p].extract_text()
        content_list = get_ner(contents)
        a_dictionary = {'pno': [p + 1],
                        'conditions': content_list
                        }
        data.append(a_dictionary)
    f.close()
    df = pd.DataFrame(data)  # DataFrame.append was removed in pandas 2.x
    return df
def get_link_mapped(url):
    """Dispatch a URL to the PDF flow or the HTML flow based on its content type."""
    # example input: url = 'https://www.gene.com/medical-professionals/pipeline'
    try:
        get = check_pdf_html(url)
    except Exception:  # network errors, bad URLs, missing headers
        get = 'invalid URL'
    if get == 'pdf':
        pdf_mapped_df = get_disease_pdf(url)
        st.dataframe(pdf_mapped_df)
    elif get == 'html':
        content_list = get_disease_html(url)
        ta = get_ta_mapped_url(content_list)
        st.write(ta)
    elif get == 'invalid URL':
        print('invalid')
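# Minimal usage sketch, not part of the original module: run the end-to-end flow on
# one URL. Assumes this file is executed inside a Streamlit app (`streamlit run ...`)
# so the st.* calls render; the URL below is only an illustrative placeholder.
if __name__ == '__main__':
    sample_url = 'https://www.example.com/pipeline'  # hypothetical input
    get_link_mapped(sample_url)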