import streamlit as st
import pandas as pd
# import utils.pharmap_utils.layout as lt
from batutils import *
# import stanza
import requests
# import os.path
import io
# import PyPDF2
from pypdf import PdfReader  # modern pypdf API; the legacy PdfFileReader import no longer exists
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from bs4.element import Comment
# from utils.pharmap_utils.dtxutils import *
# from utils.pharmap_utils.dictutils import *
from stanzautils import *
# @st.cache(show_spinner=True)
def get_ner(contents):
    """Run the Stanza pipeline over raw text and return the unique DISEASE entities."""
    print('inside get ner')
    content_list = []
    st.write('Reading the page...')
    nlp = call_nlp_pipeline()
    doc = nlp(contents.strip())
    st.write('Getting disease names...')
    for ent in doc.entities:
        if ent.type == 'DISEASE':
            content_list.append(ent.text.replace('\n', ''))
    content_list = list(set(content_list))  # de-duplicate
    print('got the disease names', content_list)
    st.write('Got the disease names...')
    return content_list


def get_ta_mapped_url(content_list):
    """Map each extracted disease name to therapeutic areas via the MeSH lookup."""
    print('inside get_ta_mapped')
    st.write(content_list)
    st.write('Trying to get Mesh Name..')
    print('Trying to get Mesh Name..')
    ta_list = []
    for condition_text in content_list:
        ta_list.append(non_url_flow(condition_text))
    # flatten the per-condition lists and de-duplicate
    flat_list = [item for sublist in ta_list for item in sublist]
    ta = list(set(flat_list))
    print("Outside the loop", ta)
    return ta


def check_pdf_html(url):
    """Decide from the Content-Type header whether a URL serves a PDF or an HTML page."""
    r = requests.get(url)
    content_type = r.headers.get('content-type', '')  # default to '' so the checks below never hit None
    print(content_type)
    if 'application/pdf' in content_type:
        ext = 'pdf'
    elif 'text/html' in content_type:
        ext = 'html'
    else:
        ext = ''
        print('Unknown type: {}'.format(content_type))
    print(ext)
    return ext


# @st.cache
def get_disease_html(u):
    """Fetch a page, strip scripts/headers/footers, and run NER over the visible text."""
    print('inside get disease html')
    # u="https://www.exelixis.com/pipeline/"
    # "https://www.roche.com/dam/jcr:22160102-e04d-4484-ae3b-0f474105647e/en/diaq321.pdf"
    url = Request(u, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    for header in soup.findAll('header'):
        header.decompose()
    for footer in soup.findAll('footer'):
        footer.decompose()
    text = soup.get_text()
    # collapse whitespace: split on double spaces so multi-word phrases stay intact
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    result = get_ner(text)
    return result


# @st.cache(persist=True,show_spinner=True)
def get_disease_pdf(url):
    """Download a PDF, extract text page by page, and tabulate disease names per page."""
    st.write('get pdf disease')
    r = requests.get(url)
    f = io.BytesIO(r.content)
    reader = PdfReader(f)
    # pnum = len(reader.pages)  # to scan every page instead
    data = []
    pnum = 2  # only the first two pages are scanned
    for p in range(pnum):
        contents = reader.pages[p].extract_text()
        content_list = get_ner(contents)
        a_dictionary = {'pno': [p + 1],
                        'conditions': content_list}
        data.append(a_dictionary)
    f.close()
    df = pd.DataFrame(data)  # DataFrame.append was removed in pandas 2.0
    return df


def get_link_mapped(url):
    """Route a URL to the PDF or HTML flow depending on its content type."""
    # url = 'https://www.gene.com/medical-professionals/pipeline'
    try:
        get = check_pdf_html(url)
    except requests.RequestException:
        get = 'invalid URL'
    if get == 'pdf':
        pdf_mapped_df = get_disease_pdf(url)
        st.dataframe(pdf_mapped_df)
    elif get == 'html':
        content_list = get_disease_html(url)
        ta = get_ta_mapped_url(content_list)
        st.write(ta)
    elif get == 'invalid URL':
        print('invalid')
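

# A minimal sketch of how these helpers could be wired together in a Streamlit
# page. The widget labels and the __main__ guard are assumptions added for
# illustration; only get_link_mapped() above comes from the original flow.
if __name__ == '__main__':
    st.title('Pipeline disease mapper')
    input_url = st.text_input('Pipeline page or PDF URL')
    if st.button('Map diseases') and input_url:
        get_link_mapped(input_url)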