'''
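        # Build one Highwire search URL covering the selected *rxiv servers.
        # bioRxiv and medRxiv share the same search backend, so a single
        # request with a jcode filter ('%3A' encodes ':', '%7C%7C' encodes
        # '||') covers either server or both.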
        if 'biorxiv' in self.XRxiv_servers and 'medrxiv' not in self.XRxiv_servers:
            # Search bioRxiv only.
            max_biorxiv_papers = max_papers_in_server[2]
            journals_str = '%20jcode%3Abiorxiv'
        elif 'biorxiv' not in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
            # Search medRxiv only.
            max_biorxiv_papers = max_papers_in_server[3]
            journals_str = '%20jcode%3Amedrxiv'
        elif 'biorxiv' in self.XRxiv_servers and 'medrxiv' in self.XRxiv_servers:
            # Search both bioRxiv and medRxiv; their paper budgets are pooled.
            max_biorxiv_papers = max_papers_in_server[3] + max_papers_in_server[2]
            journals_str = '%20jcode%3Abiorxiv%7C%7Cmedrxiv'
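        # URL-encode the query: words within one search term are joined with
        # '%20' (space); separate terms are joined with '%252B', a
        # double-encoded '+' (i.e. '%2B' with its '%' itself percent-encoded).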
        subject_str = '%20'.join(self.search_query[0].split())
        for subject in self.search_query[1:]:
            subject_str = subject_str + '%252B' + '%20'.join(subject.split())
        current_dateTime = datetime.now()
        today = str(current_dateTime)[:10]
        start_day = '2013-01-01'
        biorxiv_url = 'https://www.biorxiv.org/search/'
        biorxiv_url += subject_str + journals_str + f'%20limit_from%3A{start_day}%20limit_to%3A{today}%20numresults%3A{max_biorxiv_papers}%20sort%3Arelevance-rank%20format_result%3Astandard'
        url_response = requests.post(biorxiv_url)
html = bs(url_response.text, features='html.parser')
pdf_entries = html.find_all(attrs={'class': 'search-result'})
pdf_titles = []
pdf_authors = []
pdf_urls = []
pdf_categories = []
folder_names = []
pdf_citation = []
pdf_years = []
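        # Scrape each result entry; the CSS class names below come from
        # bioRxiv's Highwire markup (e.g. 'highwire-cite-title' holds the
        # paper title, the 'pages' metadata span starts with the year).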
for i, pdf in enumerate(pdf_entries):
pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
            pdf_url = pdf.find('a', href=True)['href']
            if not pdf_url.startswith('http'):
                # Relative links need the site prefix.
                pdf_url = 'https://www.biorxiv.org' + pdf_url
            pdf_urls.append(pdf_url)
pdf_categories.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-journal highwire-cite-metadata'}).text.strip())
folder_names.append(self.folder_name)
pdf_years.append(pdf.find('span', attrs={'class': 'highwire-cite-metadata-pages highwire-cite-metadata'}).text.strip()[:4])
pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {pdf_categories[i]} ({pdf_years[i]}), (available at {pdf_urls[i]}).")
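        # Bundle per-paper metadata into tuples: (title, url, authors,
        # category, folder, citation). The year is already embedded in the
        # citation string, so it is not zipped separately.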
pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
self.all_pdf_info.append(pdf_info)
self.all_pdf_info = [item for sublist in self.all_pdf_info for item in sublist]
return self.all_pdf_info
def download_pdf(self):
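        # Download every paper found by the search, writing PDFs into the
        # target folder and echoing a numbered citation list to the
        # Streamlit UI as each file arrives.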
all_reference_text = []
        for i, p in enumerate(stqdm(self.all_pdf_info, desc='🔍 Searching and downloading papers')):
            pdf_title = p[0]
            pdf_url = p[1]
            pdf_category = p[3]
            folder_name = p[4]
            pdf_citation = p[5]
            if pdf_category in ['medRxiv', 'bioRxiv']:
                # bioRxiv and medRxiv serve the full-text PDF at <abstract URL>.full.pdf
                pdf_url += '.full.pdf'
            pdf_file_name = pdf_title.replace(':', '').replace('/', '').replace('.', '').replace('\n', '')
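            # Fetch the PDF; on the first iteration, recreate the download
            # folder so stale files from a previous run are cleared out.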
r = requests.get(pdf_url, allow_redirects=True)
            if i == 0:
                if not os.path.exists(folder_name):
                    os.makedirs(folder_name)
                else:
                    shutil.rmtree(folder_name)
                    os.makedirs(folder_name)
with open(f'{folder_name}/{pdf_file_name}.pdf', 'wb') as f:
f.write(r.content)
if i == 0:
st.markdown("###### Papers found:")
st.markdown(f"{i+1}. {pdf_citation}")
time.sleep(0.15)
all_reference_text.append(f"{i+1}. {pdf_citation}\n")
        # Persist the reference list so other Streamlit pages/reruns can read it.
        st.session_state['all_reference_text'] = ' '.join(all_reference_text)
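# Split a total paper budget across the selected preprint servers: every
# selected server is guaranteed at least one paper, and the remainder is
# divided uniformly at random via a multinomial draw. The returned array is
# indexed as [rxiv, chemrxiv, biorxiv, medrxiv].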
def distibute_max_papers(max_results, XRxiv_servers):
fixed_length = len(XRxiv_servers)
sample = np.random.multinomial(max_results - fixed_length, np.ones(fixed_length)/fixed_length, size=1)[0] + 1
max_papers_in_server = np.zeros(4, dtype=int)
all_servers = ['rxiv', 'chemrxiv', 'biorxiv', 'medrxiv']
    for i, s in enumerate(XRxiv_servers):
max_papers_in_server[all_servers.index(s)] = int(sample[i])
return max_papers_in_server
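# Example (illustrative; the draw is random): distibute_max_papers(10,
# ['biorxiv', 'medrxiv']) could return array([0, 0, 6, 4]); only the two
# selected servers receive a share, and the shares sum to 10.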