Yijun-Yang committed on
Commit
92bcd1d
1 Parent(s): 78f8e89

updategradiofrontend

Browse files
app.py CHANGED
@@ -167,21 +167,28 @@ def update_repo_info():
167
  if os.path.exists(repodir):
168
  pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
169
  number_of_pdf = len(pdffiles)
 
170
  if os.path.exists(os.path.join(repodir,'info.json')):
171
 
172
  with open(os.path.join(repodir,'info.json'), 'r') as f:
173
  repo_info = json.load(f)
174
 
175
  keywords = repo_info['keywords']
176
- length = repo_info['len']
177
  retmax = repo_info['retmax']
178
- failed = repo_info['failed_pmids']
 
 
179
 
180
- return keywords,length,retmax,failed,number_of_pdf
 
 
 
 
 
181
  else:
182
- return None,None,None,None,number_of_pdf
183
  else:
184
- return None,None,None,None,None
185
 
186
  def upload_file(files):
187
  repodir, workdir, _ = get_ready('repo_work')
@@ -196,12 +203,11 @@ def upload_file(files):
196
 
197
  return files
198
 
199
- def generate_articles_repo(strings:str,retmax:int):
200
 
201
- string = [k.strip() for k in strings.split('\n')]
202
-
203
- pmids = [k for k in string if k.isdigit()]
204
- keys = [k for k in string if not k.isdigit()]
205
 
206
  repodir, _, _ = get_ready('repo_work')
207
 
@@ -225,15 +231,26 @@ def delete_articles_repo():
225
  visible = True)
226
 
227
  def update_repo():
228
- keys,len,retmax,failed,pdflen = update_repo_info()
229
- if keys or len:
230
- newinfo = f"搜索得到文献:\n 关键词:{keys}\n 文献数量:{len}\n 获取上限:{retmax}\n 失败PMID:{failed}\n\n上传文献:\n 数量:{pdflen}"
231
- else:
 
232
  if pdflen:
233
- newinfo = f'搜索得到文献:无\n上传文献:\n 数量:{pdflen}'
234
  else:
235
- newinfo = '目前还没有文献库'
236
-
 
 
 
 
 
 
 
 
 
 
237
  return gr.Textbox(label="文献库概况",lines =1,
238
  value = newinfo,
239
  visible = True)
@@ -464,11 +481,12 @@ def main_interface():
464
  gr.Markdown("""
465
  #### 查找文献 📚
466
 
467
- 1. **输入关键词批量PubMed PMC文献**
468
  - 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
469
- - 设置查找数量(0-1000)。
470
- - 点击“搜索PubMed PMC”按钮进行文献查找。
471
-
 
472
  2. **上传PDF**
473
  - 通过“上传PDF”按钮上传您已有的PDF文献文件。
474
 
@@ -492,36 +510,43 @@ def main_interface():
492
  """)
493
  with gr.Row(equal_height=True):
494
  with gr.Column(scale=1):
495
- input_keys = gr.Textbox(label="感兴趣的关键词",
496
- value = "输入关键词或者PMID, 换行分隔",
 
497
  lines = 5)
498
- retmax = gr.Slider(
499
- minimum=0,
500
- maximum=1000,
501
- value=500,
502
- interactive=True,
503
- label="查多少",
504
- )
505
- generate_repo_button = gr.Button("搜索PubMed PMC")
506
- with gr.Column(scale=2):
 
 
 
 
 
 
 
507
  file_output = gr.File(scale=2)
508
  upload_button = gr.UploadButton("上传PDF",
509
- file_types=[".pdf",".csv",".doc"],
510
- file_count="multiple",scale=0)
511
 
512
  with gr.Row(equal_height=True):
513
  with gr.Column(scale=0):
514
  delete_repo_button = gr.Button("删除文献库")
515
  update_repo_button = gr.Button("更新文献库情况")
516
  with gr.Column(scale=2):
517
-
518
- repo_summary =gr.Textbox(label= '文献库概况', value="目前还没有文献库")
519
 
520
  generate_repo_button.click(generate_articles_repo,
521
- inputs=[input_keys,retmax],
522
  outputs = [repo_summary])
523
 
524
-
525
  delete_repo_button.click(delete_articles_repo, inputs=None,
526
  outputs = repo_summary)
527
  update_repo_button.click(update_repo, inputs=None,
@@ -535,7 +560,6 @@ def main_interface():
535
  minimum=128, maximum=4096,value=1024,step=1,
536
  interactive=True)
537
  ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
538
- # default=["20", "50", '100'],
539
  label="Number of Clusters",
540
  info="How many Clusters you want to generate")
541
 
 
167
  if os.path.exists(repodir):
168
  pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
169
  number_of_pdf = len(pdffiles)
170
+ # 判断info.json是否存在
171
  if os.path.exists(os.path.join(repodir,'info.json')):
172
 
173
  with open(os.path.join(repodir,'info.json'), 'r') as f:
174
  repo_info = json.load(f)
175
 
176
  keywords = repo_info['keywords']
 
177
  retmax = repo_info['retmax']
178
+ search_len = len(repo_info['search_pmids'])
179
+ import_len = len(repo_info['import_pmids'])
180
+ failed_pmid_len = len(repo_info['failed_pmids'])
181
 
182
+ pmc_success = repo_info['pmc_success_d']
183
+ scihub_success = repo_info['scihub_success_d']
184
+ failed_download = repo_info['failed_download']
185
+
186
+ number_of_upload = number_of_pdf-scihub_success
187
+ return keywords, retmax, search_len, import_len, failed_pmid_len, pmc_success, scihub_success, number_of_pdf, failed_download, number_of_upload
188
  else:
189
+ return None,None,None,None,None,None,None,None,None,number_of_pdf
190
  else:
191
+ return None,None,None,None,None,None,None,None,None,None
192
 
193
  def upload_file(files):
194
  repodir, workdir, _ = get_ready('repo_work')
 
203
 
204
  return files
205
 
206
+ def generate_articles_repo(keys:str,pmids,retmax:int):
207
 
208
+ keys = [k.strip() for k in keys.split('\n')]
209
+ pmids = [k.strip() for k in pmids.split('\n')]
210
+ pmids = [k for k in pmids if k.isdigit()]
 
211
 
212
  repodir, _, _ = get_ready('repo_work')
213
 
 
231
  visible = True)
232
 
233
  def update_repo():
234
+ keys, retmax, search_len, import_len, _, pmc_success, scihub_success, pdflen, failed, pdflen = update_repo_info()
235
+ newinfo = ""
236
+ if keys == None:
237
+ newinfo += '无关键词搜索相关信息\n'
238
+ newinfo += '无导入的PMID\n'
239
  if pdflen:
240
+ newinfo += f'上传的PDF数量: {pdflen}\n'
241
  else:
242
+ newinfo += '无上传的PDF\n'
243
+ else:
244
+ newinfo += f'关键词搜索:'
245
+ newinfo += f' 关键词: {keys}\n'
246
+ newinfo += f' 搜索上限: {retmax}\n'
247
+ newinfo += f' 搜索到的PMID数量: {search_len}\n'
248
+ newinfo += f'导入的PMID数量: {import_len}\n'
249
+ newinfo += f'成功获取PMC全文数量: {pmc_success}\n'
250
+ newinfo += f'成功获取SciHub全文数量: {scihub_success}\n'
251
+ newinfo += f"下载失败的ID: {failed}\n"
252
+ newinfo += f'上传的PDF数量: {pdflen}\n'
253
+
254
  return gr.Textbox(label="文献库概况",lines =1,
255
  value = newinfo,
256
  visible = True)
 
481
  gr.Markdown("""
482
  #### 查找文献 📚
483
 
484
+ 1. **输入关键词或PMID批量PubMed PMC文献**
485
  - 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
486
+ - 设置查找数量(0-500)。
487
+ - 在“输入PMID”框中输入在PubMed中导出的PMID,每行一个。
488
+ - 点击“搜索PubMed 并拉取全文”按钮进行文献查找。目前主要基于PMC数据库和scihub, 在PMC中未收录的文献将使用scihub下载,scihub近年文献未收录
489
+
490
  2. **上传PDF**
491
  - 通过“上传PDF”按钮上传您已有的PDF文献文件。
492
 
 
510
  """)
511
  with gr.Row(equal_height=True):
512
  with gr.Column(scale=1):
513
+ with gr.Row():
514
+ with gr.Column(scale=1):
515
+ input_keys = gr.Textbox(label="感兴趣的关键词, 换行分隔, 不太好用别用等我改改",
516
  lines = 5)
517
+ retmax = gr.Slider(
518
+ minimum=0,
519
+ maximum=500,
520
+ value=250,
521
+ interactive=True,
522
+ label="搜索上限",
523
+ info="How many articles you want to retrieve?"
524
+ )
525
+
526
+ with gr.Column(scale=1):
527
+ input_pmids = gr.Textbox(label="输入PMID, 换行分隔",
528
+ lines = 5)
529
+
530
+ generate_repo_button = gr.Button("搜索PubMed并拉取全文")
531
+
532
+ with gr.Column(scale=1):
533
  file_output = gr.File(scale=2)
534
  upload_button = gr.UploadButton("上传PDF",
535
+ file_types=[".pdf"],
536
+ file_count="multiple",scale=1)
537
 
538
  with gr.Row(equal_height=True):
539
  with gr.Column(scale=0):
540
  delete_repo_button = gr.Button("删除文献库")
541
  update_repo_button = gr.Button("更新文献库情况")
542
  with gr.Column(scale=2):
543
+ repo_summary =gr.Textbox(label= '文献库概况',
544
+ value="目前还没有文献库")
545
 
546
  generate_repo_button.click(generate_articles_repo,
547
+ inputs=[input_keys,input_pmids,retmax],
548
  outputs = [repo_summary])
549
 
 
550
  delete_repo_button.click(delete_articles_repo, inputs=None,
551
  outputs = repo_summary)
552
  update_repo_button.click(update_repo, inputs=None,
 
560
  minimum=128, maximum=4096,value=1024,step=1,
561
  interactive=True)
562
  ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
 
563
  label="Number of Clusters",
564
  info="How many Clusters you want to generate")
565
 
applocal.py CHANGED
@@ -167,21 +167,28 @@ def update_repo_info():
167
  if os.path.exists(repodir):
168
  pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
169
  number_of_pdf = len(pdffiles)
 
170
  if os.path.exists(os.path.join(repodir,'info.json')):
171
 
172
  with open(os.path.join(repodir,'info.json'), 'r') as f:
173
  repo_info = json.load(f)
174
 
175
  keywords = repo_info['keywords']
176
- length = repo_info['len']
177
  retmax = repo_info['retmax']
178
- failed = repo_info['failed_pmids']
 
 
179
 
180
- return keywords,length,retmax,failed,number_of_pdf
 
 
 
 
 
181
  else:
182
- return None,None,None,None,number_of_pdf
183
  else:
184
- return None,None,None,None,None
185
 
186
  def upload_file(files):
187
  repodir, workdir, _ = get_ready('repo_work')
@@ -196,12 +203,11 @@ def upload_file(files):
196
 
197
  return files
198
 
199
- def generate_articles_repo(strings:str,retmax:int):
200
 
201
- string = [k.strip() for k in strings.split('\n')]
202
-
203
- pmids = [k for k in string if k.isdigit()]
204
- keys = [k for k in string if not k.isdigit()]
205
 
206
  repodir, _, _ = get_ready('repo_work')
207
 
@@ -225,15 +231,26 @@ def delete_articles_repo():
225
  visible = True)
226
 
227
  def update_repo():
228
- keys,len,retmax,failed,pdflen = update_repo_info()
229
- if keys or len:
230
- newinfo = f"搜索得到文献:\n 关键词:{keys}\n 文献数量:{len}\n 获取上限:{retmax}\n 失败PMID:{failed}\n\n上传文献:\n 数量:{pdflen}"
231
- else:
 
232
  if pdflen:
233
- newinfo = f'搜索得到文献:无\n上传文献:\n 数量:{pdflen}'
234
  else:
235
- newinfo = '目前还没有文献库'
236
-
 
 
 
 
 
 
 
 
 
 
237
  return gr.Textbox(label="文献库概况",lines =1,
238
  value = newinfo,
239
  visible = True)
@@ -464,11 +481,12 @@ def main_interface():
464
  gr.Markdown("""
465
  #### 查找文献 📚
466
 
467
- 1. **输入关键词批量PubMed PMC文献**
468
  - 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
469
- - 设置查找数量(0-1000)。
470
- - 点击“搜索PubMed PMC”按钮进行文献查找。
471
-
 
472
  2. **上传PDF**
473
  - 通过“上传PDF”按钮上传您已有的PDF文献文件。
474
 
@@ -492,36 +510,43 @@ def main_interface():
492
  """)
493
  with gr.Row(equal_height=True):
494
  with gr.Column(scale=1):
495
- input_keys = gr.Textbox(label="感兴趣的关键词",
496
- value = "输入关键词或者PMID, 换行分隔",
 
497
  lines = 5)
498
- retmax = gr.Slider(
499
- minimum=0,
500
- maximum=1000,
501
- value=500,
502
- interactive=True,
503
- label="查多少",
504
- )
505
- generate_repo_button = gr.Button("搜索PubMed PMC")
506
- with gr.Column(scale=2):
 
 
 
 
 
 
 
507
  file_output = gr.File(scale=2)
508
  upload_button = gr.UploadButton("上传PDF",
509
- file_types=[".pdf",".csv",".doc"],
510
- file_count="multiple",scale=0)
511
 
512
  with gr.Row(equal_height=True):
513
  with gr.Column(scale=0):
514
  delete_repo_button = gr.Button("删除文献库")
515
  update_repo_button = gr.Button("更新文献库情况")
516
  with gr.Column(scale=2):
517
-
518
- repo_summary =gr.Textbox(label= '文献库概况', value="目前还没有文献库")
519
 
520
  generate_repo_button.click(generate_articles_repo,
521
- inputs=[input_keys,retmax],
522
  outputs = [repo_summary])
523
 
524
-
525
  delete_repo_button.click(delete_articles_repo, inputs=None,
526
  outputs = repo_summary)
527
  update_repo_button.click(update_repo, inputs=None,
@@ -535,7 +560,6 @@ def main_interface():
535
  minimum=128, maximum=4096,value=1024,step=1,
536
  interactive=True)
537
  ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
538
- # default=["20", "50", '100'],
539
  label="Number of Clusters",
540
  info="How many Clusters you want to generate")
541
 
 
167
  if os.path.exists(repodir):
168
  pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
169
  number_of_pdf = len(pdffiles)
170
+ # 判断info.json是否存在
171
  if os.path.exists(os.path.join(repodir,'info.json')):
172
 
173
  with open(os.path.join(repodir,'info.json'), 'r') as f:
174
  repo_info = json.load(f)
175
 
176
  keywords = repo_info['keywords']
 
177
  retmax = repo_info['retmax']
178
+ search_len = len(repo_info['search_pmids'])
179
+ import_len = len(repo_info['import_pmids'])
180
+ failed_pmid_len = len(repo_info['failed_pmids'])
181
 
182
+ pmc_success = repo_info['pmc_success_d']
183
+ scihub_success = repo_info['scihub_success_d']
184
+ failed_download = repo_info['failed_download']
185
+
186
+ number_of_upload = number_of_pdf-scihub_success
187
+ return keywords, retmax, search_len, import_len, failed_pmid_len, pmc_success, scihub_success, number_of_pdf, failed_download, number_of_upload
188
  else:
189
+ return None,None,None,None,None,None,None,None,None,number_of_pdf
190
  else:
191
+ return None,None,None,None,None,None,None,None,None,None
192
 
193
  def upload_file(files):
194
  repodir, workdir, _ = get_ready('repo_work')
 
203
 
204
  return files
205
 
206
+ def generate_articles_repo(keys:str,pmids,retmax:int):
207
 
208
+ keys = [k.strip() for k in keys.split('\n')]
209
+ pmids = [k.strip() for k in pmids.split('\n')]
210
+ pmids = [k for k in pmids if k.isdigit()]
 
211
 
212
  repodir, _, _ = get_ready('repo_work')
213
 
 
231
  visible = True)
232
 
233
  def update_repo():
234
+ keys, retmax, search_len, import_len, _, pmc_success, scihub_success, pdflen, failed, pdflen = update_repo_info()
235
+ newinfo = ""
236
+ if keys == None:
237
+ newinfo += '无关键词搜索相关信息\n'
238
+ newinfo += '无导入的PMID\n'
239
  if pdflen:
240
+ newinfo += f'上传的PDF数量: {pdflen}\n'
241
  else:
242
+ newinfo += '无上传的PDF\n'
243
+ else:
244
+ newinfo += f'关键词搜索:'
245
+ newinfo += f' 关键词: {keys}\n'
246
+ newinfo += f' 搜索上限: {retmax}\n'
247
+ newinfo += f' 搜索到的PMID数量: {search_len}\n'
248
+ newinfo += f'导入的PMID数量: {import_len}\n'
249
+ newinfo += f'成功获取PMC全文数量: {pmc_success}\n'
250
+ newinfo += f'成功获取SciHub全文数量: {scihub_success}\n'
251
+ newinfo += f"下载失败的ID: {failed}\n"
252
+ newinfo += f'上传的PDF数量: {pdflen}\n'
253
+
254
  return gr.Textbox(label="文献库概况",lines =1,
255
  value = newinfo,
256
  visible = True)
 
481
  gr.Markdown("""
482
  #### 查找文献 📚
483
 
484
+ 1. **输入关键词或PMID批量PubMed PMC文献**
485
  - 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
486
+ - 设置查找数量(0-500)。
487
+ - 在“输入PMID”框中输入在PubMed中导出的PMID,每行一个。
488
+ - 点击“搜索PubMed 并拉取全文”按钮进行文献查找。目前主要基于PMC数据库和scihub, 在PMC中未收录的文献将使用scihub下载,scihub近年文献未收录
489
+
490
  2. **上传PDF**
491
  - 通过“上传PDF”按钮上传您已有的PDF文献文件。
492
 
 
510
  """)
511
  with gr.Row(equal_height=True):
512
  with gr.Column(scale=1):
513
+ with gr.Row():
514
+ with gr.Column(scale=1):
515
+ input_keys = gr.Textbox(label="感兴趣的关键词, 换行分隔, 不太好用别用等我改改",
516
  lines = 5)
517
+ retmax = gr.Slider(
518
+ minimum=0,
519
+ maximum=500,
520
+ value=250,
521
+ interactive=True,
522
+ label="搜索上限",
523
+ info="How many articles you want to retrieve?"
524
+ )
525
+
526
+ with gr.Column(scale=1):
527
+ input_pmids = gr.Textbox(label="输入PMID, 换行分隔",
528
+ lines = 5)
529
+
530
+ generate_repo_button = gr.Button("搜索PubMed并拉取全文")
531
+
532
+ with gr.Column(scale=1):
533
  file_output = gr.File(scale=2)
534
  upload_button = gr.UploadButton("上传PDF",
535
+ file_types=[".pdf"],
536
+ file_count="multiple",scale=1)
537
 
538
  with gr.Row(equal_height=True):
539
  with gr.Column(scale=0):
540
  delete_repo_button = gr.Button("删除文献库")
541
  update_repo_button = gr.Button("更新文献库情况")
542
  with gr.Column(scale=2):
543
+ repo_summary =gr.Textbox(label= '文献库概况',
544
+ value="目前还没有文献库")
545
 
546
  generate_repo_button.click(generate_articles_repo,
547
+ inputs=[input_keys,input_pmids,retmax],
548
  outputs = [repo_summary])
549
 
 
550
  delete_repo_button.click(delete_articles_repo, inputs=None,
551
  outputs = repo_summary)
552
  update_repo_button.click(update_repo, inputs=None,
 
560
  minimum=128, maximum=4096,value=1024,step=1,
561
  interactive=True)
562
  ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
 
563
  label="Number of Clusters",
564
  info="How many Clusters you want to generate")
565
 
config.ini CHANGED
@@ -4,7 +4,7 @@ embedding_model_path = "/root/models/bce-embedding-base_v1"
4
  reranker_model_path = "/root/models/bce-reranker-base_v1"
5
  repo_dir = "repodir"
6
  work_dir = "workdir"
7
- n_clusters = [20, 50]
8
  chunk_size = 1024
9
 
10
  [web_search]
@@ -13,7 +13,7 @@ domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.c
13
  save_dir = "logs/web_search_result"
14
 
15
  [llm]
16
- enable_local = 0
17
  enable_remote = 1
18
  client_url = "http://127.0.0.1:8888/inference"
19
 
@@ -21,11 +21,11 @@ client_url = "http://127.0.0.1:8888/inference"
21
  local_llm_path = "/root/models/Qwen1.5-7B-Chat"
22
  local_llm_max_text_length = 32000
23
  local_llm_bind_port = 8888
24
- remote_type = "deepseek"
25
- remote_api_key = "sk-f36f5336010841399abccdfeb6bd1f54"
26
  remote_base_url = ""
27
  remote_llm_max_text_length = 32000
28
- remote_llm_model = "deepseek-chat"
29
  rpm = 500
30
 
31
  [worker]
 
4
  reranker_model_path = "/root/models/bce-reranker-base_v1"
5
  repo_dir = "repodir"
6
  work_dir = "workdir"
7
+ n_clusters = [10, 20]
8
  chunk_size = 1024
9
 
10
  [web_search]
 
13
  save_dir = "logs/web_search_result"
14
 
15
  [llm]
16
+ enable_local = 1
17
  enable_remote = 1
18
  client_url = "http://127.0.0.1:8888/inference"
19
 
 
21
  local_llm_path = "/root/models/Qwen1.5-7B-Chat"
22
  local_llm_max_text_length = 32000
23
  local_llm_bind_port = 8888
24
+ remote_type = ""
25
+ remote_api_key = ""
26
  remote_base_url = ""
27
  remote_llm_max_text_length = 32000
28
+ remote_llm_model = ""
29
  rpm = 500
30
 
31
  [worker]
huixiangdou/service/findarticles.py CHANGED
@@ -7,19 +7,56 @@ import json
7
  import shutil
8
  from loguru import logger
9
  from lxml import etree
10
- import sys
11
- from scihub_cn.scihub import main
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- def scihub_download(doi_file_Path = None,doi = None,output_dir = None):
14
- args = ["scihub-cn"] # This is the program name as expected in argv[0]
15
- if doi is not None:
16
- args.extend(["-d", doi])
17
- if doi_file_Path is not None:
18
- args.extend(["-i", doi_file_Path, "--doi"])
19
- if output_dir is not None:
20
- args.extend(["-o", output_dir])
21
- sys.argv = args # Set sys.argv to our simulated command line arguments
22
- sys.exit(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  class ArticleRetrieval:
@@ -83,11 +120,15 @@ class ArticleRetrieval:
83
  response = requests.get(base_url, params=params)
84
  root = ET.fromstring(response.content)
85
  idlist = root.find('.//IdList')
86
- pmids = [id_element.text for id_element in idlist.findall('.//Id')]
 
 
 
 
87
  print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
 
88
  self.pmids.extend(pmids)
89
 
90
-
91
  # 解析XML文件
92
  def _get_all_text(self, element):
93
  """递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
@@ -115,8 +156,16 @@ class ArticleRetrieval:
115
  if not os.path.exists(self.repo_dir):
116
  os.makedirs(self.repo_dir)
117
  print(f"Saving articles to {self.repo_dir}.")
118
- self.success = 0
 
 
 
119
  for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
 
 
 
 
 
120
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
121
  params = {
122
  "db": "pmc",
@@ -127,20 +176,33 @@ class ArticleRetrieval:
127
  response = requests.get(base_url, params=params)
128
  full_text = self._clean_xml(response.text)
129
  if full_text.strip() == '':
 
130
  continue
131
  else:
132
- logger.info(full_text[:500])
133
  with open(os.path.join(self.repo_dir,f'{id}.txt'), 'w') as f:
134
  f.write(full_text)
135
- self.success += 1
136
  for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
137
- scihub_download(doi = doi,output_dir=self.repo_dir)
138
- self.success += 1
 
 
 
 
 
 
 
 
139
 
140
  def save_config(self):
141
  config = {
142
- 'keywords': self.keywords,
143
  'repo_dir': self.repo_dir,
 
 
 
 
 
144
  'result': [
145
  {
146
  'pmid': r[0],
@@ -148,9 +210,10 @@ class ArticleRetrieval:
148
  'doi': r[2]
149
  } for r in self.esummary
150
  ],
151
- 'len': self.success,
152
- 'retmax': self.retmax,
153
- 'failed_pmids': self.failed_pmids
 
154
  }
155
  with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
156
  json.dump(config, f, indent=4, ensure_ascii=False)
@@ -169,14 +232,10 @@ if __name__ == '__main__':
169
  shutil.rmtree('repodir')
170
 
171
  strings = """
172
- 36944324
173
- 38453907
174
- 38300432
175
- 38651453
176
- 38398096
177
- 38255885
178
- 38035547
179
- 38734498"""
180
  string = [k.strip() for k in strings.split('\n')]
181
 
182
  pmids = [k for k in string if k.isdigit()]
 
7
  import shutil
8
  from loguru import logger
9
  from lxml import etree
10
+ import requests
11
+ from bs4 import BeautifulSoup
12
+ import os
13
+
14
def download_pdfs(path, doi_list, timeout=30):  # fox dalao contribution https://github.com/BigWhiteFox
    """Download PDFs for one or more DOIs from Sci-Hub into ``path``.

    For each DOI the Sci-Hub landing page is fetched, the PDF link is read
    from the ``onclick`` attribute of the page's buttons, and every link found
    is streamed to ``<path>/<doi with '/' replaced by '_'>.pdf``.

    Args:
        path: Directory to save PDFs into; created if it does not exist.
        doi_list: A single DOI string or an iterable of DOI strings.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        True if at least one PDF was written to disk, otherwise False.
        (The function used to return nothing, so callers that test the
        result treated every download as a failure.)
    """
    # Make sure the download directory exists.
    os.makedirs(path, exist_ok=True)
    if isinstance(doi_list, str):
        doi_list = [doi_list]

    href_list = []
    for doi in doi_list:
        url = f"https://sci-hub.se/{doi}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # A network error on one DOI must not abort the whole batch.
            print(f"请求失败:{url},异常:{e}")
            continue

        # Skip this DOI if the landing page could not be fetched.
        if response.status_code != 200:
            print(f"请求失败:{url},状态码:{response.status_code}")
            continue
        print(f"成功请求:{url}")

        soup = BeautifulSoup(response.text, 'html.parser')
        for button in soup.find_all('button', onclick=True):
            onclick = button.get('onclick')
            if not onclick:
                continue
            # The link is the first single-quoted token of the handler,
            # e.g. location.href='//host/file.pdf'.
            pieces = onclick.split("'")
            if len(pieces) < 2:
                continue  # handler carries no quoted URL
            pdf_url = pieces[1]
            href_list.append((pdf_url, doi))
            print("pdf_url:", pdf_url)
    print("href_list:", href_list)

    saved = 0
    # Fetch every collected link.
    for href, doi in href_list:
        if href.startswith(("http://", "https://")):
            pdf_url = href
        elif href.startswith("/") and not href.startswith("//"):
            # Root-relative link: resolve against the Sci-Hub host.
            pdf_url = f"https://sci-hub.se{href}"
        else:
            # Protocol-relative link ("//host/..."), the form handled originally.
            pdf_url = f"https:{href}"
        try:
            with requests.get(pdf_url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"Download failed, Status Code: {response.status_code}, URL: {pdf_url}")
                    continue
                filename = doi.replace("/", "_") + ".pdf"
                file_path = os.path.join(path, filename)
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            saved += 1
            print(f"File downloaded and saved as: {file_path}")
        except requests.RequestException as e:
            print(f"Failed to download due to an exception: {e}")

    return saved > 0
60
 
61
 
62
  class ArticleRetrieval:
 
120
  response = requests.get(base_url, params=params)
121
  root = ET.fromstring(response.content)
122
  idlist = root.find('.//IdList')
123
+ try:
124
+ pmids = [id_element.text for id_element in idlist.findall('.//Id')]
125
+ except:
126
+ pmids = []
127
+
128
  print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
129
+ self.search_pmid = pmids
130
  self.pmids.extend(pmids)
131
 
 
132
  # 解析XML文件
133
  def _get_all_text(self, element):
134
  """递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
 
156
  if not os.path.exists(self.repo_dir):
157
  os.makedirs(self.repo_dir)
158
  print(f"Saving articles to {self.repo_dir}.")
159
+ self.pmc_success = 0
160
+ self.scihub_success = 0
161
+ self.failed_download = []
162
+ downloaded = os.listdir(self.repo_dir)
163
  for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
164
+ # check if file already downloaded
165
+ if f"{id}.txt" in downloaded:
166
+ print(f"File already downloaded: {id}")
167
+ self.pmc_success += 1
168
+ continue
169
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
170
  params = {
171
  "db": "pmc",
 
176
  response = requests.get(base_url, params=params)
177
  full_text = self._clean_xml(response.text)
178
  if full_text.strip() == '':
179
+ self.failed_download.append(id)
180
  continue
181
  else:
182
+ logger.info(full_text[:200])
183
  with open(os.path.join(self.repo_dir,f'{id}.txt'), 'w') as f:
184
  f.write(full_text)
185
+ self.pmc_success += 1
186
  for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
187
+ # check if file already downloaded
188
+ if f"{doi.replace('/','_')}.pdf" in downloaded:
189
+ print(f"File already downloaded: {doi}")
190
+ self.scihub_success += 1
191
+ continue
192
+
193
+ if download_pdfs(path=self.repo_dir,doi_list = doi):
194
+ self.scihub_success += 1
195
+ else:
196
+ self.failed_download.append(doi)
197
 
198
  def save_config(self):
199
  config = {
 
200
  'repo_dir': self.repo_dir,
201
+ 'keywords': self.keywords,
202
+ 'retmax': self.retmax,
203
+ "search_pmids": self.search_pmid,
204
+ 'import_pmids': [id for id in self.pmids if id not in self.search_pmid],
205
+ 'failed_pmids': self.failed_pmids,
206
  'result': [
207
  {
208
  'pmid': r[0],
 
210
  'doi': r[2]
211
  } for r in self.esummary
212
  ],
213
+ "pmc_success_d": self.pmc_success,
214
+ "scihub_success_d": self.scihub_success,
215
+ "failed_download": self.failed_download,
216
+
217
  }
218
  with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
219
  json.dump(config, f, indent=4, ensure_ascii=False)
 
232
  shutil.rmtree('repodir')
233
 
234
  strings = """
235
+ 34536239
236
+ 7760895
237
+ 36109602
238
+ 24766875"""
 
 
 
 
239
  string = [k.strip() for k in strings.split('\n')]
240
 
241
  pmids = [k for k in string if k.isdigit()]
huixiangdou/service/worker.py CHANGED
@@ -25,6 +25,7 @@ def convertid2url(text):
25
  # Replace all occurrences in the text
26
  formatted_text = re.sub(pattern, replacement, text)
27
  return formatted_text
 
28
  class Worker:
29
  """The Worker class orchestrates the logic of handling user queries,
30
  generating responses and managing several aspects of a chat assistant. It
@@ -82,7 +83,7 @@ class Worker:
82
  self.PERPLESITY_TEMPLATE = '“question:{} answer:{}”\n阅读以上对话,answer 是否在表达自己不知道,回答越全面得分越少,用0~10表示,不要解释直接给出得分。\n判断标准:准确回答问题得 0 分;答案详尽得 1 分;知道部分答案但有不确定信息得 8 分;知道小部分答案但推荐求助其他人得 9 分;不知道任何答案直接推荐求助别人得 10 分。直接打分不要解释。' # noqa E501
83
  self.SUMMARIZE_TEMPLATE = '{} \n 仔细阅读以上内容,总结得简短有力点' # noqa E501
84
  self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n 请仔细阅读参考材料回答问题,材料可能和问题无关。如果材料和问题无关,尝试用你自己的理解来回答问题。如果无法确定答案,直接回答不知道。' # noqa E501
85
- self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n 请仔细阅读参考材料回答问题。' # noqa E501
86
  self.ANNOTATE_CLUSTER = '这是关于{}的不同论文的分块句子,它们通过相似性进行了聚类,以下是其中一个聚类的10个样本:“{}”\n 请用一句话标注这个聚类。' # noqa E501
87
  self.INSPIRATION_TEMPLATE = '以下是一些有关{0}的文章内容的总结 {1},请提出一个关于{0}的综述子问题,一个问题即可。'
88
  else:
@@ -93,7 +94,7 @@ class Worker:
93
  self.SECURITY_TEMAPLTE = 'Evaluate whether the following sentence involves prohibited content such as politics, insult, pornography, terror, religion, cyber violence, racial discrimination, etc., rate it on a scale of 0-10, do not explain, just give the score. The scoring standard is as follows: any violation directly gets 10 points; completely unrelated gets 0 points. Give the score, no explanation: "{}"' # noqa E501
94
  self.PERPLESITY_TEMPLATE = 'Question: {} Answer: {}\nRead the dialogue above, does the answer express that they don\'t know? The more comprehensive the answer, the lower the score. Rate it on a scale of 0-10, no explanation, just give the score.\nThe scoring standard is as follows: an accurate answer to the question gets 0 points; a detailed answer gets 1 point; knowing some answers but having uncertain information gets 8 points; knowing a small part of the answer but recommends seeking help from others gets 9 points; not knowing any of the answers and directly recommending asking others for help gets 10 points. Just give the score, no explanation.' # noqa E501
95
  self.SUMMARIZE_TEMPLATE = '"{}" \n Read the content above carefully, summarize it in a short and powerful way.' # noqa E501
96
- self.GENERATE_TEMPLATE = 'Background Information: "{}"\n Question: "{}"\n Please read the reference material carefully and answer the question. with reference id at the end of the corresponding content for example: Primary determinants of the therapeutic approach are age, comorbidities, and diagnostic molecular profile [PMC9958584]' # noqa E501
97
  self.ANNOTATE_CLUSTER = 'these are chunklized sentences from different papers about{}, they are clustered by similarity, the following is 10 samples from one of the cluster: "{}"\n Please tag the cluster in one breif sentence.'
98
  self.INSPIRATION_TEMPLATE = 'Given the following summary of the articles content about {0} {1}, give some idea or sub-questions of the review about {0}, one question is sufficient.' # noqa E501
99
 
 
25
  # Replace all occurrences in the text
26
  formatted_text = re.sub(pattern, replacement, text)
27
  return formatted_text
28
+
29
  class Worker:
30
  """The Worker class orchestrates the logic of handling user queries,
31
  generating responses and managing several aspects of a chat assistant. It
 
83
  self.PERPLESITY_TEMPLATE = '“question:{} answer:{}”\n阅读以上对话,answer 是否在表达自己不知道,回答越全面得分越少,用0~10表示,不要解释直接给出得分。\n判断标准:准确回答问题得 0 分;答案详尽得 1 分;知道部分答案但有不确定信息得 8 分;知道小部分答案但推荐求助其他人得 9 分;不知道任何答案直接推荐求助别人得 10 分。直接打分不要解释。' # noqa E501
84
  self.SUMMARIZE_TEMPLATE = '{} \n 仔细阅读以上内容,总结得简短有力点' # noqa E501
85
  self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n 请仔细阅读参考材料回答问题,材料可能和问题无关。如果材料和问题无关,尝试用你自己的理解来回答问题。如果无法确定答案,直接回答不知道。' # noqa E501
86
+ self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n 请仔细阅读参考材料回答问题,回答中附上对应内容的参考id,例如:治疗方法的主要决定因素是年龄、合并症和诊断分子特征[PMC9958586]' # yyj
87
  self.ANNOTATE_CLUSTER = '这是关于{}的不同论文的分块句子,它们通过相似性进行了聚类,以下是其中一个聚类的10个样本:“{}”\n 请用一句话标注这个聚类。' # noqa E501
88
  self.INSPIRATION_TEMPLATE = '以下是一些有关{0}的文章内容的总结 {1},请提出一个关于{0}的综述子问题,一个问题即可。'
89
  else:
 
94
  self.SECURITY_TEMAPLTE = 'Evaluate whether the following sentence involves prohibited content such as politics, insult, pornography, terror, religion, cyber violence, racial discrimination, etc., rate it on a scale of 0-10, do not explain, just give the score. The scoring standard is as follows: any violation directly gets 10 points; completely unrelated gets 0 points. Give the score, no explanation: "{}"' # noqa E501
95
  self.PERPLESITY_TEMPLATE = 'Question: {} Answer: {}\nRead the dialogue above, does the answer express that they don\'t know? The more comprehensive the answer, the lower the score. Rate it on a scale of 0-10, no explanation, just give the score.\nThe scoring standard is as follows: an accurate answer to the question gets 0 points; a detailed answer gets 1 point; knowing some answers but having uncertain information gets 8 points; knowing a small part of the answer but recommends seeking help from others gets 9 points; not knowing any of the answers and directly recommending asking others for help gets 10 points. Just give the score, no explanation.' # noqa E501
96
  self.SUMMARIZE_TEMPLATE = '"{}" \n Read the content above carefully, summarize it in a short and powerful way.' # noqa E501
97
+ self.GENERATE_TEMPLATE = 'Background Information: "{}"\n Question: "{}"\n Please read the reference material carefully and answer the question with reference id at the end of the corresponding content for example: Primary determinants of the therapeutic approach are age, comorbidities, and diagnostic molecular profile [PMC9958586]' # yyj
98
  self.ANNOTATE_CLUSTER = 'these are chunklized sentences from different papers about{}, they are clustered by similarity, the following is 10 samples from one of the cluster: "{}"\n Please tag the cluster in one breif sentence.'
99
  self.INSPIRATION_TEMPLATE = 'Given the following summary of the articles content about {0} {1}, give some idea or sub-questions of the review about {0}, one question is sufficient.' # noqa E501
100
 
requirements.txt CHANGED
@@ -47,13 +47,3 @@ pyclipper==1.3.0.post5
47
  xpinyin==0.7.6
48
  opencv-python==4.9.0.80
49
  beautifulsoup4==4.10.0
50
- requests==2.26.0
51
- retrying==1.3.3
52
- # PyYaml==5.4
53
- PyYaml
54
- bibtexparser==1.2.0
55
- aiohttp==3.8.3
56
- lxml==4.7.1
57
- pytest==7.1.3
58
- dataclasses
59
- scihub_cn
 
47
  xpinyin==0.7.6
48
  opencv-python==4.9.0.80
49
  beautifulsoup4==4.10.0