import os
import time

import requests
from bs4 import BeautifulSoup
import gradio as gr
import pandas as pd


def get_papers_since(url, since_year=2023):
    """
    Given a Google Scholar citations page URL, return the list of papers
    published in `since_year` (default 2023) or later.
    Each paper is represented as a dict: {"title": ..., "year": ...}
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    papers = []
    start = 0
    pagesize = 20  # Google Scholar typically shows 20 entries per page
    while True:
        params = {"cstart": start, "pagesize": pagesize}
        response = requests.get(url, params=params, headers=headers)
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.text, "html.parser")
        rows = soup.find_all("tr", class_="gsc_a_tr")
        if not rows:
            break
        for row in rows:
            title_tag = row.find("a", class_="gsc_a_at")
            title = title_tag.text.strip() if title_tag else "Unknown Title"
            year_tag = row.find("span", class_="gsc_a_h")
            try:
                year = int(year_tag.text.strip()) if year_tag else None
            except ValueError:
                year = None
            if year and year >= since_year:
                papers.append({"title": title, "year": year})
        # The "Show more" button carries a `disabled` attribute on the last page.
        next_button = soup.find("button", id="gsc_bpf_more")
        if next_button and "disabled" not in next_button.attrs:
            start += pagesize
        else:
            break
    return papers


def search_paper_by_title(paper_title: str):
    """
    Search for a paper by title via the Semantic Scholar "match" endpoint and
    return the top-1 match.

    Returns:
        dict: with the keys 'paperId', 'title', 'abstract', and 'venue'.
        Returns None if no matching paper is found (HTTP 404 or empty result).
        Raises an exception on any other error.
    """
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search/match"
    params = {
        "query": paper_title,
        "fields": "paperId,abstract,title,venue"
    }
    # requests drops headers whose value is None, so an unset S2_API_KEY
    # simply means the request goes out unauthenticated.
    headers = {
        "x-api-key": os.getenv("S2_API_KEY")
    }
    response = requests.get(base_url, params=params, headers=headers)
    if response.status_code == 404:
        return None
    elif response.status_code != 200:
        raise Exception(f"Error {response.status_code}: {response.text}")
    data_list = response.json().get("data", [])
    if not data_list:
        return None
    paper_data = data_list[0]
    return {
        "paperId": paper_data.get("paperId"),
        "title": paper_data.get("title"),
        "abstract": paper_data.get("abstract"),
        "venue": paper_data.get("venue")
    }


def process_profiles(profiles_text, wechat, progress=None):
    """
    1. Split the user-supplied Google Scholar profile links (one per line)
       into a list.
    2. Scrape every link for papers published in 2023 or later.
    3. Look up each paper on Semantic Scholar; discard papers with no match.
    4. Deduplicate by paperId.
    5. Yield progress messages and, finally, the path of a CSV file for the
       user to download.
    """
    log_messages = ["Starting..."]

    def update_log(message):
        log_messages.append(message)
        return "\n".join(log_messages)

    # This function is a generator (Gradio streams each yielded update), so a
    # bare `return value` would be swallowed; early exits must yield the error
    # message first and then return.
    if not profiles_text.strip():
        yield update_log("Error: no links provided"), None
        return
    if not wechat.strip():
        yield update_log("Error: no WeChat ID provided"), None
        return

    # Split the user input on newlines.
    profile_urls = [line.strip() for line in profiles_text.splitlines() if line.strip()]
    message = f"Found {len(profile_urls)} profile links, starting..."
    yield update_log(message), None

    all_papers = []
    for i, url in enumerate(profile_urls):
        message = f"Processing profile {i+1}/{len(profile_urls)}: {url}"
        yield update_log(message), None
        papers = get_papers_since(url, 2023)
        all_papers.extend(papers)
        message = f"Collected {len(papers)} papers from profile {i+1}/{len(profile_urls)}"
        yield update_log(message), None

    message = f"Collected {len(all_papers)} papers in total; fetching details..."
    yield update_log(message), None
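    # Rate-limiting note: the Semantic Scholar API throttles clients (the
    # exact quota depends on whether an API key is supplied), so the loop
    # below sleeps briefly between lookups and pauses longer every 10 papers.
    # These sleep durations are heuristics chosen here, not documented limits.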
    paperid_map = {}
    total_papers = len(all_papers)
    for i, paper in enumerate(all_papers):
        title = paper["title"]
        year = paper["year"]
        if i % 10 == 0 or i == total_papers - 1:
            percent = round((i + 1) / total_papers * 100)
            message = f"Processing paper {i+1}/{total_papers} ({percent}% done)"
            yield update_log(message), None
        if i > 0 and i % 10 == 0:
            time.sleep(1)
        try:
            time.sleep(0.3)
            paper_info = search_paper_by_title(title)
        except Exception:
            continue  # skip papers whose lookup fails
        if not paper_info or not paper_info.get("paperId"):
            continue
        pid = paper_info["paperId"]
        if pid not in paperid_map:
            paperid_map[pid] = {
                "paperId": pid,
                "title": paper_info["title"],
                "abstract": paper_info["abstract"],
                "venue": paper_info["venue"],
                "year": year
            }

    if not paperid_map:
        yield update_log("Error: no matching papers found"), None
        return

    message = f"{len(paperid_map)} papers remain after deduplication; generating CSV..."
    yield update_log(message), None

    df = pd.DataFrame(list(paperid_map.values()))
    # Use the WeChat ID as the CSV filename, keeping only filesystem-safe
    # characters so user input cannot produce an invalid or unsafe path.
    safe_name = "".join(c for c in wechat.strip() if c.isalnum() or c in "-_") or "result"
    temp_csv_path = f"{safe_name}.csv"
    df.to_csv(temp_csv_path, index=False, encoding="utf-8")
    message = f"Done! Generated a CSV with {len(paperid_map)} papers: {temp_csv_path}"
    yield update_log(message), temp_csv_path


def build_app():
    """
    Build a small demo app with Gradio.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## Google Scholar & Semantic Scholar Aggregation Tool")
        gr.Markdown(
            "Enter any number of Google Scholar profile links below, one per "
            "line, then enter your WeChat ID and click **Start scraping**."
        )
        profile_input = gr.Textbox(
            lines=5,
            placeholder=(
                "Paste or type Google Scholar profile links (one per line, in the "
                "format 'https://scholar.google.com/citations?user=NVii64oAAAAJ')"
            )
        )
        # WeChat ID input
        wechat_input = gr.Textbox(
            label="WeChat ID",
            placeholder="Enter your WeChat ID"
        )
        progress_output = gr.Textbox(
            label="Progress",
            value="Waiting to start...",
            lines=10,
            interactive=False
        )
        download_output = gr.File(label="Download result CSV")
        run_button = gr.Button("Start scraping")
        # Wire both inputs into the generator; Gradio streams each yield into
        # the progress box and file component.
        run_button.click(
            fn=process_profiles,
            inputs=[profile_input, wechat_input],
            outputs=[progress_output, download_output]
        )
    return demo


if __name__ == "__main__":
    app = build_app()
    app.launch()
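# A minimal sketch of exercising the scraping helpers directly, without the
# Gradio UI (assumes network access and, optionally, an S2_API_KEY in the
# environment; the example user ID is the placeholder from the input box):
#
#   papers = get_papers_since(
#       "https://scholar.google.com/citations?user=NVii64oAAAAJ", since_year=2023
#   )
#   for p in papers[:3]:
#       info = search_paper_by_title(p["title"])
#       print(p["year"], p["title"], "->", info["venue"] if info else "no match")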