import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import os
import openai
import re
import gradio as gr
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import json

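# Environment variables this script expects (both names taken from the code below):
#   GOOGLE_CREDENTIALS - service-account credentials JSON, stored as a string
#   OPEN_AI_API_KEYS   - OpenAI API key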
					
						
def connect_gspread(spread_sheet_key):
    """Connect to the Google Spreadsheet identified by spread_sheet_key and return its first worksheet."""
    credentials_json = os.getenv('GOOGLE_CREDENTIALS')
    credentials_dict = json.loads(credentials_json)
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']

    credentials = ServiceAccountCredentials.from_json_keyfile_dict(credentials_dict, scope)
    gc = gspread.authorize(credentials)
    worksheet = gc.open_by_key(spread_sheet_key).sheet1
    return worksheet
					
						
spread_sheet_key = "1nSh6D_Gqdbhi1CB3wvD4OJUU6bji8-LE6HET7NTEjrM"
worksheet = connect_gspread(spread_sheet_key)

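# Note: the sheet needs a header row whose columns line up with what
# append_row() writes below (e.g. id | URL | summary | token; the exact
# header names other than 'URL' and 'summary' are not fixed by this script).
# gspread's get_all_records() uses that first row as dict keys, and
# find_paper_in_sheet() reads record['URL'] and record['summary'].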
					
						
def download_paper(paper_url):
    """Download the paper PDF and save it to a temporary file."""
    response = requests.get(paper_url)
    response.raise_for_status()  # fail early if the download did not succeed
    temp_pdf_path = "temp_paper.pdf"
    with open(temp_pdf_path, 'wb') as f:
        f.write(response.content)
    return temp_pdf_path

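# Note: arXiv serves PDFs at https://arxiv.org/pdf/<id>.pdf (sometimes via a
# redirect, which requests follows by default), so the URLs built in
# summarize_paper_and_save_to_sheet() below can be passed here directly.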
					
						
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in the PDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()  # release the file handle so the PDF can be deleted later
    return text

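# Note: page.get_text() returns the page's plain text; for two-column papers
# the reading order is not guaranteed, which is usually acceptable as input
# to a summarization prompt.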
					
						
def summarize_text_with_chat(text, max_length=20000):
    """Summarize the text with the OpenAI Chat API; returns (summary, total tokens used)."""
    openai.api_key = os.getenv('OPEN_AI_API_KEYS')
    trimmed_text = text[:max_length]  # truncate to keep the request within the context window
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "次の文書を要約してください。必ず'## タイトル', '## 要約', '## 専門用語解説'を記載してください。"},
            {"role": "user", "content": trimmed_text}
        ],
        temperature=0.7,
        max_tokens=2000
    )
    summary_text = response.choices[0].message.content
    total_token = response.usage.total_tokens
    return summary_text, total_token

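# A minimal retry sketch (an addition, not part of the original flow): wraps
# summarize_text_with_chat() with linear backoff when the API rate limit is
# hit. openai.RateLimitError is the v1 SDK's rate-limit exception; the
# wrapper name and the backoff numbers are illustrative choices.
import time  # needed only by this sketch

def summarize_with_retry(text, retries=3, wait_seconds=10):
    for attempt in range(retries):
        try:
            return summarize_text_with_chat(text)
        except openai.RateLimitError:
            if attempt == retries - 1:
                raise  # out of attempts; surface the error
            time.sleep(wait_seconds * (attempt + 1))  # linear backoff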
					
						
def fetch_paper_links(url):
    """Collect the paper links from the given URL, dropping duplicates."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = re.compile(r'^/papers/\d+\.\d+$')
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if pattern.match(href) and href not in links:
            links.append(href)
    return links

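# Example: on https://huggingface.co/papers the matching hrefs look like
# '/papers/2406.12345' (the ID shown is illustrative), so
# link.split('/')[-1] in gradio_interface() yields the arXiv paper ID.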
					
						
def summarize_paper_and_save_to_sheet(paper_id):
    """Summarize a paper and append the result to the Google Spreadsheet."""
    paper_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    pdf_path = download_paper(paper_url)
    text = extract_text_from_pdf(pdf_path)
    summary, token = summarize_text_with_chat(text)
    os.remove(pdf_path)  # remove the temporary PDF once its text is extracted
    worksheet.append_row([paper_id, paper_url, summary, token])
    return summary, token

					
						
def find_paper_in_sheet(records, paper_id):
    """Return the cached summary for paper_id from the spreadsheet records, or None if absent."""
    paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"

    for record in records:
        if record['URL'] == paper_id_url:
            return record['summary']

    return None

					
						
def gradio_interface():
    """Fetch the current Daily Papers, reuse cached summaries, and summarize the rest."""
    paper_links = fetch_paper_links("https://huggingface.co/papers")
    paper_ids = set(link.split('/')[-1] for link in paper_links)

    total_tokens_used = 0
    summaries = []
    records = worksheet.get_all_records()

    for paper_id in paper_ids:
        summary = find_paper_in_sheet(records, paper_id)
        if summary is None:
            # Not cached yet: summarize it now and record it in the sheet.
            summary, tokens_used = summarize_paper_and_save_to_sheet(paper_id)
            total_tokens_used += tokens_used

        paper_id_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        summaries.append(f'論文: {paper_id_url}\n{summary}\n')

    summaries_markdown = "\n---\n".join(summaries)
    return summaries_markdown

					
						
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[],
    outputs=gr.Markdown(),
    title="Daily Papers 日本語要約ツール",
    description="[Daily Papers](https://huggingface.co/papers)に掲載された論文を日本語で要約します。",
)

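# Optional: iface.launch(share=True) would also expose a temporary public
# Gradio URL; the launch() call below serves locally by default.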
					
						
if __name__ == "__main__":
    iface.launch()