Spaces:
Running
Running
| import { Paper, Reference } from '../types'; | |
// ==================== Rate Limiter ====================
/**
 * Runs async tasks strictly one at a time, in FIFO order, while keeping at
 * least `minDelayMs` milliseconds between the *start* of consecutive tasks.
 *
 * A task that rejects only rejects its own promise; the queue keeps draining.
 */
class RateLimiter {
  /** `Date.now()` value recorded when the most recent task was started. */
  private lastRequestTime = 0;
  /** Pending tasks together with the settle callbacks of their promises. */
  private readonly queue: Array<{
    fn: () => Promise<unknown>;
    resolve: (value: unknown) => void;
    reject: (reason?: unknown) => void;
  }> = [];
  /** True while a drain loop is active, so only one loop runs at a time. */
  private processing = false;
  private readonly minDelay: number;

  /**
   * @param minDelayMs Minimum spacing between task starts, in milliseconds.
   */
  constructor(minDelayMs: number = 3000) {
    this.minDelay = minDelayMs;
  }

  /**
   * Enqueues `fn` and returns a promise that settles with its outcome.
   *
   * @param fn Task to run once every earlier task has finished and the
   *   minimum delay has elapsed.
   * @returns The value `fn` resolves with; rejects with whatever `fn` throws.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      this.queue.push({
        fn: fn as () => Promise<unknown>,
        resolve: resolve as (value: unknown) => void,
        reject,
      });
      // Fire-and-forget: the drain loop reports results through each task's
      // own resolve/reject callbacks, never through its returned promise.
      void this.processQueue();
    });
  }

  /** Drains the queue sequentially; a no-op if a drain loop is already running. */
  private async processQueue(): Promise<void> {
    if (this.processing) return;
    this.processing = true;
    try {
      let item = this.queue.shift();
      while (item !== undefined) {
        const elapsed = Date.now() - this.lastRequestTime;
        if (elapsed < this.minDelay) {
          const waitMs = this.minDelay - elapsed;
          await new Promise<void>((wake) => setTimeout(wake, waitMs));
        }
        try {
          this.lastRequestTime = Date.now();
          item.resolve(await item.fn());
        } catch (error) {
          item.reject(error);
        }
        item = this.queue.shift();
      }
    } finally {
      // Always release the flag so a later execute() can start a new drain.
      this.processing = false;
    }
  }
}
// Shared limiter for arXiv export API calls: task starts are spaced at least 3500 ms apart.
const arxivLimiter = new RateLimiter(3500);
// Shared limiter for ar5iv HTML page fetches: task starts are spaced at least 2500 ms apart.
const ar5ivLimiter = new RateLimiter(2500);
// ==================== CORS Proxy ====================
/**
 * Performs one GET attempt that is aborted if no response arrives within
 * `timeoutMs`.
 *
 * @returns The response body for a 2xx response, or `null` for any other status.
 * @throws Whatever `fetch` / `Response.text()` rejects with (network error, abort).
 */
async function fetchTextWithTimeout(url: string, timeoutMs: number): Promise<string | null> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, { signal: controller.signal });
    // The timeout only guards the wait for the response; stop it before the
    // body is read.
    clearTimeout(timer);
    return response.ok ? await response.text() : null;
  } finally {
    // Also reached when fetch() rejects, so a failed attempt never leaves a
    // live timer behind. Clearing an already-cleared timer is a no-op.
    clearTimeout(timer);
  }
}

/**
 * Fetches `url` as text, first directly (8 s timeout) and then through two
 * public CORS proxies in order (15 s timeout each).
 *
 * @param url Absolute URL to fetch.
 * @returns The body of the first attempt that answers with a 2xx status.
 * @throws Error `Failed to fetch: <url>` when every attempt fails or is non-2xx.
 */
async function fetchWithProxy(url: string): Promise<string> {
  const encodedUrl = encodeURIComponent(url);
  const attempts: Array<{ target: string; timeoutMs: number }> = [
    { target: url, timeoutMs: 8000 },
    { target: `https://api.allorigins.win/raw?url=${encodedUrl}`, timeoutMs: 15000 },
    { target: `https://corsproxy.io/?${encodedUrl}`, timeoutMs: 15000 },
  ];

  for (const { target, timeoutMs } of attempts) {
    try {
      const body = await fetchTextWithTimeout(target, timeoutMs);
      if (body !== null) return body;
    } catch {
      // Network error, CORS rejection or timeout: fall through to the next attempt.
    }
  }
  throw new Error(`Failed to fetch: ${url}`);
}
// ==================== ArXiv Search API ====================
/**
 * Searches arXiv for papers matching every whitespace-separated term of
 * `query` (each term is sent as `all:<term>`, joined with `AND`).
 *
 * @param query Free-text query; surrounding and repeated whitespace is ignored.
 * @param start Zero-based offset of the first result to return.
 * @param maxResults Maximum number of results to return.
 * @returns The parsed papers and the total hit count reported by the feed.
 *   A query with no terms resolves to an empty result without any request.
 */
export async function searchArxiv(
  query: string,
  start: number = 0,
  maxResults: number = 10
): Promise<{ papers: Paper[]; total: number }> {
  const terms = query.split(/\s+/).filter((term) => term.length > 0);
  if (terms.length === 0) {
    // Nothing to search for; do not spend a rate-limited request on `all:`.
    return { papers: [], total: 0 };
  }

  // Encode each term so characters such as `&`, `#` or `+` stay part of the
  // term instead of being interpreted as URL syntax.
  const searchQuery = terms.map((term) => `all:${encodeURIComponent(term)}`).join('+AND+');
  const url = `https://export.arxiv.org/api/query?search_query=${searchQuery}&start=${start}&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;

  return arxivLimiter.execute(async () => {
    const xml = await fetchWithProxy(url);
    return parseArxivAtom(xml);
  });
}
/**
 * Parses an arXiv API Atom feed into `Paper` records.
 *
 * - `total` comes from the feed's `totalResults` element; a missing or
 *   non-numeric value yields 0.
 * - Entries without a title are skipped.
 * - Entries whose id points at `arxiv.org/api/errors` are skipped and not
 *   counted; the arXiv API manual describes these as error reports
 *   delivered in feed form.
 *
 * @param xml Raw Atom XML text.
 * @returns The parsed papers and the total result count.
 */
function parseArxivAtom(xml: string): { papers: Paper[]; total: number } {
  const doc = new DOMParser().parseFromString(xml, 'application/xml');

  const totalEl =
    doc.querySelector('totalResults') ||
    doc.getElementsByTagNameNS('http://a9.com/-/spec/opensearch/1.1/', 'totalResults')[0];
  const parsedTotal = Number.parseInt(totalEl?.textContent ?? '', 10);
  let total = Number.isNaN(parsedTotal) ? 0 : parsedTotal;

  const papers: Paper[] = [];
  for (const entry of Array.from(doc.getElementsByTagName('entry'))) {
    // Trimmed text of the first descendant with the given tag, or ''.
    const getTag = (tag: string): string =>
      entry.getElementsByTagName(tag)[0]?.textContent?.trim() || '';

    const idUrl = getTag('id');
    if (/arxiv\.org\/api\/errors/.test(idUrl)) {
      // Error report, not a paper: drop it and keep the count consistent.
      total = Math.max(0, total - 1);
      continue;
    }

    const title = getTag('title').replace(/\s+/g, ' ');
    if (!title) continue;

    // Strip the abs-URL prefix and the version suffix, e.g. ".../abs/2101.00001v2" -> "2101.00001".
    const arxivId = idUrl.replace(/^https?:\/\/arxiv\.org\/abs\//, '').replace(/v\d+$/, '');

    const authors: string[] = [];
    for (const authorEl of Array.from(entry.getElementsByTagName('author'))) {
      const name = authorEl.getElementsByTagName('name')[0]?.textContent;
      if (name) authors.push(name);
    }

    const categories: string[] = [];
    for (const catEl of Array.from(entry.getElementsByTagName('category'))) {
      const term = catEl.getAttribute('term');
      if (term) categories.push(term);
    }

    // If several links are titled "pdf", the last one wins.
    let pdfLink = '';
    for (const linkEl of Array.from(entry.getElementsByTagName('link'))) {
      if (linkEl.getAttribute('title') === 'pdf') {
        pdfLink = linkEl.getAttribute('href') || '';
      }
    }

    papers.push({
      id: arxivId,
      title,
      authors,
      abstract: getTag('summary').replace(/\s+/g, ' '),
      published: getTag('published'),
      updated: getTag('updated'),
      categories,
      pdfLink,
      htmlLink: `https://ar5iv.labs.arxiv.org/html/${arxivId}`,
      sectionsLoaded: false,
      sectionsLoading: false,
    });
  }
  return { papers, total };
}
// ==================== ar5iv Section Parser ====================
/**
 * Downloads the ar5iv HTML rendering of a paper and extracts its main
 * sections and reference list. The request goes through the ar5iv rate limiter.
 *
 * @param arxivId arXiv identifier, inserted verbatim into the ar5iv URL path.
 * @returns Whatever `parseAr5ivHtml` extracts from the page; sections that
 *   were not found are simply absent.
 * @throws If the page cannot be fetched directly or through any proxy.
 */
export async function fetchPaperSections(
  arxivId: string
): Promise<{
  introduction?: string;
  relatedWork?: string;
  methods?: string;
  references?: Reference[];
}> {
  const pageUrl = `https://ar5iv.labs.arxiv.org/html/${arxivId}`;
  return ar5ivLimiter.execute(async () => {
    const html = await fetchWithProxy(pageUrl);
    return parseAr5ivHtml(html);
  });
}
/** Result slots that a section heading can be mapped to. */
type Ar5ivSectionKey = 'introduction' | 'relatedWork' | 'methods';

/** Maps the lower-cased heading of a structured section to a result slot. */
function classifyAr5ivSectionHeading(headingText: string): Ar5ivSectionKey | undefined {
  if (headingText.includes('introduction') && !headingText.includes('related')) {
    return 'introduction';
  }
  if (
    headingText.includes('related work') ||
    headingText.includes('literature review') ||
    headingText.includes('background and related') ||
    (headingText.includes('background') && headingText.includes('work'))
  ) {
    return 'relatedWork';
  }
  const methodKeywords = [
    'method',
    'approach',
    'proposed',
    'architecture',
    'framework',
    'model description',
  ];
  return methodKeywords.some((keyword) => headingText.includes(keyword)) ? 'methods' : undefined;
}

/** Looser mapping used when the page has no recognisable section containers. */
function classifyAr5ivLooseHeading(headingText: string): Ar5ivSectionKey | undefined {
  if (headingText.includes('introduction')) return 'introduction';
  if (headingText.includes('related work') || headingText.includes('background')) {
    return 'relatedWork';
  }
  if (headingText.includes('method') || headingText.includes('approach')) return 'methods';
  return undefined;
}

/** Extracts bibliography entries, including an arXiv id when one is linked or cited. */
function parseAr5ivReferences(doc: Document): Reference[] {
  const references: Reference[] = [];
  const bibItems = doc.querySelectorAll('.ltx_bibitem, li[id*="bib"], .ltx_biblist > li');

  for (const item of Array.from(bibItems)) {
    const tagEl = item.querySelector('.ltx_tag, .ltx_tag_bibitem');
    // Label such as "[12]" with the brackets removed.
    const number = tagEl?.textContent?.replace(/[\[\]]/g, '').trim() || '';
    const key = item.id || `ref-${number}`;

    const blocks = Array.from(item.querySelectorAll('.ltx_bibblock'));
    const rawText =
      blocks.length > 0
        ? blocks.map((block) => block.textContent ?? '').join(' ')
        : (item.textContent ?? '').replace(tagEl?.textContent || '', '');
    const text = rawText.trim();

    // Prefer an id from a hyperlink (abs or pdf URL; the last match wins),
    // then fall back to an "arXiv:NNNN.NNNNN" mention in the text.
    let arxivId: string | undefined;
    for (const link of Array.from(item.querySelectorAll('a[href]'))) {
      const href = link.getAttribute('href') || '';
      const match = href.match(/arxiv\.org\/(?:abs|pdf)\/(\d{4}\.\d{4,5})/);
      if (match) arxivId = match[1];
    }
    if (!arxivId) {
      const textMatch = text.match(/arXiv[:\s]*(\d{4}\.\d{4,5})/i);
      if (textMatch) arxivId = textMatch[1];
    }

    if (number || text) {
      references.push({ key, number, text, arxivId });
    }
  }
  return references;
}

/**
 * Extracts the introduction, related-work and methods sections (as inner
 * HTML) plus the reference list from an ar5iv HTML page.
 *
 * Sections are located through the first selector in a fixed list that
 * matches anything; if none matches, `h2`/`h3` headings are used and the
 * heading's parent element supplies the HTML.
 *
 * @param html Full HTML text of the ar5iv page.
 * @returns Found sections; `references` is always set (possibly empty).
 */
function parseAr5ivHtml(html: string): {
  introduction?: string;
  relatedWork?: string;
  methods?: string;
  references?: Reference[];
} {
  const doc = new DOMParser().parseFromString(html, 'text/html');
  // Drop page chrome and non-content elements before reading any HTML/text.
  doc.querySelectorAll('script, style, nav, header, footer').forEach((el) => el.remove());

  const result: {
    introduction?: string;
    relatedWork?: string;
    methods?: string;
    references?: Reference[];
  } = {};

  // Try multiple selectors for sections; the first that matches wins.
  const sectionSelectors = [
    'section.ltx_section',
    'section.ltx_chapter',
    'div.ltx_section',
    'section[id]',
  ];
  let sections: Element[] = [];
  for (const selector of sectionSelectors) {
    const found = doc.querySelectorAll(selector);
    if (found.length > 0) {
      sections = Array.from(found);
      break;
    }
  }

  if (sections.length === 0) {
    // No structured sections: classify by heading text; later headings overwrite earlier ones.
    doc.querySelectorAll('h2, h3').forEach((heading) => {
      const parent = heading.parentElement;
      if (!parent) return;
      const key = classifyAr5ivLooseHeading(heading.textContent?.toLowerCase() || '');
      if (key !== undefined) result[key] = parent.innerHTML;
    });
  } else {
    for (const section of sections) {
      const heading = section.querySelector('h1, h2, h3, h4, .ltx_title');
      if (!heading) continue;
      const key = classifyAr5ivSectionHeading(heading.textContent?.toLowerCase() || '');
      if (key === undefined) continue;
      // Only the first methods-like section is kept; introduction and
      // related work take the last matching section.
      if (key === 'methods' && result.methods) continue;
      result[key] = section.innerHTML;
    }
  }

  result.references = parseAr5ivReferences(doc);
  return result;
}
// ==================== Translation ====================
/**
 * Translates English text (HTML tags are stripped first) into `targetLang`.
 *
 * The text is split into chunks of roughly 4500 characters that are
 * translated one after another, with a 300 ms pause between consecutive
 * requests, and the translated chunks are concatenated.
 *
 * @param text Plain text or HTML to translate.
 * @param targetLang Target language code passed to the translation services.
 * @returns The translated text, or '' when there is nothing to translate.
 * @throws If a chunk cannot be translated by any provider.
 */
export async function translateText(
  text: string,
  targetLang: string = 'zh-CN'
): Promise<string> {
  if (!text || text.trim().length === 0) return '';

  // Strip HTML tags for translation and collapse the whitespace left behind.
  const plainText = text.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
  if (plainText.length === 0) return '';

  const chunks = splitIntoChunks(plainText, 4500);
  const results: string[] = [];
  for (const [index, chunk] of chunks.entries()) {
    // Pause only *between* requests; there is nothing to wait for after the last one.
    if (index > 0) {
      await new Promise<void>((resume) => setTimeout(resume, 300));
    }
    results.push(await translateChunk(chunk, targetLang));
  }
  return results.join('');
}
/**
 * Splits `text` into consecutive chunks of at most `maxLen` UTF-16 code
 * units whose concatenation equals `text`.
 *
 * A chunk ends right after the last ". " that fits inside the limit, as
 * long as that sentence end lies in the second half of the window;
 * otherwise the text is cut at `maxLen`. A hard cut is moved back by one
 * unit if it would separate a surrogate pair, because a lone surrogate
 * makes `encodeURIComponent` throw.
 *
 * @param text Text to split; '' yields [].
 * @param maxLen Maximum chunk length; must be at least 1.
 * @throws RangeError if `maxLen` is smaller than 1 or NaN.
 */
function splitIntoChunks(text: string, maxLen: number): string[] {
  if (!(maxLen >= 1)) {
    // A non-positive limit could never make progress through the text.
    throw new RangeError(`maxLen must be at least 1, got ${maxLen}`);
  }

  const chunks: string[] = [];
  let remaining = text;
  while (remaining.length > maxLen) {
    let breakAt = maxLen;
    // Search from maxLen - 2 so that ". " (two units) still fits in the chunk.
    const sentenceEnd = remaining.lastIndexOf('. ', maxLen - 2);
    if (sentenceEnd > maxLen * 0.5) {
      breakAt = sentenceEnd + 2;
    } else if (breakAt > 1) {
      const lastUnit = remaining.charCodeAt(breakAt - 1);
      const isHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
      if (isHighSurrogate) breakAt -= 1;
    }
    chunks.push(remaining.substring(0, breakAt));
    remaining = remaining.substring(breakAt);
  }
  if (remaining.length > 0) chunks.push(remaining);
  return chunks;
}
/**
 * Translates via the unofficial Google Translate endpoint.
 *
 * @param encodedLang Target language code, already URL-encoded.
 * @returns The translation, or `null` if the response is not usable.
 */
async function translateViaGoogle(text: string, encodedLang: string): Promise<string | null> {
  const url = `https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=${encodedLang}&dt=t&q=${encodeURIComponent(text)}`;
  const response = await fetch(url);
  if (!response.ok) return null;

  const data: unknown = await response.json();
  if (!Array.isArray(data) || !Array.isArray(data[0])) return null;
  // data[0] is a list of segments; the first element of each is the translated piece.
  return data[0]
    .filter((segment: unknown): segment is unknown[] => Array.isArray(segment) && Boolean(segment[0]))
    .map((segment) => segment[0])
    .join('');
}

/**
 * Translates via MyMemory. Text longer than 500 characters is sent in
 * pieces of at most 500 characters (the service documents a 500-byte limit
 * for `q`) and the results are concatenated.
 *
 * @param encodedLang Target language code, already URL-encoded.
 * @returns The translation, or `null` if any piece could not be translated.
 */
async function translateViaMyMemory(text: string, encodedLang: string): Promise<string | null> {
  const pieces = text.length <= 500 ? [text] : splitIntoChunks(text, 500);
  const translated: string[] = [];
  for (const piece of pieces) {
    const url = `https://api.mymemory.translated.net/get?q=${encodeURIComponent(piece)}&langpair=en|${encodedLang}`;
    const response = await fetch(url);
    if (!response.ok) return null;

    const data = (await response.json()) as {
      responseStatus?: unknown;
      responseData?: { translatedText?: unknown } | null;
    } | null;
    const pieceText = data?.responseData?.translatedText;
    if (data?.responseStatus !== 200 || typeof pieceText !== 'string') return null;
    translated.push(pieceText);
  }
  return translated.join('');
}

/**
 * Translates one chunk of English text into `targetLang`, trying Google
 * Translate first and MyMemory as a fallback.
 *
 * @param text Plain text to translate.
 * @param targetLang Target language code, e.g. 'zh-CN'.
 * @returns The translated text from the first provider that succeeds.
 * @throws Error when both providers fail or return unusable data.
 */
async function translateChunk(text: string, targetLang: string): Promise<string> {
  const encodedLang = encodeURIComponent(targetLang);
  const providers = [translateViaGoogle, translateViaMyMemory];

  for (const provider of providers) {
    try {
      const result = await provider(text, encodedLang);
      if (result !== null) return result;
    } catch {
      // Network or parse failure: fall through to the next provider.
    }
  }
  throw new Error('翻译失败,请稍后重试 / Translation failed');
}
// ==================== Fetch Paper By ID ====================
/**
 * Looks up a single paper through the arXiv API `id_list` parameter.
 *
 * The identifier is normalised first: surrounding whitespace, an
 * `arxiv.org/abs/` or `arxiv.org/pdf/` URL prefix, an `arXiv:` prefix, a
 * trailing `.pdf` and a trailing version suffix (`v2`) are removed.
 *
 * @param arxivId arXiv identifier or arXiv abs/pdf URL.
 * @returns The first paper in the response, or `null` when the identifier
 *   is empty or the feed contains no paper.
 * @throws If the feed cannot be fetched directly or through any proxy.
 */
export async function fetchPaperById(arxivId: string): Promise<Paper | null> {
  const cleanId = arxivId
    .trim()
    .replace(/^https?:\/\/(?:www\.)?arxiv\.org\/(?:abs|pdf)\//i, '')
    .replace(/^arxiv:/i, '')
    .replace(/\.pdf$/i, '')
    .replace(/v\d+$/, '');
  if (cleanId === '') {
    // Nothing to look up; do not spend a rate-limited request on an empty id_list.
    return null;
  }

  const url = `https://export.arxiv.org/api/query?id_list=${encodeURIComponent(cleanId)}`;
  return arxivLimiter.execute(async () => {
    const xml = await fetchWithProxy(url);
    const { papers } = parseArxivAtom(xml);
    return papers.length > 0 ? papers[0] : null;
  });
}
// ==================== Helpers ====================
/**
 * Returns the text content of an HTML fragment with all markup removed.
 *
 * The fragment is parsed inside a `<template>` element. Per the HTML
 * template documentation its content is inert, so markup arriving from
 * remote pages or proxies is not rendered, does not load resources and does
 * not run event handlers while it is being converted to text.
 *
 * @param html HTML fragment; may be empty.
 * @returns The concatenated text of the fragment, or '' if there is none.
 */
export function extractPlainText(html: string): string {
  const template = document.createElement('template');
  template.innerHTML = html;
  return template.content.textContent || '';
}