Spaces:

RBJin
/

ArXivResearchExplorer

Running

App Files Files Community

ArXivResearchExplorer / src /utils /api.ts

RBJin

Upload 20 files

81cb6e0 verified about 1 month ago

raw

history blame contribute delete

12.5 kB

	import { Paper, Reference } from '../types';

	// ==================== Rate Limiter ====================
	/**
	 * Runs asynchronous tasks one at a time, in submission order, and keeps at
	 * least `minDelay` milliseconds between the moments two consecutive tasks
	 * are started.
	 */
	class RateLimiter {
		/** `Date.now()` timestamp taken right before the most recent task was started. */
		private lastRequestTime = 0;
		/** Pending tasks together with the settle callbacks of the promise handed to the caller. */
		private readonly queue: Array<{
			fn: () => Promise<unknown>;
			resolve: (value: unknown) => void;
			reject: (reason?: unknown) => void;
		}> = [];
		/** True while a drain loop is running, so only one loop consumes the queue. */
		private processing = false;
		private readonly minDelay: number;

		/**
		 * @param minDelayMs Minimum gap, in milliseconds, between two task starts.
		 */
		constructor(minDelayMs: number = 3000) {
			this.minDelay = minDelayMs;
		}

		/**
		 * Enqueues `fn` and returns a promise that settles with its outcome.
		 *
		 * @param fn Task to run once every earlier task has finished and the delay has elapsed.
		 * @returns The value `fn` resolves with; rejects with whatever `fn` throws or rejects with.
		 */
		execute<T>(fn: () => Promise<T>): Promise<T> {
			return new Promise<T>((resolve, reject) => {
				this.queue.push({
					fn,
					resolve: resolve as (value: unknown) => void,
					reject,
				});
				// Fire-and-forget: the drain loop reports results through resolve/reject.
				void this.processQueue();
			});
		}

		/** Drains the queue sequentially; a no-op when a drain is already in progress. */
		private async processQueue(): Promise<void> {
			if (this.processing || this.queue.length === 0) return;
			this.processing = true;
			try {
				for (let item = this.queue.shift(); item !== undefined; item = this.queue.shift()) {
					const wait = this.minDelay - (Date.now() - this.lastRequestTime);
					if (wait > 0) {
						await new Promise<void>((r) => setTimeout(r, wait));
					}
					try {
						// Stamp the start time before awaiting so the gap is start-to-start.
						this.lastRequestTime = Date.now();
						item.resolve(await item.fn());
					} catch (error) {
						item.reject(error);
					}
				}
			} finally {
				// Always release the flag so later execute() calls can start a new drain.
				this.processing = false;
			}
		}
	}

	// Minimum gap, in milliseconds, between two request starts for each upstream.
	const ARXIV_MIN_DELAY_MS = 3500;
	const AR5IV_MIN_DELAY_MS = 2500;

	// Throttles calls to the export.arxiv.org query API (search and lookup by id).
	const arxivLimiter = new RateLimiter(ARXIV_MIN_DELAY_MS);
	// Throttles calls to the ar5iv.labs.arxiv.org HTML renderings.
	const ar5ivLimiter = new RateLimiter(AR5IV_MIN_DELAY_MS);

	// ==================== CORS Proxy ====================
	/**
	 * Downloads `url` as text, trying a direct request first and then each
	 * public CORS proxy in turn.
	 *
	 * A candidate counts as failed when the request rejects, is aborted by its
	 * timeout, answers with a non-2xx status, or its body cannot be read. The
	 * timeout bounds the wait for the response only; reading the body is not
	 * time-limited.
	 *
	 * @param url Absolute URL of the resource to download.
	 * @returns The response body of the first candidate that succeeds.
	 * @throws Error with message `Failed to fetch: <url>` when every candidate fails.
	 */
	async function fetchWithProxy(url: string): Promise<string> {
		// Tries one candidate URL; resolves to null instead of throwing on any failure.
		const attempt = async (target: string, timeoutMs: number): Promise<string | null> => {
			const controller = new AbortController();
			const timer = setTimeout(() => controller.abort(), timeoutMs);
			let response: Response;
			try {
				response = await fetch(target, { signal: controller.signal });
			} catch {
				return null;
			} finally {
				// Clear on failure too, otherwise the pending timer outlives the request.
				clearTimeout(timer);
			}
			if (!response.ok) return null;
			try {
				return await response.text();
			} catch {
				return null;
			}
		};

		// Direct request first: works whenever the upstream sends CORS headers.
		const direct = await attempt(url, 8000);
		if (direct !== null) return direct;

		const encodedUrl = encodeURIComponent(url);
		const proxies = [
			`https://api.allorigins.win/raw?url=${encodedUrl}`,
			`https://corsproxy.io/?${encodedUrl}`,
		];

		for (const proxyUrl of proxies) {
			const body = await attempt(proxyUrl, 15000);
			if (body !== null) return body;
		}

		throw new Error(`Failed to fetch: ${url}`);
	}

	// ==================== ArXiv Search API ====================
	/**
	 * Searches arXiv for papers matching every whitespace-separated term of `query`.
	 *
	 * Each term is percent-encoded and sent as an `all:` clause; the clauses are
	 * combined with `AND`. The request goes through the shared arXiv rate limiter.
	 *
	 * @param query Free-text query; surrounding and repeated whitespace is ignored.
	 * @param start Zero-based offset of the first result to return.
	 * @param maxResults Maximum number of results to request.
	 * @returns The parsed papers and the total hit count reported by the feed.
	 *   A query with no terms resolves to an empty result without any request.
	 * @throws Error when the feed cannot be downloaded.
	 */
	export async function searchArxiv(
		query: string,
		start: number = 0,
		maxResults: number = 10
	): Promise<{ papers: Paper[]; total: number }> {
		const terms = query.split(/\s+/).filter((term) => term.length > 0);
		// Nothing to search for: do not spend a rate-limited request on an empty `all:` clause.
		if (terms.length === 0) return { papers: [], total: 0 };

		// Encode each term so characters such as `&`, `#` or `+` cannot break the query string.
		const searchQuery = terms
			.map((term) => `all:${encodeURIComponent(term)}`)
			.join('+AND+');

		return arxivLimiter.execute(async () => {
			const url = `https://export.arxiv.org/api/query?search_query=${searchQuery}&start=${start}&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
			const xml = await fetchWithProxy(url);
			return parseArxivAtom(xml);
		});
	}

	/**
	 * Parses an arXiv Atom feed into paper records.
	 *
	 * @param xml Feed document as text.
	 * @returns `papers`: one record per `<entry>` that has a non-empty title.
	 *   `total`: the integer in `<totalResults>`, or 0 when that element is
	 *   missing or does not hold a number.
	 */
	function parseArxivAtom(xml: string): { papers: Paper[]; total: number } {
		const doc = new DOMParser().parseFromString(xml, 'application/xml');

		// Look the element up by plain name first, then by its OpenSearch namespace.
		const totalEl =
			doc.querySelector('totalResults') ||
			doc.getElementsByTagNameNS('http://a9.com/-/spec/opensearch/1.1/', 'totalResults')[0];
		const parsedTotal = Number.parseInt(totalEl?.textContent ?? '', 10);
		const total = Number.isNaN(parsedTotal) ? 0 : parsedTotal;

		const papers: Paper[] = [];

		for (const entry of Array.from(doc.getElementsByTagName('entry'))) {
			// Trimmed text of the first descendant with the given tag name, or ''.
			const getTag = (tag: string): string =>
				entry.getElementsByTagName(tag)[0]?.textContent?.trim() || '';

			const title = getTag('title').replace(/\s+/g, ' ');
			if (!title) continue;

			// `<id>` holds the abs URL; drop the URL prefix and the trailing version suffix.
			const arxivId = getTag('id')
				.replace(/^https?:\/\/arxiv\.org\/abs\//, '')
				.replace(/v\d+$/, '');

			const authors: string[] = [];
			for (const authorEl of Array.from(entry.getElementsByTagName('author'))) {
				const name = authorEl.getElementsByTagName('name')[0]?.textContent;
				if (name) authors.push(name);
			}

			const categories: string[] = [];
			for (const catEl of Array.from(entry.getElementsByTagName('category'))) {
				const term = catEl.getAttribute('term');
				if (term) categories.push(term);
			}

			// When several links are titled "pdf", the last one wins.
			let pdfLink = '';
			for (const linkEl of Array.from(entry.getElementsByTagName('link'))) {
				if (linkEl.getAttribute('title') === 'pdf') {
					pdfLink = linkEl.getAttribute('href') || '';
				}
			}

			papers.push({
				id: arxivId,
				title,
				authors,
				abstract: getTag('summary').replace(/\s+/g, ' '),
				published: getTag('published'),
				updated: getTag('updated'),
				categories,
				pdfLink,
				htmlLink: `https://ar5iv.labs.arxiv.org/html/${arxivId}`,
				sectionsLoaded: false,
				sectionsLoading: false,
			});
		}

		return { papers, total };
	}

	// ==================== ar5iv Section Parser ====================
	/**
	 * Downloads the ar5iv HTML rendering of a paper and extracts its key sections.
	 *
	 * The download is queued on the ar5iv rate limiter.
	 *
	 * @param arxivId Identifier appended verbatim to the ar5iv HTML base URL.
	 * @returns Whatever sections and references the HTML parser could find.
	 */
	export async function fetchPaperSections(
		arxivId: string
	): Promise<{
		introduction?: string;
		relatedWork?: string;
		methods?: string;
		references?: Reference[];
	}> {
		const loadAndParse = async () => {
			const pageUrl = `https://ar5iv.labs.arxiv.org/html/${arxivId}`;
			return parseAr5ivHtml(await fetchWithProxy(pageUrl));
		};
		return ar5ivLimiter.execute(loadAndParse);
	}

	/**
	 * Extracts the introduction, related-work and methods sections plus the
	 * bibliography from an ar5iv HTML page.
	 *
	 * Section values are the raw `innerHTML` of the matched element. Only
	 * `script`, `style`, `nav`, `header` and `footer` elements are removed first.
	 * NOTE(review): other active content (e.g. inline event-handler attributes)
	 * is left in place, so callers that render this HTML should sanitize it.
	 *
	 * @param html Full page markup.
	 * @returns The sections that were found; `references` is always set (possibly empty).
	 */
	function parseAr5ivHtml(html: string): {
		introduction?: string;
		relatedWork?: string;
		methods?: string;
		references?: Reference[];
	} {
		const doc = new DOMParser().parseFromString(html, 'text/html');

		doc.querySelectorAll('script, style, nav, header, footer').forEach((el) => el.remove());

		const result: {
			introduction?: string;
			relatedWork?: string;
			methods?: string;
			references?: Reference[];
		} = {};

		// Selectors are tried in order; the first one that matches anything is used.
		const sectionSelectors = [
			'section.ltx_section',
			'section.ltx_chapter',
			'div.ltx_section',
			'section[id]',
		];

		let sections: Element[] = [];
		for (const selector of sectionSelectors) {
			const found = doc.querySelectorAll(selector);
			if (found.length > 0) {
				sections = Array.from(found);
				break;
			}
		}

		if (sections.length === 0) {
			// No structured sections: fall back to headings and take each heading's parent.
			// A later matching heading overwrites an earlier one.
			doc.querySelectorAll('h2, h3').forEach((heading) => {
				const parent = heading.parentElement;
				if (!parent) return;
				const text = heading.textContent?.toLowerCase() || '';
				if (text.includes('introduction')) {
					result.introduction = parent.innerHTML;
				} else if (text.includes('related work') || text.includes('background')) {
					result.relatedWork = parent.innerHTML;
				} else if (text.includes('method') || text.includes('approach')) {
					result.methods = parent.innerHTML;
				}
			});
		} else {
			for (const section of sections) {
				const heading = section.querySelector('h1, h2, h3, h4, .ltx_title');
				if (!heading) continue;

				const kind = classifySectionHeading(heading.textContent?.toLowerCase() || '');
				if (kind === 'introduction') {
					result.introduction = section.innerHTML;
				} else if (kind === 'relatedWork') {
					result.relatedWork = section.innerHTML;
				} else if (kind === 'methods' && !result.methods) {
					// Only the first methods-like section is kept.
					result.methods = section.innerHTML;
				}
			}
		}

		const references: Reference[] = [];
		doc
			.querySelectorAll('.ltx_bibitem, li[id*="bib"], .ltx_biblist > li')
			.forEach((item) => {
				const reference = parseBibItem(item);
				// Entries with neither a label nor any text carry no information.
				if (reference.number || reference.text) references.push(reference);
			});

		result.references = references;
		return result;
	}

	/** Maps a lower-cased section heading to the result slot it belongs to, or null. */
	function classifySectionHeading(
		headingText: string
	): 'introduction' | 'relatedWork' | 'methods' | null {
		if (headingText.includes('introduction') && !headingText.includes('related')) {
			return 'introduction';
		}
		if (
			headingText.includes('related work') ||
			headingText.includes('literature review') ||
			headingText.includes('background and related') ||
			(headingText.includes('background') && headingText.includes('work'))
		) {
			return 'relatedWork';
		}
		const methodKeywords = [
			'method',
			'approach',
			'proposed',
			'architecture',
			'framework',
			'model description',
		];
		return methodKeywords.some((keyword) => headingText.includes(keyword)) ? 'methods' : null;
	}

	/** Builds a reference record (label, key, text, optional arXiv id) from one bibliography item. */
	function parseBibItem(item: Element): Reference {
		const tagEl = item.querySelector('.ltx_tag, .ltx_tag_bibitem');
		// Label without its surrounding square brackets, e.g. "[12]" -> "12".
		const number = tagEl?.textContent?.replace(/[\[\]]/g, '').trim() || '';
		const key = item.id || `ref-${number}`;

		const blocks = Array.from(item.querySelectorAll('.ltx_bibblock'));
		const rawText =
			blocks.length > 0
				? blocks.map((block) => `${block.textContent ?? ''} `).join('')
				: item.textContent?.replace(tagEl?.textContent || '', '').trim() || '';
		const text = rawText.trim();

		// Prefer an arXiv id taken from a link; when several links match, the last one wins.
		let arxivId: string | undefined;
		item.querySelectorAll('a[href]').forEach((link) => {
			const href = link.getAttribute('href') || '';
			const match = href.match(/arxiv\.org\/abs\/(\d{4}\.\d{4,5})/);
			if (match) arxivId = match[1];
		});

		// Otherwise look for an "arXiv:NNNN.NNNNN" mention in the citation text.
		if (!arxivId) {
			const textMatch = text.match(/arXiv[:\s]*(\d{4}\.\d{4,5})/i);
			if (textMatch) arxivId = textMatch[1];
		}

		return { key, number, text, arxivId };
	}

	// ==================== Translation ====================
	/**
	 * Translates text, which may contain HTML markup, into `targetLang`.
	 *
	 * Tags are replaced by spaces and whitespace is collapsed before the text is
	 * split into chunks of about 4500 characters. Chunks are translated one after
	 * another, with a 300 ms pause between consecutive requests.
	 *
	 * @param text Source text or HTML.
	 * @param targetLang Target language code passed to the translation services.
	 * @returns The translated chunks concatenated; '' when `text` is empty or blank.
	 * @throws Error when a chunk cannot be translated.
	 */
	export async function translateText(
		text: string,
		targetLang: string = 'zh-CN'
	): Promise<string> {
		if (!text || text.trim().length === 0) return '';

		// Strip HTML tags for translation
		const plainText = text.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();

		const chunks = splitIntoChunks(plainText, 4500);
		const results: string[] = [];

		for (const [index, chunk] of chunks.entries()) {
			// Pause between requests only; there is nothing to wait for after the last chunk.
			if (index > 0) {
				await new Promise((r) => setTimeout(r, 300));
			}
			results.push(await translateChunk(chunk, targetLang));
		}

		return results.join('');
	}

	/**
	 * Splits `text` into consecutive pieces of at most `maxLen` UTF-16 code units.
	 * Concatenating the pieces yields `text` again.
	 *
	 * A piece ends right after the last ". " that fits, provided that sentence
	 * end lies in the second half of the window; otherwise the text is cut at
	 * `maxLen`. A hard cut never separates the two halves of a surrogate pair,
	 * because a lone surrogate makes `encodeURIComponent` throw. The only piece
	 * that may exceed `maxLen` is a single surrogate pair when `maxLen` is 1.
	 *
	 * @param text Text to split.
	 * @param maxLen Maximum piece length; fractional values are rounded down.
	 * @returns The pieces in order; an empty array for empty text.
	 * @throws RangeError when `maxLen` is NaN or smaller than 1.
	 */
	function splitIntoChunks(text: string, maxLen: number): string[] {
		const limit = Math.floor(maxLen);
		// Written as a negated comparison so that NaN is rejected as well.
		if (!(limit >= 1)) {
			throw new RangeError(`maxLen must be at least 1, got ${maxLen}`);
		}

		const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff;
		const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff;

		const chunks: string[] = [];
		let remaining = text;
		while (remaining.length > limit) {
			let bp = limit;
			// Search up to limit - 2 so the two characters of ". " still fit in the piece.
			const sentEnd = remaining.lastIndexOf('. ', limit - 2);
			if (sentEnd > limit * 0.5) {
				bp = sentEnd + 2;
			} else if (
				isHighSurrogate(remaining.charCodeAt(bp - 1)) &&
				isLowSurrogate(remaining.charCodeAt(bp))
			) {
				// Move the cut off the middle of the pair; go forward if going back would make no progress.
				bp = bp > 1 ? bp - 1 : bp + 1;
			}
			chunks.push(remaining.substring(0, bp));
			remaining = remaining.substring(bp);
		}
		if (remaining.length > 0) chunks.push(remaining);
		return chunks;
	}

	/**
	 * Translates one chunk of English text into `targetLang`.
	 *
	 * The unofficial Google Translate endpoint is tried first. If the request
	 * fails or the reply has an unexpected shape, the MyMemory API is used as a
	 * fallback. Only the first 500 characters of `text` are sent to MyMemory,
	 * so a fallback result may cover just the beginning of a longer chunk.
	 *
	 * @param text Plain text to translate.
	 * @param targetLang Target language code; it is percent-encoded before use.
	 * @returns The translated text.
	 * @throws Error when neither service returns a usable translation.
	 */
	async function translateChunk(text: string, targetLang: string): Promise<string> {
		// Encode the language code so it cannot inject extra query parameters.
		const lang = encodeURIComponent(targetLang);

		// Try Google Translate unofficial API
		try {
			const url = `https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=${lang}&dt=t&q=${encodeURIComponent(text)}`;
			const response = await fetch(url);
			if (response.ok) {
				const data: unknown = await response.json();
				if (Array.isArray(data) && Array.isArray(data[0])) {
					// data[0] is a list of segments whose first element is the translated piece.
					const segments: unknown[] = data[0];
					return segments
						.map((segment) => (Array.isArray(segment) ? segment[0] : undefined))
						.filter((piece): piece is string => typeof piece === 'string' && piece !== '')
						.join('');
				}
			}
		} catch {
			// fallthrough
		}

		// Fallback: MyMemory
		try {
			const url = `https://api.mymemory.translated.net/get?q=${encodeURIComponent(text.substring(0, 500))}&langpair=en|${lang}`;
			const response = await fetch(url);
			if (response.ok) {
				const data = await response.json();
				const translated: unknown = data?.responseData?.translatedText;
				if (data?.responseStatus === 200 && typeof translated === 'string') {
					return translated;
				}
			}
		} catch {
			// fallthrough
		}

		throw new Error('翻译失败，请稍后重试 / Translation failed');
	}

	// ==================== Fetch Paper By ID ====================
	/**
	 * Looks up a single paper through the arXiv query API.
	 *
	 * @param arxivId Bare arXiv id or an `http(s)://arxiv.org/abs/...` URL.
	 *   Surrounding whitespace and a trailing version suffix (`v2`) are ignored.
	 * @returns The first paper in the returned feed, or null when the feed has
	 *   no entries or the id is blank (in which case no request is made).
	 * @throws Error when the feed cannot be downloaded.
	 */
	export async function fetchPaperById(arxivId: string): Promise<Paper | null> {
		const cleanId = arxivId
			.trim()
			.replace(/^https?:\/\/arxiv\.org\/abs\//, '')
			.replace(/v\d+$/, '');
		// Nothing to look up: do not spend a rate-limited request on an empty id list.
		if (!cleanId) return null;

		return arxivLimiter.execute(async () => {
			const url = `https://export.arxiv.org/api/query?id_list=${encodeURIComponent(cleanId)}`;
			const xml = await fetchWithProxy(url);
			const { papers } = parseArxivAtom(xml);
			return papers[0] ?? null;
		});
	}

	// ==================== Helpers ====================
	/**
	 * Returns the text content of an HTML fragment with all markup removed.
	 *
	 * The fragment is parsed inside a separate document created with
	 * `document.implementation.createHTMLDocument`, not inside the page's own
	 * document. Markup fetched from third parties is therefore never attached
	 * to an element owned by the live page.
	 *
	 * @param html HTML fragment.
	 * @returns The fragment's text content, or '' when it has none.
	 */
	export function extractPlainText(html: string): string {
		const inertDoc = document.implementation.createHTMLDocument('');
		const container = inertDoc.createElement('div');
		container.innerHTML = html;
		return container.textContent || '';
	}