Spaces:
Build error: a bare `()` appears before the class declaration — the decorator name was lost, leaving only the call parentheses.
Build error: a bare `()` appears before `assertAccessAllowed` — likewise a decorator invocation with the name missing.
| import { singleton } from 'tsyringe'; | |
| import { URL } from 'url'; | |
| import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc'; | |
| import { AsyncService } from 'civkit/async-service'; | |
| import { HashManager } from 'civkit/hash'; | |
| import { marshalErrorLike } from 'civkit/lang'; | |
| import { GlobalLogger } from './logger'; | |
| import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; | |
| import { Threaded } from '../services/threaded'; | |
// Shared md5 hasher (hex digest); used below to derive cache-object keys from URL origins.
export const md5Hasher = new HashManager('md5', 'hex');
| () | |
| export class RobotsTxtService extends AsyncService { | |
| logger = this.globalLogger.child({ service: this.constructor.name }); | |
| constructor( | |
| protected globalLogger: GlobalLogger, | |
| protected firebaseStorageBucketControl: FirebaseStorageBucketControl, | |
| ) { | |
| super(...arguments); | |
| } | |
| override async init() { | |
| await this.dependencyReady(); | |
| this.emit('ready'); | |
| } | |
| async getCachedRobotTxt(origin: string) { | |
| const digest = md5Hasher.hash(origin.toLowerCase()); | |
| const cacheLoc = `robots-txt/${digest}`; | |
| let buff; | |
| buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined); | |
| if (buff) { | |
| return buff.toString(); | |
| } | |
| const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) }); | |
| if (!r.ok) { | |
| throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`); | |
| } | |
| buff = Buffer.from(await r.arrayBuffer()); | |
| this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, { | |
| contentType: 'text/plain' | |
| }).catch((err) => { | |
| this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) }); | |
| }); | |
| return buff.toString(); | |
| } | |
| () | |
| async assertAccessAllowed(url: URL, inputMyUa = '*') { | |
| let robotTxt: string = ''; | |
| try { | |
| robotTxt = await this.getCachedRobotTxt(url.origin); | |
| } catch (err) { | |
| if (err instanceof DownstreamServiceFailureError) { | |
| // Remote server is reachable but cannot provide a robot.txt; this is treated as public access | |
| return true; | |
| } | |
| throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`); | |
| } | |
| const myUa = inputMyUa.toLowerCase(); | |
| const lines = robotTxt.split(/\r?\n/g); | |
| let currentUa = myUa || '*'; | |
| let uaLine = 'User-Agent: *'; | |
| const pathNormalized = `${url.pathname}?`; | |
| for (const line of lines) { | |
| const trimmed = line.trim(); | |
| if (trimmed.startsWith('#') || !trimmed) { | |
| continue; | |
| } | |
| const [k, ...rest] = trimmed.split(':'); | |
| const key = k.trim().toLowerCase(); | |
| const value = rest.join(':').trim(); | |
| if (key === 'user-agent') { | |
| currentUa = value.toLowerCase(); | |
| if (value === '*') { | |
| currentUa = myUa; | |
| } | |
| uaLine = line; | |
| continue; | |
| } | |
| if (currentUa !== myUa) { | |
| continue; | |
| } | |
| if (key === 'disallow') { | |
| if (!value) { | |
| return true; | |
| } | |
| if (value.includes('*')) { | |
| const [head, tail] = value.split('*'); | |
| if (url.pathname.startsWith(head) && url.pathname.endsWith(tail)) { | |
| throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`); | |
| } | |
| } else if (pathNormalized.startsWith(value)) { | |
| throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`); | |
| } | |
| continue; | |
| } | |
| if (key === 'allow') { | |
| if (!value) { | |
| return true; | |
| } | |
| if (pathNormalized.startsWith(value)) { | |
| return true; | |
| } | |
| continue; | |
| } | |
| } | |
| return true; | |
| } | |
| } | |