Spaces:
Build error
Build error
cleanup: use local project code as much as possible
Browse files
- src/api/crawler.ts +14 -4
- src/services/alt-text.ts +2 -2
- src/services/brave-search.ts +4 -4
- src/services/cf-browser-rendering.ts +3 -2
- src/services/curl.ts +5 -4
- src/services/errors.ts +6 -0
- src/services/geoip.ts +2 -2
- src/services/jsdom.ts +3 -3
- src/services/lm.ts +2 -2
- src/services/pdf-extract.ts +3 -3
- src/services/puppeteer.ts +2 -6
- src/services/robots-text.ts +4 -7
- src/services/serper-search.ts +4 -4
- src/services/snapshot-formatter.ts +3 -3
- thinapps-shared +1 -1
src/api/crawler.ts
CHANGED
|
@@ -32,14 +32,16 @@ import { GlobalLogger } from '../services/logger';
|
|
| 32 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 33 |
import { AsyncLocalContext } from '../services/async-context';
|
| 34 |
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
| 35 |
-
import {
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 38 |
import { ProxyProvider } from '../shared/services/proxy-provider';
|
| 39 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 40 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 41 |
import { RobotsTxtService } from '../services/robots-text';
|
| 42 |
-
import { ServiceBadAttemptError } from '../shared/lib/errors';
|
| 43 |
|
| 44 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 45 |
withIframe?: boolean | 'quoted';
|
|
@@ -758,7 +760,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 758 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 759 |
draftSnapshot.title ??= analyzed.title;
|
| 760 |
let fallbackProxyIsUsed = false;
|
| 761 |
-
if ((!crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) &&
|
|
|
|
|
|
|
| 762 |
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
| 763 |
if (!proxyLoaded.file) {
|
| 764 |
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
|
@@ -904,7 +908,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 904 |
}
|
| 905 |
this.threadLocal.set('retainImages', opts.retainImages);
|
| 906 |
this.threadLocal.set('noGfm', opts.noGfm);
|
| 907 |
-
this.threadLocal.set('DNT', Boolean(opts.doNotTrack))
|
| 908 |
|
| 909 |
const crawlOpts: ExtraScrappingOptions = {
|
| 910 |
proxyUrl: opts.proxyUrl,
|
|
@@ -1146,6 +1150,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 1146 |
}
|
| 1147 |
|
| 1148 |
@retryWith((err) => {
|
|
|
|
|
|
|
|
|
|
| 1149 |
if (err instanceof ServiceBadAttemptError) {
|
| 1150 |
// Keep trying
|
| 1151 |
return true;
|
|
@@ -1157,6 +1164,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 1157 |
return undefined;
|
| 1158 |
}, 3)
|
| 1159 |
async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) {
|
|
|
|
|
|
|
|
|
|
| 1160 |
const proxy = await this.proxyProvider.alloc(opts?.allocProxy);
|
| 1161 |
const r = await this.curlControl.sideLoad(url, {
|
| 1162 |
...opts,
|
|
|
|
| 32 |
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 33 |
import { AsyncLocalContext } from '../services/async-context';
|
| 34 |
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
| 35 |
+
import {
|
| 36 |
+
BudgetExceededError, InsufficientBalanceError,
|
| 37 |
+
SecurityCompromiseError, ServiceBadApproachError, ServiceBadAttemptError
|
| 38 |
+
} from '../services/errors';
|
| 39 |
|
| 40 |
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
| 41 |
import { ProxyProvider } from '../shared/services/proxy-provider';
|
| 42 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 43 |
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 44 |
import { RobotsTxtService } from '../services/robots-text';
|
|
|
|
| 45 |
|
| 46 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 47 |
withIframe?: boolean | 'quoted';
|
|
|
|
| 760 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 761 |
draftSnapshot.title ??= analyzed.title;
|
| 762 |
let fallbackProxyIsUsed = false;
|
| 763 |
+
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
| 764 |
+
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
| 765 |
+
) {
|
| 766 |
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
| 767 |
if (!proxyLoaded.file) {
|
| 768 |
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
|
|
|
| 908 |
}
|
| 909 |
this.threadLocal.set('retainImages', opts.retainImages);
|
| 910 |
this.threadLocal.set('noGfm', opts.noGfm);
|
| 911 |
+
this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
|
| 912 |
|
| 913 |
const crawlOpts: ExtraScrappingOptions = {
|
| 914 |
proxyUrl: opts.proxyUrl,
|
|
|
|
| 1150 |
}
|
| 1151 |
|
| 1152 |
@retryWith((err) => {
|
| 1153 |
+
if (err instanceof ServiceBadApproachError) {
|
| 1154 |
+
return false;
|
| 1155 |
+
}
|
| 1156 |
if (err instanceof ServiceBadAttemptError) {
|
| 1157 |
// Keep trying
|
| 1158 |
return true;
|
|
|
|
| 1164 |
return undefined;
|
| 1165 |
}, 3)
|
| 1166 |
async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) {
|
| 1167 |
+
if (opts?.allocProxy === 'none') {
|
| 1168 |
+
return this.curlControl.sideLoad(url, opts);
|
| 1169 |
+
}
|
| 1170 |
const proxy = await this.proxyProvider.alloc(opts?.allocProxy);
|
| 1171 |
const r = await this.curlControl.sideLoad(url, {
|
| 1172 |
...opts,
|
src/services/alt-text.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
-
import {
|
| 4 |
import { CanvasService } from '../shared/services/canvas';
|
| 5 |
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
| 6 |
import { ImgBrief } from './puppeteer';
|
|
@@ -16,7 +16,7 @@ export class AltTextService extends AsyncService {
|
|
| 16 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 17 |
|
| 18 |
constructor(
|
| 19 |
-
protected globalLogger:
|
| 20 |
protected imageInterrogator: ImageInterrogationManager,
|
| 21 |
protected canvasService: CanvasService,
|
| 22 |
protected asyncLocalContext: AsyncLocalContext
|
|
|
|
| 1 |
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
+
import { GlobalLogger } from './logger';
|
| 4 |
import { CanvasService } from '../shared/services/canvas';
|
| 5 |
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
| 6 |
import { ImgBrief } from './puppeteer';
|
|
|
|
| 16 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 17 |
|
| 18 |
constructor(
|
| 19 |
+
protected globalLogger: GlobalLogger,
|
| 20 |
protected imageInterrogator: ImageInterrogationManager,
|
| 21 |
protected canvasService: CanvasService,
|
| 22 |
protected asyncLocalContext: AsyncLocalContext
|
src/services/brave-search.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
-
import {
|
| 4 |
import { SecretExposer } from '../shared/services/secrets';
|
| 5 |
import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 6 |
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
| 7 |
-
import {
|
| 8 |
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
| 9 |
import type { Request, Response } from 'express';
|
| 10 |
import { BlackHoleDetector } from './blackhole-detector';
|
|
@@ -17,10 +17,10 @@ export class BraveSearchService extends AsyncService {
|
|
| 17 |
braveSearchHTTP!: BraveSearchHTTP;
|
| 18 |
|
| 19 |
constructor(
|
| 20 |
-
protected globalLogger:
|
| 21 |
protected secretExposer: SecretExposer,
|
| 22 |
protected geoipControl: GeoIPService,
|
| 23 |
-
protected threadLocal:
|
| 24 |
protected blackHoleDetector: BlackHoleDetector,
|
| 25 |
) {
|
| 26 |
super(...arguments);
|
|
|
|
| 1 |
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
+
import { GlobalLogger } from './logger';
|
| 4 |
import { SecretExposer } from '../shared/services/secrets';
|
| 5 |
import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 6 |
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
| 7 |
+
import { AsyncLocalContext } from './async-context';
|
| 8 |
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
| 9 |
import type { Request, Response } from 'express';
|
| 10 |
import { BlackHoleDetector } from './blackhole-detector';
|
|
|
|
| 17 |
braveSearchHTTP!: BraveSearchHTTP;
|
| 18 |
|
| 19 |
constructor(
|
| 20 |
+
protected globalLogger: GlobalLogger,
|
| 21 |
protected secretExposer: SecretExposer,
|
| 22 |
protected geoipControl: GeoIPService,
|
| 23 |
+
protected threadLocal: AsyncLocalContext,
|
| 24 |
protected blackHoleDetector: BlackHoleDetector,
|
| 25 |
) {
|
| 26 |
super(...arguments);
|
src/services/cf-browser-rendering.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
| 2 |
import { AsyncService } from 'civkit/async-service';
|
| 3 |
-
import {
|
|
|
|
| 4 |
import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare';
|
| 5 |
|
| 6 |
@singleton()
|
|
@@ -10,7 +11,7 @@ export class CFBrowserRendering extends AsyncService {
|
|
| 10 |
client!: CloudFlareHTTP;
|
| 11 |
|
| 12 |
constructor(
|
| 13 |
-
protected globalLogger:
|
| 14 |
protected secretExposer: SecretExposer,
|
| 15 |
) {
|
| 16 |
super(...arguments);
|
|
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
| 2 |
import { AsyncService } from 'civkit/async-service';
|
| 3 |
+
import { SecretExposer } from '../shared/services/secrets';
|
| 4 |
+
import { GlobalLogger } from './logger';
|
| 5 |
import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare';
|
| 6 |
|
| 7 |
@singleton()
|
|
|
|
| 11 |
client!: CloudFlareHTTP;
|
| 12 |
|
| 13 |
constructor(
|
| 14 |
+
protected globalLogger: GlobalLogger,
|
| 15 |
protected secretExposer: SecretExposer,
|
| 16 |
) {
|
| 17 |
super(...arguments);
|
src/services/curl.ts
CHANGED
|
@@ -5,9 +5,10 @@ import { Curl, CurlCode, CurlFeature, HeaderInfo } from 'node-libcurl';
|
|
| 5 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 6 |
|
| 7 |
import { ScrappingOptions } from './puppeteer';
|
| 8 |
-
import {
|
| 9 |
import { AssertionFailureError, FancyFile } from 'civkit';
|
| 10 |
-
import { ServiceBadAttemptError,
|
|
|
|
| 11 |
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
|
| 12 |
import { ZSTDDecompress } from 'simple-zstd';
|
| 13 |
import _ from 'lodash';
|
|
@@ -32,7 +33,7 @@ export class CurlControl extends AsyncService {
|
|
| 32 |
lifeCycleTrack = new WeakMap();
|
| 33 |
|
| 34 |
constructor(
|
| 35 |
-
protected globalLogger:
|
| 36 |
protected tempFileManager: TempFileManager,
|
| 37 |
protected asyncLocalContext: AsyncLocalContext,
|
| 38 |
) {
|
|
@@ -328,7 +329,7 @@ export class CurlControl extends AsyncService {
|
|
| 328 |
};
|
| 329 |
}
|
| 330 |
if (!location && cookieRedirects > 1) {
|
| 331 |
-
throw new
|
| 332 |
}
|
| 333 |
|
| 334 |
nextHopUrl = new URL(location || '', nextHopUrl);
|
|
|
|
| 5 |
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 6 |
|
| 7 |
import { ScrappingOptions } from './puppeteer';
|
| 8 |
+
import { GlobalLogger } from './logger';
|
| 9 |
import { AssertionFailureError, FancyFile } from 'civkit';
|
| 10 |
+
import { ServiceBadAttemptError, ServiceBadApproachError } from './errors';
|
| 11 |
+
import { TempFileManager } from '../services/temp-file';
|
| 12 |
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
|
| 13 |
import { ZSTDDecompress } from 'simple-zstd';
|
| 14 |
import _ from 'lodash';
|
|
|
|
| 33 |
lifeCycleTrack = new WeakMap();
|
| 34 |
|
| 35 |
constructor(
|
| 36 |
+
protected globalLogger: GlobalLogger,
|
| 37 |
protected tempFileManager: TempFileManager,
|
| 38 |
protected asyncLocalContext: AsyncLocalContext,
|
| 39 |
) {
|
|
|
|
| 329 |
};
|
| 330 |
}
|
| 331 |
if (!location && cookieRedirects > 1) {
|
| 332 |
+
throw new ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
|
| 333 |
}
|
| 334 |
|
| 335 |
nextHopUrl = new URL(location || '', nextHopUrl);
|
src/services/errors.ts
CHANGED
|
@@ -14,6 +14,12 @@ export class ServiceCrashedError extends ApplicationError { }
|
|
| 14 |
@StatusCode(50303)
|
| 15 |
export class ServiceNodeResourceDrainError extends ApplicationError { }
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
@StatusCode(40104)
|
| 18 |
export class EmailUnverifiedError extends ApplicationError { }
|
| 19 |
|
|
|
|
| 14 |
@StatusCode(50303)
|
| 15 |
export class ServiceNodeResourceDrainError extends ApplicationError { }
|
| 16 |
|
| 17 |
+
@StatusCode(50304)
|
| 18 |
+
export class ServiceBadAttemptError extends ApplicationError { }
|
| 19 |
+
|
| 20 |
+
@StatusCode(50305)
|
| 21 |
+
export class ServiceBadApproachError extends ServiceBadAttemptError { }
|
| 22 |
+
|
| 23 |
@StatusCode(40104)
|
| 24 |
export class EmailUnverifiedError extends ApplicationError { }
|
| 25 |
|
src/services/geoip.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { container, singleton } from 'tsyringe';
|
|
| 2 |
import fsp from 'fs/promises';
|
| 3 |
import { CityResponse, Reader } from 'maxmind';
|
| 4 |
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
|
| 5 |
-
import {
|
| 6 |
import path from 'path';
|
| 7 |
|
| 8 |
export enum GEOIP_SUPPORTED_LANGUAGES {
|
|
@@ -61,7 +61,7 @@ export class GeoIPService extends AsyncService {
|
|
| 61 |
mmdbCity!: Reader<CityResponse>;
|
| 62 |
|
| 63 |
constructor(
|
| 64 |
-
protected globalLogger:
|
| 65 |
) {
|
| 66 |
super(...arguments);
|
| 67 |
}
|
|
|
|
| 2 |
import fsp from 'fs/promises';
|
| 3 |
import { CityResponse, Reader } from 'maxmind';
|
| 4 |
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
|
| 5 |
+
import { GlobalLogger } from './logger';
|
| 6 |
import path from 'path';
|
| 7 |
|
| 8 |
export enum GEOIP_SUPPORTED_LANGUAGES {
|
|
|
|
| 61 |
mmdbCity!: Reader<CityResponse>;
|
| 62 |
|
| 63 |
constructor(
|
| 64 |
+
protected globalLogger: GlobalLogger,
|
| 65 |
) {
|
| 66 |
super(...arguments);
|
| 67 |
}
|
src/services/jsdom.ts
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
| 2 |
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
-
import {
|
| 4 |
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
| 5 |
import { Readability } from '@mozilla/readability';
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../services/threaded';
|
| 8 |
import type { ExtraScrappingOptions } from '../api/crawler';
|
| 9 |
import { tailwindClasses } from '../utils/tailwind-classes';
|
| 10 |
-
import { countGPTToken } from '../shared';
|
| 11 |
|
| 12 |
const pLinkedom = import('linkedom');
|
| 13 |
|
|
@@ -19,7 +19,7 @@ export class JSDomControl extends AsyncService {
|
|
| 19 |
linkedom!: Awaited<typeof pLinkedom>;
|
| 20 |
|
| 21 |
constructor(
|
| 22 |
-
protected globalLogger:
|
| 23 |
) {
|
| 24 |
super(...arguments);
|
| 25 |
}
|
|
|
|
| 1 |
import { container, singleton } from 'tsyringe';
|
| 2 |
import { AsyncService, marshalErrorLike } from 'civkit';
|
| 3 |
+
import { GlobalLogger } from './logger';
|
| 4 |
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
| 5 |
import { Readability } from '@mozilla/readability';
|
| 6 |
import TurndownService from 'turndown';
|
| 7 |
import { Threaded } from '../services/threaded';
|
| 8 |
import type { ExtraScrappingOptions } from '../api/crawler';
|
| 9 |
import { tailwindClasses } from '../utils/tailwind-classes';
|
| 10 |
+
import { countGPTToken } from '../shared/utils/openai';
|
| 11 |
|
| 12 |
const pLinkedom = import('linkedom');
|
| 13 |
|
|
|
|
| 19 |
linkedom!: Awaited<typeof pLinkedom>;
|
| 20 |
|
| 21 |
constructor(
|
| 22 |
+
protected globalLogger: GlobalLogger,
|
| 23 |
) {
|
| 24 |
super(...arguments);
|
| 25 |
}
|
src/services/lm.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { AsyncService } from 'civkit/async-service';
|
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
|
| 4 |
import { PageSnapshot } from './puppeteer';
|
| 5 |
-
import {
|
| 6 |
import _ from 'lodash';
|
| 7 |
import { AssertionFailureError } from 'civkit';
|
| 8 |
import { LLMManager } from '../shared/services/common-llm';
|
|
@@ -16,7 +16,7 @@ export class LmControl extends AsyncService {
|
|
| 16 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 17 |
|
| 18 |
constructor(
|
| 19 |
-
protected globalLogger:
|
| 20 |
protected commonLLM: LLMManager,
|
| 21 |
protected jsdomControl: JSDomControl,
|
| 22 |
) {
|
|
|
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
|
| 4 |
import { PageSnapshot } from './puppeteer';
|
| 5 |
+
import { GlobalLogger } from './logger';
|
| 6 |
import _ from 'lodash';
|
| 7 |
import { AssertionFailureError } from 'civkit';
|
| 8 |
import { LLMManager } from '../shared/services/common-llm';
|
|
|
|
| 16 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 17 |
|
| 18 |
constructor(
|
| 19 |
+
protected globalLogger: GlobalLogger,
|
| 20 |
protected commonLLM: LLMManager,
|
| 21 |
protected jsdomControl: JSDomControl,
|
| 22 |
) {
|
src/services/pdf-extract.ts
CHANGED
|
@@ -3,10 +3,10 @@ import { singleton } from 'tsyringe';
|
|
| 3 |
import _ from 'lodash';
|
| 4 |
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
| 5 |
import { AsyncService, HashManager } from 'civkit';
|
| 6 |
-
import {
|
| 7 |
import { PDFContent } from '../db/pdf';
|
| 8 |
import dayjs from 'dayjs';
|
| 9 |
-
import { FirebaseStorageBucketControl } from '../shared';
|
| 10 |
import { randomUUID } from 'crypto';
|
| 11 |
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
| 12 |
import path from 'path';
|
|
@@ -55,7 +55,7 @@ export class PDFExtractor extends AsyncService {
|
|
| 55 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 56 |
|
| 57 |
constructor(
|
| 58 |
-
protected globalLogger:
|
| 59 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 60 |
protected asyncLocalContext: AsyncLocalContext,
|
| 61 |
) {
|
|
|
|
| 3 |
import _ from 'lodash';
|
| 4 |
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
| 5 |
import { AsyncService, HashManager } from 'civkit';
|
| 6 |
+
import { GlobalLogger } from './logger';
|
| 7 |
import { PDFContent } from '../db/pdf';
|
| 8 |
import dayjs from 'dayjs';
|
| 9 |
+
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 10 |
import { randomUUID } from 'crypto';
|
| 11 |
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
| 12 |
import path from 'path';
|
|
|
|
| 55 |
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 56 |
|
| 57 |
constructor(
|
| 58 |
+
protected globalLogger: GlobalLogger,
|
| 59 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 60 |
protected asyncLocalContext: AsyncLocalContext,
|
| 61 |
) {
|
src/services/puppeteer.ts
CHANGED
|
@@ -2,14 +2,13 @@ import os from 'os';
|
|
| 2 |
import fs from 'fs';
|
| 3 |
import { container, singleton } from 'tsyringe';
|
| 4 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit';
|
| 5 |
-
import {
|
| 6 |
|
| 7 |
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page, Viewport } from 'puppeteer';
|
| 8 |
import type { Cookie } from 'set-cookie-parser';
|
| 9 |
import puppeteer from 'puppeteer-extra';
|
| 10 |
|
| 11 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
| 12 |
-
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 13 |
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
|
| 14 |
import { TimeoutError } from 'puppeteer';
|
| 15 |
import _ from 'lodash';
|
|
@@ -108,9 +107,6 @@ puppeteer.use(puppeteerBlockResources({
|
|
| 108 |
blockedTypes: new Set(['media']),
|
| 109 |
interceptResolutionPriority: 1,
|
| 110 |
}));
|
| 111 |
-
puppeteer.use(puppeteerPageProxy({
|
| 112 |
-
interceptResolutionPriority: 1,
|
| 113 |
-
}));
|
| 114 |
|
| 115 |
const SIMULATE_SCROLL = `
|
| 116 |
(function () {
|
|
@@ -472,7 +468,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 472 |
lifeCycleTrack = new WeakMap();
|
| 473 |
|
| 474 |
constructor(
|
| 475 |
-
protected globalLogger:
|
| 476 |
protected asyncLocalContext: AsyncLocalContext,
|
| 477 |
protected curlControl: CurlControl,
|
| 478 |
protected blackHoleDetector: BlackHoleDetector,
|
|
|
|
| 2 |
import fs from 'fs';
|
| 3 |
import { container, singleton } from 'tsyringe';
|
| 4 |
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit';
|
| 5 |
+
import { GlobalLogger } from './logger';
|
| 6 |
|
| 7 |
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page, Viewport } from 'puppeteer';
|
| 8 |
import type { Cookie } from 'set-cookie-parser';
|
| 9 |
import puppeteer from 'puppeteer-extra';
|
| 10 |
|
| 11 |
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|
|
|
| 12 |
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
|
| 13 |
import { TimeoutError } from 'puppeteer';
|
| 14 |
import _ from 'lodash';
|
|
|
|
| 107 |
blockedTypes: new Set(['media']),
|
| 108 |
interceptResolutionPriority: 1,
|
| 109 |
}));
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
const SIMULATE_SCROLL = `
|
| 112 |
(function () {
|
|
|
|
| 468 |
lifeCycleTrack = new WeakMap();
|
| 469 |
|
| 470 |
constructor(
|
| 471 |
+
protected globalLogger: GlobalLogger,
|
| 472 |
protected asyncLocalContext: AsyncLocalContext,
|
| 473 |
protected curlControl: CurlControl,
|
| 474 |
protected blackHoleDetector: BlackHoleDetector,
|
src/services/robots-text.ts
CHANGED
|
@@ -1,13 +1,12 @@
|
|
| 1 |
import { singleton } from 'tsyringe';
|
|
|
|
| 2 |
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
| 3 |
import { AsyncService } from 'civkit/async-service';
|
| 4 |
import { HashManager } from 'civkit/hash';
|
| 5 |
import { marshalErrorLike } from 'civkit/lang';
|
| 6 |
|
| 7 |
-
import {
|
| 8 |
-
import {
|
| 9 |
-
import { FirebaseStorageBucketControl } from '../shared';
|
| 10 |
-
import { URL } from 'url';
|
| 11 |
import { Threaded } from '../services/threaded';
|
| 12 |
|
| 13 |
|
|
@@ -18,10 +17,8 @@ export class RobotsTxtService extends AsyncService {
|
|
| 18 |
|
| 19 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 20 |
|
| 21 |
-
braveSearchHTTP!: BraveSearchHTTP;
|
| 22 |
-
|
| 23 |
constructor(
|
| 24 |
-
protected globalLogger:
|
| 25 |
protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
|
| 26 |
) {
|
| 27 |
super(...arguments);
|
|
|
|
| 1 |
import { singleton } from 'tsyringe';
|
| 2 |
+
import { URL } from 'url';
|
| 3 |
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
| 4 |
import { AsyncService } from 'civkit/async-service';
|
| 5 |
import { HashManager } from 'civkit/hash';
|
| 6 |
import { marshalErrorLike } from 'civkit/lang';
|
| 7 |
|
| 8 |
+
import { GlobalLogger } from './logger';
|
| 9 |
+
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
|
|
|
|
|
|
| 10 |
import { Threaded } from '../services/threaded';
|
| 11 |
|
| 12 |
|
|
|
|
| 17 |
|
| 18 |
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 19 |
|
|
|
|
|
|
|
| 20 |
constructor(
|
| 21 |
+
protected globalLogger: GlobalLogger,
|
| 22 |
protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
|
| 23 |
) {
|
| 24 |
super(...arguments);
|
src/services/serper-search.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
-
import {
|
| 4 |
import { SecretExposer } from '../shared/services/secrets';
|
| 5 |
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
| 6 |
-
import {
|
| 7 |
import { SerperGoogleHTTP, SerperSearchQueryParams, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search';
|
| 8 |
import { BlackHoleDetector } from './blackhole-detector';
|
| 9 |
import { Context } from './registry';
|
|
@@ -16,10 +16,10 @@ export class SerperSearchService extends AsyncService {
|
|
| 16 |
serperSearchHTTP!: SerperGoogleHTTP;
|
| 17 |
|
| 18 |
constructor(
|
| 19 |
-
protected globalLogger:
|
| 20 |
protected secretExposer: SecretExposer,
|
| 21 |
protected geoipControl: GeoIPService,
|
| 22 |
-
protected threadLocal:
|
| 23 |
protected blackHoleDetector: BlackHoleDetector,
|
| 24 |
) {
|
| 25 |
super(...arguments);
|
|
|
|
| 1 |
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
|
| 2 |
import { singleton } from 'tsyringe';
|
| 3 |
+
import { GlobalLogger } from './logger';
|
| 4 |
import { SecretExposer } from '../shared/services/secrets';
|
| 5 |
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
| 6 |
+
import { AsyncLocalContext } from './async-context';
|
| 7 |
import { SerperGoogleHTTP, SerperSearchQueryParams, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search';
|
| 8 |
import { BlackHoleDetector } from './blackhole-detector';
|
| 9 |
import { Context } from './registry';
|
|
|
|
| 16 |
serperSearchHTTP!: SerperGoogleHTTP;
|
| 17 |
|
| 18 |
constructor(
|
| 19 |
+
protected globalLogger: GlobalLogger,
|
| 20 |
protected secretExposer: SecretExposer,
|
| 21 |
protected geoipControl: GeoIPService,
|
| 22 |
+
protected threadLocal: AsyncLocalContext,
|
| 23 |
protected blackHoleDetector: BlackHoleDetector,
|
| 24 |
) {
|
| 25 |
super(...arguments);
|
src/services/snapshot-formatter.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { randomUUID } from 'crypto';
|
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
|
| 4 |
import TurndownService, { Filter, Rule } from 'turndown';
|
| 5 |
-
import {
|
| 6 |
import { PageSnapshot } from './puppeteer';
|
| 7 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 8 |
import { AsyncContext } from '../shared/services/async-context';
|
|
@@ -16,7 +16,7 @@ import { STATUS_CODES } from 'http';
|
|
| 16 |
import type { CrawlerOptions } from '../dto/crawler-options';
|
| 17 |
import { readFile } from 'fs/promises';
|
| 18 |
import { pathToFileURL } from 'url';
|
| 19 |
-
import { countGPTToken } from '../shared';
|
| 20 |
|
| 21 |
|
| 22 |
export interface FormattedPage {
|
|
@@ -82,7 +82,7 @@ export class SnapshotFormatter extends AsyncService {
|
|
| 82 |
gfmNoTable = [highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems];
|
| 83 |
|
| 84 |
constructor(
|
| 85 |
-
protected globalLogger:
|
| 86 |
protected jsdomControl: JSDomControl,
|
| 87 |
protected altTextService: AltTextService,
|
| 88 |
protected pdfExtractor: PDFExtractor,
|
|
|
|
| 2 |
import { container, singleton } from 'tsyringe';
|
| 3 |
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
|
| 4 |
import TurndownService, { Filter, Rule } from 'turndown';
|
| 5 |
+
import { GlobalLogger } from './logger';
|
| 6 |
import { PageSnapshot } from './puppeteer';
|
| 7 |
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
| 8 |
import { AsyncContext } from '../shared/services/async-context';
|
|
|
|
| 16 |
import type { CrawlerOptions } from '../dto/crawler-options';
|
| 17 |
import { readFile } from 'fs/promises';
|
| 18 |
import { pathToFileURL } from 'url';
|
| 19 |
+
import { countGPTToken } from '../shared/utils/openai';
|
| 20 |
|
| 21 |
|
| 22 |
export interface FormattedPage {
|
|
|
|
| 82 |
gfmNoTable = [highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems];
|
| 83 |
|
| 84 |
constructor(
|
| 85 |
+
protected globalLogger: GlobalLogger,
|
| 86 |
protected jsdomControl: JSDomControl,
|
| 87 |
protected altTextService: AltTextService,
|
| 88 |
protected pdfExtractor: PDFExtractor,
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 755639081df7640733bb5f704460892a1a9059e7
|