Spaces:

nateraw
/

jupyterlab-inference-dev

Runtime error

App Files Files Community

jupyterlab-inference-dev / proto /generate.proto

nateraw

🍻 cheers

c7a96cd over 1 year ago

raw

history blame

4.43 kB

	/// Protobuf schema for TextGenerationService and its request/response messages.
	syntax = "proto3";

	package generate.v1;

	/// RPC surface of the text generation service: model info, shard discovery,
	/// batch-cache management, prefill/decode steps and health checking.
	service TextGenerationService {
	  /// Returns model information (see InfoResponse)
	  rpc Info(InfoRequest) returns (InfoResponse);
	  /// Service discovery: returns the URLs of the other shards
	  rpc ServiceDiscovery(ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
	  /// Empties the batch cache
	  rpc ClearCache(ClearCacheRequest) returns (ClearCacheResponse);
	  /// Removes requests from a cached batch, keeping those listed in the request
	  rpc FilterBatch(FilterBatchRequest) returns (FilterBatchResponse);
	  /// Prefills a batch and decodes its first token
	  rpc Prefill(PrefillRequest) returns (PrefillResponse);
	  /// Decodes a token for a list of prefilled batches
	  rpc Decode(DecodeRequest) returns (DecodeResponse);
	  /// Health check
	  rpc Health(HealthRequest) returns (HealthResponse);
	}

	/// Empty request for the Health RPC
	message HealthRequest {}
	/// Empty response for the Health RPC
	message HealthResponse {}

	/// Empty request for the Info RPC
	message InfoRequest {}

	/// Model information returned by the Info RPC
	message InfoResponse {
	  /// Whether padding is required.
	  /// NOTE(review): what gets padded is not visible in this file - confirm.
	  bool requires_padding = 1;
	  /// Data type, as a free-form string.
	  /// NOTE(review): presumably the model's tensor dtype - confirm accepted values.
	  string dtype = 2;
	  /// Device type, as a free-form string.
	  /// NOTE(review): accepted values are not listed in this file - confirm.
	  string device_type = 3;
	}

	/// Empty request for the ServiceDiscovery RPC
	message ServiceDiscoveryRequest {}

	/// Response of the ServiceDiscovery RPC
	message ServiceDiscoveryResponse {
	  /// URLs of the other shards
	  repeated string urls = 1;
	}

	/// Request of the ClearCache RPC
	message ClearCacheRequest {
	  /// Optional batch id; declared `optional`, so an unset id is
	  /// distinguishable from id 0.
	  optional uint64 id = 1;
	}

	/// Empty response for the ClearCache RPC
	message ClearCacheResponse {}

	/// Parameters controlling how the next token is chosen.
	/// None of these fields is `optional`, so a zero/false value cannot be
	/// told apart from an unset field on the wire.
	message NextTokenChooserParameters {
	  /// Exponential scaling of the output probability distribution
	  float temperature = 1;
	  /// Restrict sampling to the k highest-probability tokens
	  uint32 top_k = 2;
	  /// Restrict sampling to the top tokens whose probabilities sum up to
	  /// this cut-off
	  float top_p = 3;
	  /// Probability cut-off for typical sampling.
	  /// NOTE(review): the previous comment was a copy of the top_p one;
	  /// exact semantics are not visible in this file - confirm.
	  float typical_p = 4;
	  /// Apply sampling on the logits
	  bool do_sample = 5;
	  /// Random seed for sampling
	  uint64 seed = 6;
	  /// Repetition penalty
	  float repetition_penalty = 7;
	  /// Token watermarking using "A Watermark for Large Language Models"
	  bool watermark = 8;
	}

	/// Parameters deciding when generation stops
	message StoppingCriteriaParameters {
	  /// Maximum number of generated tokens
	  uint32 max_new_tokens = 1;
	  /// Optional stopping sequences (an empty list means none were given)
	  repeated string stop_sequences = 2;
	  /// Ignore the end-of-sequence token; used for benchmarking
	  bool ignore_eos_token = 3;
	}

	/// A single generation request, as carried inside a Batch
	message Request {
	  /// Request ID
	  uint64 id = 1;
	  /// Input text used as the generation context
	  string inputs = 2;
	  /// Context truncation.
	  /// NOTE(review): the unit (presumably a token count) is not stated in
	  /// this file - confirm.
	  uint32 truncate = 3;
	  /// Next-token chooser parameters
	  NextTokenChooserParameters parameters = 4;
	  /// Stopping criteria parameters
	  StoppingCriteriaParameters stopping_parameters = 5;
	}

	/// A group of requests processed together
	message Batch {
	  /// Batch ID
	  uint64 id = 1;
	  /// Individual requests
	  repeated Request requests = 2;
	  /// Batch size; expected to equal len(requests)
	  uint32 size = 3;
	  /// Maximum number of tokens this batch will grow to
	  uint32 max_tokens = 4;
	}

	/// Why generation of a text finished.
	/// NOTE(review): value 0 carries a real meaning (LENGTH), so an unset
	/// finish_reason reads as LENGTH. Renumbering would break the wire
	/// contract, so this is only flagged here.
	enum FinishReason {
	  /// Presumably: a length limit was reached - confirm against the server
	  FINISH_REASON_LENGTH = 0;
	  /// Presumably: the end-of-sequence token was generated - confirm
	  FINISH_REASON_EOS_TOKEN = 1;
	  /// Presumably: one of the stop sequences was generated - confirm
	  FINISH_REASON_STOP_SEQUENCE = 2;
	}

	/// Final output of a finished generation
	message GeneratedText {
	  /// Output text
	  string text = 1;
	  /// Number of generated tokens
	  uint32 generated_tokens = 2;
	  /// Reason the generation finished
	  FinishReason finish_reason = 3;
	  /// Seed; declared `optional`, so an unset seed is distinguishable
	  /// from seed 0
	  optional uint64 seed = 4;
	}

	/// Tokens of the prefill step.
	/// NOTE(review): the three lists look like parallel arrays (entry i of
	/// each describing the same token) - confirm against the producer.
	message PrefillTokens {
	  /// Prefill token IDs
	  repeated uint32 ids = 1;
	  /// Prefill log-probabilities
	  repeated float logprobs = 2;
	  /// Prefill token texts
	  repeated string texts = 3;
	}

	/// One generated token for one request, plus optional prefill tokens and
	/// the complete generated text
	message Generation {
	  /// ID of the request this generation belongs to
	  uint64 request_id = 1;
	  /// Prefill tokens (optional)
	  PrefillTokens prefill_tokens = 2;
	  /// ID of the generated token
	  uint32 token_id = 3;
	  /// Log-probability of the generated token
	  float token_logprob = 4;
	  /// Text of the generated token
	  string token_text = 5;
	  /// Whether the generated token is a special token
	  bool token_is_special = 6;
	  /// Complete generated text; message-typed, so it may be left unset
	  GeneratedText generated_text = 7;
	}

	/// Request of the FilterBatch RPC
	message FilterBatchRequest {
	  /// ID of the batch to filter
	  uint64 batch_id = 1;
	  /// Requests to keep
	  repeated Request keep_requests = 2;
	}

	/// Response of the FilterBatch RPC
	message FilterBatchResponse {
	  /// Filtered batch (cached)
	  Batch batch = 1;
	}


	/// Request of the Prefill RPC
	message PrefillRequest {
	  /// Batch to prefill
	  Batch batch = 1;
	}

	/// Response of the Prefill RPC
	message PrefillResponse {
	  /// Generations produced by the prefill step
	  repeated Generation generations = 1;
	  /// Next batch (cached); may be unset
	  optional Batch batch = 2;
	}

	/// Request of the Decode RPC
	message DecodeRequest {
	  /// Cached batches to decode
	  repeated Batch batches = 1;
	}

	/// Response of the Decode RPC
	message DecodeResponse {
	  /// Generations produced by the decode step
	  repeated Generation generations = 1;
	  /// Next batch (cached); may be unset
	  optional Batch batch = 2;
	}