yessenzhar committed on
Commit
a83b588
1 Parent(s): 210dc9f

add smaller files

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pyc
ensemble/1/.tmp ADDED
File without changes
ensemble/config.pbtxt ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "ensemble"
28
+ platform: "ensemble"
29
+ max_batch_size: 128
30
+ input [
31
+ {
32
+ name: "text_input"
33
+ data_type: TYPE_STRING
34
+ dims: [ -1 ]
35
+ },
36
+ {
37
+ name: "max_tokens"
38
+ data_type: TYPE_UINT32
39
+ dims: [ -1 ]
40
+ },
41
+ {
42
+ name: "bad_words"
43
+ data_type: TYPE_STRING
44
+ dims: [ -1 ]
45
+ },
46
+ {
47
+ name: "stop_words"
48
+ data_type: TYPE_STRING
49
+ dims: [ -1 ]
50
+ },
51
+ {
52
+ name: "end_id"
53
+ data_type: TYPE_UINT32
54
+ dims: [ 1 ]
55
+ optional: true
56
+ },
57
+ {
58
+ name: "pad_id"
59
+ data_type: TYPE_UINT32
60
+ dims: [ 1 ]
61
+ optional: true
62
+ },
63
+ {
64
+ name: "top_k"
65
+ data_type: TYPE_UINT32
66
+ dims: [ 1 ]
67
+ optional: true
68
+ },
69
+ {
70
+ name: "top_p"
71
+ data_type: TYPE_FP32
72
+ dims: [ 1 ]
73
+ optional: true
74
+ },
75
+ {
76
+ name: "temperature"
77
+ data_type: TYPE_FP32
78
+ dims: [ 1 ]
79
+ optional: true
80
+ },
81
+ {
82
+ name: "length_penalty"
83
+ data_type: TYPE_FP32
84
+ dims: [ 1 ]
85
+ optional: true
86
+ },
87
+ {
88
+ name: "repetition_penalty"
89
+ data_type: TYPE_FP32
90
+ dims: [ 1 ]
91
+ optional: true
92
+ },
93
+ {
94
+ name: "min_length"
95
+ data_type: TYPE_UINT32
96
+ dims: [ 1 ]
97
+ optional: true
98
+ },
99
+ {
100
+ name: "presence_penalty"
101
+ data_type: TYPE_FP32
102
+ dims: [ 1 ]
103
+ optional: true
104
+ },
105
+ {
106
+ name: "random_seed"
107
+ data_type: TYPE_UINT64
108
+ dims: [ 1 ]
109
+ optional: true
110
+ },
111
+ {
112
+ name: "beam_width"
113
+ data_type: TYPE_UINT32
114
+ dims: [ 1 ]
115
+ optional: true
116
+ },
117
+ {
118
+ name: "stream"
119
+ data_type: TYPE_BOOL
120
+ dims: [ 1 ]
121
+ optional: true
122
+ }
123
+ ]
124
+ output [
125
+ {
126
+ name: "text_output"
127
+ data_type: TYPE_STRING
128
+ dims: [ -1, -1 ]
129
+ },
130
+ {
131
+ name: "output_tokens"
132
+ data_type: TYPE_UINT32
133
+ dims: [ -1 ]
134
+ }
135
+ ]
136
+ ensemble_scheduling {
137
+ step [
138
+ {
139
+ model_name: "preprocessing"
140
+ model_version: -1
141
+ input_map {
142
+ key: "QUERY"
143
+ value: "text_input"
144
+ }
145
+ input_map {
146
+ key: "REQUEST_OUTPUT_LEN"
147
+ value: "max_tokens"
148
+ }
149
+ input_map {
150
+ key: "BAD_WORDS_DICT"
151
+ value: "bad_words"
152
+ }
153
+ input_map {
154
+ key: "STOP_WORDS_DICT"
155
+ value: "stop_words"
156
+ }
157
+ output_map {
158
+ key: "REQUEST_INPUT_LEN"
159
+ value: "_REQUEST_INPUT_LEN"
160
+ }
161
+ output_map {
162
+ key: "INPUT_ID"
163
+ value: "_INPUT_ID"
164
+ }
165
+ output_map {
166
+ key: "REQUEST_OUTPUT_LEN"
167
+ value: "_REQUEST_OUTPUT_LEN"
168
+ }
169
+ },
170
+ {
171
+ model_name: "tensorrt_llm"
172
+ model_version: -1
173
+ input_map {
174
+ key: "input_ids"
175
+ value: "_INPUT_ID"
176
+ }
177
+ input_map {
178
+ key: "input_lengths"
179
+ value: "_REQUEST_INPUT_LEN"
180
+ }
181
+ input_map {
182
+ key: "request_output_len"
183
+ value: "_REQUEST_OUTPUT_LEN"
184
+ }
185
+ input_map {
186
+ key: "end_id"
187
+ value: "end_id"
188
+ }
189
+ input_map {
190
+ key: "pad_id"
191
+ value: "pad_id"
192
+ }
193
+ input_map {
194
+ key: "runtime_top_k"
195
+ value: "top_k"
196
+ }
197
+ input_map {
198
+ key: "runtime_top_p"
199
+ value: "top_p"
200
+ }
201
+ input_map {
202
+ key: "temperature"
203
+ value: "temperature"
204
+ }
205
+ input_map {
206
+ key: "len_penalty"
207
+ value: "length_penalty"
208
+ }
209
+ input_map {
210
+ key: "repetition_penalty"
211
+ value: "repetition_penalty"
212
+ }
213
+ input_map {
214
+ key: "min_length"
215
+ value: "min_length"
216
+ }
217
+ input_map {
218
+ key: "presence_penalty"
219
+ value: "presence_penalty"
220
+ }
221
+ input_map {
222
+ key: "random_seed"
223
+ value: "random_seed"
224
+ }
225
+ input_map {
226
+ key: "beam_width"
227
+ value: "beam_width"
228
+ }
229
+ input_map {
230
+ key: "streaming"
231
+ value: "stream"
232
+ }
233
+ output_map {
234
+ key: "output_ids"
235
+ value: "_TOKENS_BATCH"
236
+ }
237
+ },
238
+ {
239
+ model_name: "postprocessing"
240
+ model_version: -1
241
+ input_map {
242
+ key: "TOKENS_BATCH"
243
+ value: "_TOKENS_BATCH"
244
+ }
245
+ output_map {
246
+ key: "OUTPUT"
247
+ value: "text_output"
248
+ }
249
+ output_map {
250
+ key: "OUTPUT_LENS"
251
+ value: "output_tokens"
252
+ }
253
+ }
254
+ ]
255
+ }
postprocessing/1/model.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+
29
+ import numpy as np
30
+ import triton_python_backend_utils as pb_utils
31
+ from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
32
+
33
+
34
class TritonPythonModel:
    """Triton Python backend model that detokenizes TensorRT-LLM output.

    Converts batches of token ids (input tensor ``TOKENS_BATCH``) back into
    UTF-8 text (output ``OUTPUT``) and per-entry token counts
    (``OUTPUT_LENS``).  The class must be named exactly
    ``TritonPythonModel`` for the Python backend to load it.
    """

    def initialize(self, args):
        """Called only once, when the model is being loaded.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. The dictionary keys and values are:
            * model_config: A JSON string containing the model configuration
            * model_instance_kind: A string containing model instance kind
            * model_instance_device_id: A string containing model instance device ID
            * model_repository: Model repository path
            * model_version: Model version
            * model_name: Model name

        Raises
        ------
        AttributeError
            If the configured ``tokenizer_type`` parameter is not one of
            't5', 'auto' or 'llama'.
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']
        tokenizer_type = model_config['parameters']['tokenizer_type'][
            'string_value']

        if tokenizer_type == 't5':
            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                         padding_side='left')
        elif tokenizer_type == 'auto':
            self.tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_dir, padding_side='left')
        elif tokenizer_type == 'llama':
            self.tokenizer = LlamaTokenizer.from_pretrained(
                tokenizer_dir, legacy=False, padding_side='left')
        else:
            raise AttributeError(
                f'Unexpected tokenizer type: {tokenizer_type}')
        # Reuse EOS as the pad token so padding decodes harmlessly.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self._init_token_map()

        # Parse model output configs and convert Triton type strings to
        # numpy dtypes used when building the output tensors.
        output_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT")
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])
        output_lens_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT_LENS")
        self.output_lens_dtype = pb_utils.triton_string_to_numpy(
            output_lens_config['data_type'])

    def _init_token_map(self):
        """Build a token-id -> token-string lookup table from the vocab.

        The table is sized to the largest token id rather than
        ``len(vocab)``: token ids are not guaranteed to be a dense
        0..len(vocab)-1 range (e.g. with added special tokens), and the
        original len-based sizing could raise IndexError on assignment.
        """
        vocab = self.tokenizer.get_vocab()
        size = max(vocab.values()) + 1 if vocab else 0
        self.token_map = [None] * size
        for token, token_id in vocab.items():
            self.token_map[token_id] = token

        # Report any holes in the id space (ids with no token string).
        for i, token in enumerate(self.token_map):
            if token is None:
                print("error %s" % i)

    def execute(self, requests):
        """Detokenize every request's TOKENS_BATCH into text.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest.

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse, exactly one per request
            (required by the Python backend contract).
        """
        responses = []

        # Every Python backend must iterate over each request and create
        # a pb_utils.InferenceResponse for it.
        for request in requests:
            # Get input tensors
            tokens_batch = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH').as_numpy()

            # Postprocess output data.
            outputs, output_lens = self._postprocessing(tokens_batch)

            # Wrap results in pb_utils.Tensor objects, cast to the dtypes
            # declared in the model config.
            output_tensor = pb_utils.Tensor(
                'OUTPUT',
                np.array(outputs).astype(self.output_dtype))
            output_lens_tensor = pb_utils.Tensor(
                'OUTPUT_LENS',
                np.array(output_lens).astype(self.output_lens_dtype))

            # An error can be reported here instead via
            # pb_utils.InferenceResponse(output_tensors=...,
            #                            TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[output_tensor, output_lens_tensor])
            responses.append(inference_response)

        # Length of `responses` matches the length of `requests`.
        return responses

    def finalize(self):
        """Called only once, when the model is being unloaded."""
        print('Cleaning up...')

    def _single_token_decode(self, token):
        """Decode one token id, turning the SentencePiece leading-space
        marker ('▁') back into a real space character."""
        st = self.token_map[token]
        if st[0] == '▁':
            return " " + st[1:]
        return st

    def _postprocessing(self, tokens_batch):
        """Decode a nested token-id batch into bytes.

        Parameters
        ----------
        tokens_batch : numpy.ndarray
            Nested ids; iterated as batch entry -> beam -> tokens
            (assumed [batch, beam, tokens] — TODO confirm against the
            tensorrt_llm output_ids layout).

        Returns
        -------
        tuple
            (outputs, output_lens): UTF-8 encoded text per beam, and the
            total token count per batch entry.
        """
        outputs = []
        output_lens = []
        for beam_tokens in tokens_batch:
            total_len = 0
            for tokens in beam_tokens:
                if len(tokens) == 1:
                    # Single-token (streaming) chunks bypass full decode so
                    # the leading-space marker is preserved.
                    output = self._single_token_decode(tokens[0])
                else:
                    output = self.tokenizer.decode(tokens)
                outputs.append(output.encode('utf8'))
                total_len += len(tokens)
            output_lens.append(total_len)
        return outputs, output_lens
postprocessing/config.pbtxt ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "postprocessing"
28
+ backend: "python"
29
+ max_batch_size: 128
30
+ input [
31
+ {
32
+ name: "TOKENS_BATCH"
33
+ data_type: TYPE_INT32
34
+ dims: [ -1, -1 ]
35
+ }
36
+ ]
37
+ output [
38
+ {
39
+ name: "OUTPUT"
40
+ data_type: TYPE_STRING
41
+ dims: [ -1, -1 ]
42
+ },
43
+ {
44
+ name: "OUTPUT_LENS"
45
+ data_type: TYPE_UINT32
46
+ dims: [ -1 ]
47
+ }
48
+ ]
49
+
50
+ parameters {
51
+ key: "tokenizer_dir"
52
+ value: {
53
+ string_value: "/data/tgi-data/orig_llama"
54
+ }
55
+ }
56
+
57
+ parameters {
58
+ key: "tokenizer_type"
59
+ value: {
60
+ string_value: "llama"
61
+ }
62
+ }
63
+
64
+ instance_group [
65
+ {
66
+ count: 1
67
+ kind: KIND_CPU
68
+ }
69
+ ]
preprocessing/1/model.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import csv
28
+ import json
29
+ from typing import List
30
+
31
+ import numpy as np
32
+ import torch
33
+ import triton_python_backend_utils as pb_utils
34
+ from torch.nn.utils.rnn import pad_sequence
35
+ from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
36
+
37
+
38
class TritonPythonModel:
    """Triton Python backend model that tokenizes requests for TensorRT-LLM.

    Turns ``QUERY`` strings into padded ``INPUT_ID`` tensors plus lengths,
    and converts bad-word / stop-word strings into the flat-ids + offsets
    layout the engine expects.  The class must be named exactly
    ``TritonPythonModel`` for the Python backend to load it.
    """

    def initialize(self, args):
        """Called only once, when the model is being loaded.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. The dictionary keys and values are:
            * model_config: A JSON string containing the model configuration
            * model_instance_kind: A string containing model instance kind
            * model_instance_device_id: A string containing model instance device ID
            * model_repository: Model repository path
            * model_version: Model version
            * model_name: Model name

        Raises
        ------
        AttributeError
            If the configured ``tokenizer_type`` parameter is not one of
            't5', 'auto' or 'llama'.
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']
        tokenizer_type = model_config['parameters']['tokenizer_type'][
            'string_value']

        if tokenizer_type == 't5':
            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                         padding_side='left')
        elif tokenizer_type == 'auto':
            self.tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_dir, padding_side='left')
        elif tokenizer_type == 'llama':
            self.tokenizer = LlamaTokenizer.from_pretrained(
                tokenizer_dir, legacy=False, padding_side='left')
        else:
            raise AttributeError(
                f'Unexpected tokenizer type: {tokenizer_type}')
        # Reuse EOS as the pad token; pad_id below is its first encoded id.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token,
                                            add_special_tokens=False)[0]

        # Parse model output configs and convert Triton type strings to
        # numpy dtypes, stored as e.g. self.input_id_dtype.
        input_names = [
            "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS"
        ]
        for input_name in input_names:
            setattr(
                self,
                input_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(
                        model_config, input_name)['data_type']))

    def execute(self, requests):
        """Tokenize every request's QUERY and word-list inputs.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest.

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse, exactly one per request
            (required by the Python backend contract).
        """
        responses = []

        # Every Python backend must iterate over each request and create
        # a pb_utils.InferenceResponse for it.
        for request in requests:
            # Get input tensors
            query = pb_utils.get_input_tensor_by_name(request,
                                                      'QUERY').as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, 'REQUEST_OUTPUT_LEN').as_numpy()

            bad_words_dict = pb_utils.get_input_tensor_by_name(
                request, 'BAD_WORDS_DICT').as_numpy()
            stop_words_dict = pb_utils.get_input_tensor_by_name(
                request, 'STOP_WORDS_DICT').as_numpy()

            # Preprocess input data.
            input_id, request_input_len = self._create_request(query)
            bad_words = self._to_word_list_format(bad_words_dict)
            stop_words = self._to_word_list_format(stop_words_dict)

            # Wrap results in pb_utils.Tensor objects, cast to the dtypes
            # declared in the model config.
            input_id_tensor = pb_utils.Tensor(
                'INPUT_ID',
                np.array(input_id).astype(self.input_id_dtype))
            request_input_len_tensor = pb_utils.Tensor(
                'REQUEST_INPUT_LEN',
                np.array(request_input_len).astype(
                    self.request_input_len_dtype))
            request_output_len_tensor = pb_utils.Tensor(
                'REQUEST_OUTPUT_LEN', request_output_len)
            bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
            stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                    stop_words)

            # An error can be reported here instead via
            # pb_utils.InferenceResponse(output_tensors=...,
            #                            TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(output_tensors=[
                input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor,
                request_input_len_tensor, request_output_len_tensor
            ])
            responses.append(inference_response)

        # Length of `responses` matches the length of `requests`.
        return responses

    def finalize(self):
        """Called only once, when the model is being unloaded."""
        print('Cleaning up...')

    def _create_request(self, query):
        """Tokenize a batch of query strings.

        Parameters
        ----------
        query : numpy.ndarray
            Batch of strings (2D array of bytes; one string per row).

        Returns
        -------
        tuple
            (start_ids, start_lengths): a [batch, max_len] id tensor padded
            with self.pad_id, and a [batch, 1] tensor of original lengths.
        """
        start_ids = [
            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
            for s in query
        ]
        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])

        start_ids = pad_sequence(start_ids,
                                 batch_first=True,
                                 padding_value=self.pad_id)

        return start_ids, start_lengths

    def _to_word_list_format(self, word_dict: List[List[str]]):
        '''
        format of word_dict
            len(word_dict) should be same to batch_size
            word_dict[i] means the words for batch i
            len(word_dict[i]) must be 1, which means it only contains 1 string
            This string can contains several sentences and split by ",".
            For example, if word_dict[2] = " I am happy, I am sad", then this function will return
            the ids for two short sentences " I am happy" and " I am sad".

        Returns a numpy int32 array of shape [batch, 2, pad_to] where
        [i, 0, :] are the concatenated token ids for batch entry i and
        [i, 1, :] are the cumulative offsets (padded with -1).
        '''
        assert self.tokenizer is not None, "need to set tokenizer"

        # Empty batch: return an empty, correctly-ranked tensor instead of
        # crashing on max() over an empty sequence below.
        if len(word_dict) == 0:
            return np.empty((0, 2, 1), dtype="int32")

        flat_ids = []
        offsets = []
        for word_dict_item in word_dict:
            item_flat_ids = []
            item_offsets = []

            if isinstance(word_dict_item[0], bytes):
                word_dict_item = [word_dict_item[0].decode()]

            # csv.reader handles quoted commas inside a phrase.
            words = list(csv.reader(word_dict_item))[0]
            for word in words:
                ids = self.tokenizer.encode(word)

                if len(ids) == 0:
                    continue

                item_flat_ids += ids
                item_offsets.append(len(ids))

            flat_ids.append(np.array(item_flat_ids))
            offsets.append(np.cumsum(np.array(item_offsets)))

        # Pad every row to a common width (at least 1 so the array is valid
        # even when all entries are empty).
        pad_to = max(1, max(len(ids) for ids in flat_ids))

        for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
            flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
                                 constant_values=0)
            # -1 marks unused offset slots.
            offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
                                constant_values=-1)

        return np.array([flat_ids, offsets], dtype="int32").transpose(
            (1, 0, 2))
preprocessing/config.pbtxt ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "preprocessing"
28
+ backend: "python"
29
+ max_batch_size: 128
30
+ input [
31
+ {
32
+ name: "QUERY"
33
+ data_type: TYPE_STRING
34
+ dims: [ -1 ]
35
+ },
36
+ {
37
+ name: "BAD_WORDS_DICT"
38
+ data_type: TYPE_STRING
39
+ dims: [ -1 ]
40
+ },
41
+ {
42
+ name: "STOP_WORDS_DICT"
43
+ data_type: TYPE_STRING
44
+ dims: [ -1 ]
45
+ },
46
+ {
47
+ name: "REQUEST_OUTPUT_LEN"
48
+ data_type: TYPE_UINT32
49
+ dims: [ -1 ]
50
+ }
51
+ ]
52
+ output [
53
+ {
54
+ name: "INPUT_ID"
55
+ data_type: TYPE_INT32
56
+ dims: [ -1 ]
57
+ },
58
+ {
59
+ name: "REQUEST_INPUT_LEN"
60
+ data_type: TYPE_INT32
61
+ dims: [ 1 ]
62
+ },
63
+ {
64
+ name: "BAD_WORDS_IDS"
65
+ data_type: TYPE_INT32
66
+ dims: [ 2, -1 ]
67
+ },
68
+ {
69
+ name: "STOP_WORDS_IDS"
70
+ data_type: TYPE_INT32
71
+ dims: [ 2, -1 ]
72
+ },
73
+ {
74
+ name: "REQUEST_OUTPUT_LEN"
75
+ data_type: TYPE_UINT32
76
+ dims: [ -1 ]
77
+ }
78
+ ]
79
+
80
+ parameters {
81
+ key: "tokenizer_dir"
82
+ value: {
83
+ string_value: "/data/tgi-data/orig_llama"
84
+ }
85
+ }
86
+
87
+ parameters {
88
+ key: "tokenizer_type"
89
+ value: {
90
+ string_value: "llama"
91
+ }
92
+ }
93
+
94
+ instance_group [
95
+ {
96
+ count: 1
97
+ kind: KIND_CPU
98
+ }
99
+ ]
tensorrt_llm/1/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_config": {
3
+ "fp8": true,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 8192,
6
+ "int8": false,
7
+ "max_batch_size": 64,
8
+ "max_input_len": 4096,
9
+ "max_num_tokens": null,
10
+ "max_output_len": 4096,
11
+ "max_position_embeddings": 4096,
12
+ "name": "llama",
13
+ "num_heads": 64,
14
+ "num_kv_heads": 8,
15
+ "num_layers": 80,
16
+ "parallel_build": true,
17
+ "pipeline_parallel": 1,
18
+ "precision": "float16",
19
+ "quant_mode": 384,
20
+ "tensor_parallel": 4,
21
+ "use_refit": false,
22
+ "vocab_size": 32000
23
+ },
24
+ "plugin_config": {
25
+ "attention_qk_half_accumulation": false,
26
+ "bert_attention_plugin": false,
27
+ "context_fmha_type": 1,
28
+ "gemm_plugin": "float16",
29
+ "gpt_attention_plugin": "float16",
30
+ "identity_plugin": false,
31
+ "layernorm_plugin": false,
32
+ "layernorm_quantization_plugin": false,
33
+ "lookup_plugin": false,
34
+ "nccl_plugin": "float16",
35
+ "paged_kv_cache": true,
36
+ "quantize_per_token_plugin": false,
37
+ "quantize_tensor_plugin": false,
38
+ "remove_input_padding": true,
39
+ "rmsnorm_plugin": false,
40
+ "rmsnorm_quantization_plugin": false,
41
+ "smooth_quant_gemm_plugin": false,
42
+ "tokens_per_block": 64,
43
+ "use_custom_all_reduce": false,
44
+ "weight_only_groupwise_quant_matmul_plugin": false,
45
+ "weight_only_quant_matmul_plugin": false
46
+ }
47
+ }
tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "tensorrtllm"
29
+ max_batch_size: 128
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ input [
36
+ {
37
+ name: "input_ids"
38
+ data_type: TYPE_INT32
39
+ dims: [ -1 ]
40
+ },
41
+ {
42
+ name: "input_lengths"
43
+ data_type: TYPE_INT32
44
+ dims: [ 1 ]
45
+ reshape: { shape: [ ] }
46
+ },
47
+ {
48
+ name: "request_output_len"
49
+ data_type: TYPE_UINT32
50
+ dims: [ 1 ]
51
+ },
52
+ {
53
+ name: "end_id"
54
+ data_type: TYPE_UINT32
55
+ dims: [ 1 ]
56
+ reshape: { shape: [ ] }
57
+ optional: true
58
+ },
59
+ {
60
+ name: "pad_id"
61
+ data_type: TYPE_UINT32
62
+ dims: [ 1 ]
63
+ reshape: { shape: [ ] }
64
+ optional: true
65
+ },
66
+ {
67
+ name: "beam_width"
68
+ data_type: TYPE_UINT32
69
+ dims: [ 1 ]
70
+ reshape: { shape: [ ] }
71
+ optional: true
72
+ },
73
+ {
74
+ name: "temperature"
75
+ data_type: TYPE_FP32
76
+ dims: [ 1 ]
77
+ reshape: { shape: [ ] }
78
+ optional: true
79
+ },
80
+ {
81
+ name: "runtime_top_k"
82
+ data_type: TYPE_UINT32
83
+ dims: [ 1 ]
84
+ reshape: { shape: [ ] }
85
+ optional: true
86
+ },
87
+ {
88
+ name: "runtime_top_p"
89
+ data_type: TYPE_FP32
90
+ dims: [ 1 ]
91
+ reshape: { shape: [ ] }
92
+ optional: true
93
+ },
94
+ {
95
+ name: "len_penalty"
96
+ data_type: TYPE_FP32
97
+ dims: [ 1 ]
98
+ reshape: { shape: [ ] }
99
+ optional: true
100
+ },
101
+ {
102
+ name: "repetition_penalty"
103
+ data_type: TYPE_FP32
104
+ dims: [ 1 ]
105
+ reshape: { shape: [ ] }
106
+ optional: true
107
+ },
108
+ {
109
+ name: "min_length"
110
+ data_type: TYPE_UINT32
111
+ dims: [ 1 ]
112
+ reshape: { shape: [ ] }
113
+ optional: true
114
+ },
115
+ {
116
+ name: "presence_penalty"
117
+ data_type: TYPE_FP32
118
+ dims: [ 1 ]
119
+ reshape: { shape: [ ] }
120
+ optional: true
121
+ },
122
+ {
123
+ name: "random_seed"
124
+ data_type: TYPE_UINT64
125
+ dims: [ 1 ]
126
+ reshape: { shape: [ ] }
127
+ optional: true
128
+ },
129
+ {
130
+ name: "stop"
131
+ data_type: TYPE_BOOL
132
+ dims: [ 1 ]
133
+ optional: true
134
+ },
135
+ {
136
+ name: "streaming"
137
+ data_type: TYPE_BOOL
138
+ dims: [ 1 ]
139
+ optional: true
140
+ }
141
+ ]
142
+ output [
143
+ {
144
+ name: "output_ids"
145
+ data_type: TYPE_INT32
146
+ dims: [ -1, -1 ]
147
+ }
148
+ ]
149
+ instance_group [
150
+ {
151
+ count: 1
152
+ kind : KIND_CPU
153
+ }
154
+ ]
155
+ parameters: {
156
+ key: "max_beam_width"
157
+ value: {
158
+ string_value: "1"
159
+ }
160
+ }
161
+ parameters: {
162
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
163
+ value: {
164
+ string_value: "no"
165
+ }
166
+ }
167
+ parameters: {
168
+ key: "gpt_model_type"
169
+ value: {
170
+ string_value: "inflight_fused_batching"
171
+ }
172
+ }
173
+ parameters: {
174
+ key: "gpt_model_path"
175
+ value: {
176
+ string_value: "/data/tgi-data/triton_model_repo_70_fp8/tensorrt_llm/1"
177
+ }
178
+ }
179
+ parameters: {
180
+ key: "max_tokens_in_paged_kv_cache"
181
+ value: {
182
+ string_value: "${max_tokens_in_paged_kv_cache}"
183
+ }
184
+ }
185
+ parameters: {
186
+ key: "batch_scheduler_policy"
187
+ value: {
188
+ string_value: "max_utilization"
189
+ }
190
+ }
191
+ parameters: {
192
+ key: "kv_cache_free_gpu_mem_fraction"
193
+ value: {
194
+ string_value: "0.9"
195
+ }
196
+ }
197
+ parameters: {
198
+ key: "max_num_sequences"
199
+ value: {
200
+ string_value: "${max_num_sequences}"
201
+ }
202
+ }
203
+ parameters: {
204
+ key: "enable_trt_overlap"
205
+ value: {
206
+ string_value: "${enable_trt_overlap}"
207
+ }
208
+ }