gen-bi committed on
Commit
d11789f
1 Parent(s): b8b10b1

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +19 -21
  2. tokenizer.json +18 -0
  3. tokenizer_config.json +24 -3
special_tokens_map.json CHANGED
@@ -1,25 +1,23 @@
1
  {
2
- "bos_token": {
3
- "content": "<bos>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<eos>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "<pad>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
  "unk_token": {
24
  "content": "<unk>",
25
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>",
 
 
21
  "unk_token": {
22
  "content": "<unk>",
23
  "lstrip": false,
tokenizer.json CHANGED
@@ -101,6 +101,24 @@
101
  "rstrip": false,
102
  "normalized": false,
103
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  }
105
  ],
106
  "normalizer": {
 
101
  "rstrip": false,
102
  "normalized": false,
103
  "special": true
104
+ },
105
+ {
106
+ "id": 46338,
107
+ "content": "<|im_start|>",
108
+ "single_word": false,
109
+ "lstrip": false,
110
+ "rstrip": false,
111
+ "normalized": false,
112
+ "special": true
113
+ },
114
+ {
115
+ "id": 46339,
116
+ "content": "<|im_end|>",
117
+ "single_word": false,
118
+ "lstrip": false,
119
+ "rstrip": false,
120
+ "normalized": false,
121
+ "special": true
122
  }
123
  ],
124
  "normalizer": {
tokenizer_config.json CHANGED
@@ -89,14 +89,35 @@
89
  "rstrip": false,
90
  "single_word": false,
91
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  }
93
  },
94
- "bos_token": "<bos>",
 
 
 
 
 
95
  "clean_up_tokenization_spaces": false,
96
- "eos_token": "<eos>",
97
  "legacy": false,
98
  "model_max_length": 1000000000000000019884624838656,
99
- "pad_token": "<pad>",
100
  "sp_model_kwargs": {},
101
  "tokenizer_class": "LlamaTokenizer",
102
  "unk_token": "<unk>",
 
89
  "rstrip": false,
90
  "single_word": false,
91
  "special": true
92
+ },
93
+ "46338": {
94
+ "content": "<|im_start|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "46339": {
102
+ "content": "<|im_end|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
  }
109
  },
110
+ "additional_special_tokens": [
111
+ "<|im_start|>",
112
+ "<|im_end|>"
113
+ ],
114
+ "bos_token": "<|im_start|>",
115
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
116
  "clean_up_tokenization_spaces": false,
117
+ "eos_token": "<|im_end|>",
118
  "legacy": false,
119
  "model_max_length": 1000000000000000019884624838656,
120
+ "pad_token": "<|im_end|>",
121
  "sp_model_kwargs": {},
122
  "tokenizer_class": "LlamaTokenizer",
123
  "unk_token": "<unk>",