coder3101 committed on
Commit
b36655d
·
verified ·
1 Parent(s): 16f7ca0

Upload tokenizer

Browse files
Files changed (3) hide show
  1. chat_template.jinja +64 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +22 -0
chat_template.jinja ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {#- Chat template: renders messages as <|im_start|>role ... <|im_end|> turns, preceded by the BOS token. -#}
+ {{- bos_token -}}
2
+ {#- keep_past_thinking falls back to false when the caller does not supply it. -#}
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {#- A namespace is used so values assigned inside loops remain visible afterwards. -#}
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {#- If the first message is a system message, fold its content into ns.system_prompt and drop it from messages. -#}
+ {#- NOTE(review): messages[0] is indexed without checking that messages is non-empty; confirm callers never pass an empty list. -#}
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set sys_content = messages[0]["content"] -%}
6
+ {#- Non-string content is iterated as a list of items; only items of type "text" contribute. -#}
+ {%- if sys_content is not string -%}
7
+ {%- for item in sys_content -%}
8
+ {%- if item["type"] == "text" -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + item["text"] -%}
10
+ {%- endif -%}
11
+ {%- endfor -%}
12
+ {%- else -%}
13
+ {%- set ns.system_prompt = sys_content -%}
14
+ {%- endif -%}
15
+ {%- set messages = messages[1:] -%}
16
+ {%- endif -%}
17
+ {#- When tools are given, append "List of tools: [...]" to the system prompt, on a new line if the prompt is non-empty. -#}
+ {%- if tools -%}
18
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
19
+ {%- for tool in tools -%}
20
+ {#- Tools that are not already strings are serialized with tojson. -#}
+ {%- if tool is not string -%}
21
+ {%- set tool = tool | tojson -%}
22
+ {%- endif -%}
23
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
24
+ {#- Separate entries with ", " except after the last one. -#}
+ {%- if not loop.last -%}
25
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
26
+ {%- endif -%}
27
+ {%- endfor -%}
28
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
29
+ {%- endif -%}
30
+ {#- Emit the system turn only when the accumulated prompt is non-empty. -#}
+ {%- if ns.system_prompt -%}
31
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
32
+ {%- endif -%}
33
+ {#- First pass: record the index of the last assistant message (stays -1 when there is none). -#}
+ {%- set ns.last_assistant_index = -1 -%}
34
+ {%- for message in messages -%}
35
+ {%- if message["role"] == "assistant" -%}
36
+ {%- set ns.last_assistant_index = loop.index0 -%}
37
+ {%- endif -%}
38
+ {%- endfor -%}
39
+ {#- Second pass: render each remaining message as one turn. -#}
+ {%- for message in messages -%}
40
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
41
+ {%- set content = message["content"] -%}
42
+ {#- Flatten non-string content: image items become "<image>", text items add their text, anything else is JSON-encoded. -#}
+ {%- if content is not string -%}
43
+ {%- set ns.content = "" -%}
44
+ {%- for item in content -%}
45
+ {%- if item["type"] == "image" -%}
46
+ {%- set ns.content = ns.content + "<image>" -%}
47
+ {%- elif item["type"] == "text" -%}
48
+ {%- set ns.content = ns.content + item["text"] -%}
49
+ {%- else -%}
50
+ {%- set ns.content = ns.content + item | tojson -%}
51
+ {%- endif -%}
52
+ {%- endfor -%}
53
+ {%- set content = ns.content -%}
54
+ {%- endif -%}
55
+ {#- Unless keep_past_thinking is set, assistant messages other than the last one keep only the trimmed text after the final "</think>". -#}
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
56
+ {%- if "</think>" in content -%}
57
+ {%- set content = content.split("</think>")[-1] | trim -%}
58
+ {%- endif -%}
59
+ {%- endif -%}
60
+ {{- content + "<|im_end|>\n" -}}
61
+ {%- endfor -%}
62
+ {#- Open an assistant turn when a generation prompt is requested. -#}
+ {%- if add_generation_prompt -%}
63
+ {{- "<|im_start|>assistant\n" -}}
64
+ {%- endif -%}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "extra_special_tokens": [],
7
+ "is_local": false,
8
+ "legacy": false,
9
+ "local_files_only": false,
10
+ "model_input_names": [
11
+ "input_ids",
12
+ "attention_mask"
13
+ ],
14
+ "model_max_length": 1000000000000000019884624838656,
15
+ "model_specific_special_tokens": {},
16
+ "pad_token": "<|pad|>",
17
+ "sp_model_kwargs": {},
18
+ "spaces_between_special_tokens": false,
19
+ "tokenizer_class": "TokenizersBackend",
20
+ "use_default_system_prompt": false,
21
+ "use_fast": true
22
+ }