Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 3681540

Browse files
committed
gguf : deprecate old FIM token KVs
1 parent 3ae8670, commit 3681540

File tree

3 files changed

+36
-14
lines changed

3 files changed

+36
-14
lines changed

‎gguf-py/gguf/constants.py‎

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,8 @@ class Tokenizer:
152152
MERGES="tokenizer.ggml.merges"
153153
BOS_ID="tokenizer.ggml.bos_token_id"
154154
EOS_ID="tokenizer.ggml.eos_token_id"
155+
EOT_ID="tokenizer.ggml.eot_token_id"
156+
EOM_ID="tokenizer.ggml.eom_token_id"
155157
UNK_ID="tokenizer.ggml.unknown_token_id"
156158
SEP_ID="tokenizer.ggml.seperator_token_id"
157159
PAD_ID="tokenizer.ggml.padding_token_id"
@@ -168,11 +170,16 @@ class Tokenizer:
168170
CHAT_TEMPLATE_N="tokenizer.chat_template.{name}"
169171
CHAT_TEMPLATES="tokenizer.chat_templates"
170172
# FIM/Infill special tokens constants
173+
FIM_PRE_ID="tokenizer.ggml.fim_pre_token_id"
174+
FIM_SUF_ID="tokenizer.ggml.fim_suf_token_id"
175+
FIM_MID_ID="tokenizer.ggml.fim_mid_token_id"
176+
FIM_PAD_ID="tokenizer.ggml.fim_pad_token_id"
177+
FIM_REP_ID="tokenizer.ggml.fim_rep_token_id"
178+
FIM_SEP_ID="tokenizer.ggml.fim_sep_token_id"
179+
# deprecated:
171180
PREFIX_ID="tokenizer.ggml.prefix_token_id"
172181
SUFFIX_ID="tokenizer.ggml.suffix_token_id"
173182
MIDDLE_ID="tokenizer.ggml.middle_token_id"
174-
EOT_ID="tokenizer.ggml.eot_token_id"
175-
EOM_ID="tokenizer.ggml.eom_token_id"
176183

177184
class Adapter:
178185
TYPE="adapter.type"
@@ -1579,15 +1586,24 @@ def get_type(val: Any) -> GGUFValueType:
15791586
KEY_TOKENIZER_MERGES=Keys.Tokenizer.MERGES
15801587
KEY_TOKENIZER_BOS_ID=Keys.Tokenizer.BOS_ID
15811588
KEY_TOKENIZER_EOS_ID=Keys.Tokenizer.EOS_ID
1589+
KEY_TOKENIZER_EOT_ID=Keys.Tokenizer.EOT_ID
1590+
KEY_TOKENIZER_EOM_ID=Keys.Tokenizer.EOM_ID
15821591
KEY_TOKENIZER_UNK_ID=Keys.Tokenizer.UNK_ID
15831592
KEY_TOKENIZER_SEP_ID=Keys.Tokenizer.SEP_ID
15841593
KEY_TOKENIZER_PAD_ID=Keys.Tokenizer.PAD_ID
15851594
KEY_TOKENIZER_CLS_ID=Keys.Tokenizer.CLS_ID
15861595
KEY_TOKENIZER_MASK_ID=Keys.Tokenizer.MASK_ID
15871596
KEY_TOKENIZER_HF_JSON=Keys.Tokenizer.HF_JSON
15881597
KEY_TOKENIZER_RWKV=Keys.Tokenizer.RWKV
1589-
KEY_TOKENIZER_PRIFIX_ID=Keys.Tokenizer.PREFIX_ID
1598+
1599+
KEY_TOKENIZER_FIM_PRE_ID=Keys.Tokenizer.FIM_PRE_ID
1600+
KEY_TOKENIZER_FIM_SUF_ID=Keys.Tokenizer.FIM_SUF_ID
1601+
KEY_TOKENIZER_FIM_MID_ID=Keys.Tokenizer.FIM_MID_ID
1602+
KEY_TOKENIZER_FIM_PAD_ID=Keys.Tokenizer.FIM_PAD_ID
1603+
KEY_TOKENIZER_FIM_REP_ID=Keys.Tokenizer.FIM_REP_ID
1604+
KEY_TOKENIZER_FIM_SEP_ID=Keys.Tokenizer.FIM_SEP_ID
1605+
1606+
# deprecated
1607+
KEY_TOKENIZER_PREFIX_ID=Keys.Tokenizer.PREFIX_ID
15901608
KEY_TOKENIZER_SUFFIX_ID=Keys.Tokenizer.SUFFIX_ID
15911609
KEY_TOKENIZER_MIDDLE_ID=Keys.Tokenizer.MIDDLE_ID
1592-
KEY_TOKENIZER_EOT_ID=Keys.Tokenizer.EOT_ID
1593-
KEY_TOKENIZER_EOM_ID=Keys.Tokenizer.EOM_ID

‎gguf-py/gguf/gguf_writer.py‎

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -843,15 +843,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
843843

844844
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE,value)
845845

846-
def add_prefix_token_id(self, id: int) -> None:
847-
self.add_uint32(Keys.Tokenizer.PREFIX_ID,id)
848-
849-
def add_suffix_token_id(self, id: int) -> None:
850-
self.add_uint32(Keys.Tokenizer.SUFFIX_ID,id)
851-
852-
def add_middle_token_id(self, id: int) -> None:
853-
self.add_uint32(Keys.Tokenizer.MIDDLE_ID,id)
854-
855846
def add_eot_token_id(self, id: int) -> None:
856847
self.add_uint32(Keys.Tokenizer.EOT_ID,id)
857848

‎src/llama.cpp‎

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -368,6 +368,11 @@ enum llm_kv {
368368

369369
LLM_KV_ADAPTER_TYPE,
370370
LLM_KV_ADAPTER_LORA_ALPHA,
371+
372+
// deprecated:
373+
LLM_KV_TOKENIZER_PREFIX_ID,
374+
LLM_KV_TOKENIZER_SUFFIX_ID,
375+
LLM_KV_TOKENIZER_MIDDLE_ID,
371376
};
372377

373378
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -479,6 +484,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
479484

480485
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
481486
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
487+
488+
// deprecated
489+
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
490+
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
491+
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
482492
};
483493

484494
struct LLM_KV {
@@ -6533,6 +6543,11 @@ static void llm_load_vocab(
65336543
{ LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
65346544
{ LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
65356545
{ LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
6546+
6547+
// deprecated
6548+
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
6549+
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
6550+
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
65366551
};
65376552

65386553
for (const auto & it : special_token_types) {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp