Notifications: You must be signed in to change notification settings
Fork 566
Star 6.2k

Commit 2b7f921

committed

Added gemma-7b performance

1 parent ef055fc commit 2b7f921 Copy full SHA for 2b7f921

File tree

1 file changed

+9 -8

lines changed

model.py

1 file changed

+9 -8

lines changed

`‎model.py‎`

Lines changed: 9 additions & 8 deletions

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ class ModelArgs:`
`26`	`26`	`dim:int=4096`
`27`	`27`	`intermediate_size:int=None`
`28`	`28`	`n_local_heads:int=-1`
`29`		`-head_dim:int=64`
	`29`	`+head_dim:int=None`
`30`	`30`	`rope_base:float=10000`
`31`	`31`	`norm_eps:float=1e-5`
`32`	`32`
`@@ -37,7 +37,8 @@ def __post_init__(self):`
`37`	`37`	`hidden_dim=4*self.dim`
`38`	`38`	`n_hidden=int(2*hidden_dim/3)`
`39`	`39`	`self.intermediate_size=find_multiple(n_hidden,256)`
`40`		`-self.head_dim=self.dim//self.n_head`
	`40`	`+if self.head_dim is None:`
	`41`	`+self.head_dim=self.dim//self.n_head`
`41`	`42`
`42`	`43`	`@classmethod`
`43`	`44`	`def from_name(cls, name: str):`
`@@ -51,6 +52,7 @@ def from_name(cls, name: str):`
`51`	`52`
`52`	`53`	`transformer_configs= {`
`53`	`54`	`"gemma-2b":dict(dim=2048,vocab_size=256000,n_layer=18,n_head=8,n_local_heads=1,intermediate_size=16384),`
	`55`	`+"gemma-7b":dict(dim=3072,vocab_size=256000,n_layer=28,n_head=16,n_local_heads=16,intermediate_size=24576,head_dim=256),`
`54`	`56`	`"CodeLlama-7b-Python-hf":dict(block_size=16384,vocab_size=32000,n_layer=32,dim=4096,rope_base=1000000),`
`55`	`57`	`"7B":dict(n_layer=32,n_head=32,dim=4096),`
`56`	`58`	`"13B":dict(n_layer=40,n_head=40,dim=5120),`
`@@ -95,14 +97,13 @@ def __init__(self, config: ModelArgs) -> None:`
`95`	`97`	`def setup_caches(self, max_batch_size, max_seq_length):`
`96`	`98`	`if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size:`
`97`	`99`	`return`
`98`		`-head_dim=self.config.dim//self.config.n_head`
`99`	`100`	`max_seq_length=find_multiple(max_seq_length,8)`
`100`	`101`	`self.max_seq_length=max_seq_length`
`101`	`102`	`self.max_batch_size=max_batch_size`
`102`	`103`	`for b in self.layers:`
`103`		`-b.attention.kv_cache=KVCache(max_batch_size,max_seq_length,self.config.n_local_heads,head_dim)`
	`104`	`+b.attention.kv_cache=KVCache(max_batch_size,max_seq_length,self.config.n_local_heads,self.config.head_dim)`
`104`	`105`
`105`		`-self.freqs_cis=precompute_freqs_cis(self.config.block_size,self.config.dim//self.config.n_head,self.config.rope_base)`
	`106`	`+self.freqs_cis=precompute_freqs_cis(self.config.block_size,self.config.head_dim,self.config.rope_base)`
`106`	`107`	`self.causal_mask=torch.tril(torch.ones(self.max_seq_length,self.max_seq_length,dtype=torch.bool))`
`107`	`108`
`108`	`109`	`def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:`
`@@ -145,7 +146,7 @@ def __init__(self, config: ModelArgs):`
`145`	`146`	`total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim`
`146`	`147`	`# key, query, value projections for all heads, but in a batch`
`147`	`148`	`self.wqkv=nn.Linear(config.dim,total_head_dim,bias=False)`
`148`		`-self.wo=nn.Linear(config.dim,config.dim,bias=False)`
	`149`	`+self.wo=nn.Linear(config.n_head*config.head_dim,config.dim,bias=False)`
`149`	`150`	`self.kv_cache=None`
`150`	`151`
`151`	`152`	`self.n_head=config.n_head`
`@@ -165,7 +166,7 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optiona`
`165`	`166`	`bsz,seqlen,_=x.shape`
`166`	`167`
`167`	`168`	`kv_size=self.n_local_heads*self.head_dim`
`168`		`-q,k,v=self.wqkv(x).split([self.dim,kv_size,kv_size],dim=-1)`
	`169`	`+q,k,v=self.wqkv(x).split([self.n_head*self.head_dim,kv_size,kv_size],dim=-1)`
`169`	`170`
`170`	`171`	`q=q.view(bsz,seqlen,self.n_head,self.head_dim)`
`171`	`172`	`k=k.view(bsz,seqlen,self.n_local_heads,self.head_dim)`
`@@ -183,7 +184,7 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optiona`
`183`	`184`	`v=v.repeat_interleave(self.n_head//self.n_local_heads,dim=1)`
`184`	`185`	`y=F.scaled_dot_product_attention(q,k,v,attn_mask=mask,dropout_p=0.0)`
`185`	`186`
`186`		`-y=y.transpose(1,2).contiguous().view(bsz,seqlen,self.dim)`
	`187`	`+y=y.transpose(1,2).contiguous().view(bsz,seqlen,self.n_head*self.head_dim)`
`187`	`188`
`188`	`189`	`y=self.wo(y)`
`189`	`190`	`return y`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit2b7f921

File tree

1 file changed

1 file changed

`‎model.py‎`

0 commit comments