Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings
This repository was archived by the owner on Jan 21, 2025. It is now read-only.
/mesh (Public archive)

Commit d419b67

Browse files
author
Mesh TensorFlow Team
committed
Minor changes to make Experts Attention work.
PiperOrigin-RevId: 388312437
1 parent 7e78cf8 · commit d419b67

File tree

2 files changed

+13
-5
lines changed

2 files changed

+13
-5
lines changed

‎mesh_tensorflow/transformer/attention.py‎

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,8 @@ def __init__(self,
663663
if mtf.layers.unit_scaling_convention():
664664
raise NotImplementedError
665665

666+
# TODO(barretzoph): Make this work for model parallelism by not outputting
667+
# a tensor with `heads` dim.
666668
moe_output_dims=self.q_shape[-1]
667669
tf.logging.info("ExpertsAttention moe_hidden_size: {}".format(
668670
experts_hparams.hidden_size))
@@ -680,10 +682,12 @@ def __init__(self,
680682
switch_dropout=experts_hparams.switch_dropout,
681683
switch_temperature=experts_hparams.switch_temperature,
682684
switch_jitter=experts_hparams.switch_jitter,
683-
switch_top_k=experts_hparams.switch_top_k,
685+
ntlb_top_k=experts_hparams.ntlb_top_k,
684686
hidden_size=experts_hparams.hidden_size,
685687
output_dim=moe_output_dims,
686-
use_experts_attention=experts_hparams.use_experts_attention)
688+
use_experts_attention=experts_hparams.use_experts_attention,
689+
activation=experts_hparams.activation,
690+
z_loss=experts_hparams.z_loss)
687691

688692
def _compute_merge_qkv(self, antecedent):
689693
"""Computes qkv all in one call using MoE layer."""

‎mesh_tensorflow/transformer/transformer_layers.py‎

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,9 +402,11 @@ def __init__(self,
402402
switch_dropout=0.0,
403403
switch_temperature=1.0,
404404
switch_jitter=1e-2,
405-
switch_top_k=4,
405+
ntlb_top_k=4,
406406
hidden_size=3072,
407407
use_experts_attention=True,
408+
activation="relu",
409+
z_loss=None,
408410
**kwargs):
409411
super(ExpertsSelfAttention, self).__init__(**kwargs)
410412
self._hparams = mtf.transformer.moe.HParams(
@@ -420,9 +422,11 @@ def __init__(self,
420422
switch_dropout=switch_dropout,
421423
switch_temperature=switch_temperature,
422424
switch_jitter=switch_jitter,
423-
switch_top_k=switch_top_k,
425+
ntlb_top_k=ntlb_top_k,
424426
hidden_size=hidden_size,
425-
use_experts_attention=use_experts_attention)
427+
use_experts_attention=use_experts_attention,
428+
activation=activation,
429+
z_loss=z_loss)
426430

427431
def make_params(self, context):
428432
num_heads = self.num_heads

0 commit comments

Comments (0)

[8]ページ先頭

©2009-2025 Movatter.jp