@@ -522,7 +522,9 @@ def tpu_estimator_model_fn(model_type,
522522ensemble_inputs = None ,
523523mesh_devices = None ,
524524model_info_file = None ,
525- hierarchical_tiling_spec = None ):
525+ hierarchical_tiling_spec = None ,
526+ weight_decay_checkpoint = None # GOOGLE-INTERNAL,
527+ ):
526528"""Create a TPUEstimator model function.
527529
528530 Args:
@@ -564,14 +566,17 @@ def tpu_estimator_model_fn(model_type,
564566 if empty string (default), all variables from the checkpoint are loaded.
565567 ensemble_inputs: an optional integer - pass the size of the ensemble to
566568 train an ensemble where each model gets different inputs.
567- You also need to configure Unitransformer.ensemble to the right size.
569+ You also need to configure Unitransformer.ensemble to the right size.
568570 If None, then all models are trained on the same inputs.
569571 mesh_devices: a list of strings, the device names to use for each mesh
570572 slice. Only required for GPU.
571573 model_info_file: an optional string, information about variables and
572574 operations will be logged to this file during the TRAIN mode.
573575 hierarchical_tiling_spec: an optional list that can be passed as the
574576 spec argument to simd_mesh_impl.HierarchicalTiling
577+ weight_decay_checkpoint: an optional checkpoint dir to weight decay from. #
578+ GOOGLE-INTERNAL
579+
575580 Returns:
576581 a function to be passed to TPUEstimator
577582 """
@@ -663,7 +668,7 @@ def my_model_fn(features, labels, mode, params=None, config=None):
663668x = tf .cast (features [key ],tf .int32 )
664669x = tf .reshape (x ,feature_shape .to_integer_list )
665670if not use_tpu :
666- tf .logging .info ("feature %s : %s" % ( key ,x ) )
671+ tf .logging .info ("feature %s : %s" , key ,x )
667672mtf_features [key ]= mtf .import_fully_replicated (
668673mesh ,x ,feature_shape ,name = key )
669674
@@ -886,6 +891,7 @@ def serialized_fn(mtf_features):
886891var_grads = mtf .gradients (
887892 [loss ], [v .outputs [0 ]for v in graph .trainable_variables ])
888893
894+
889895if tpu_summaries :
890896mtf .scalar_summary ("loss" ,loss )
891897
@@ -919,11 +925,8 @@ def serialized_fn(mtf_features):
919925tf .logging .info ("Variables not being trained:" )
920926tf .logging .info ([v .name for v in graph .trainable_variables
921927if not variable_filter_fn (v )])
922-
923- update_ops = optimizer (learning_rate = learning_rate ).apply_grads (
924- trainable_var_grads ,trainable_vars
925- )
926-
928+ opt = optimizer (learning_rate = learning_rate )
929+ update_ops = opt .apply_grads (trainable_var_grads ,trainable_vars )
927930lowering = mtf .Lowering (
928931graph , {mesh :mesh_impl },
929932autostack = autostack ,
@@ -980,6 +983,7 @@ def serialized_fn(mtf_features):
980983 {init_checkpoint_variable_mapping (v ):v for v in restore_vars }
981984 )
982985
986+
983987# Copy master variables to slices. Must be called first.
984988restore_hook = mtf .MtfRestoreHook (lowering )
985989saver = tf .train .Saver (
@@ -1348,8 +1352,8 @@ def _maybe_detokenize(value, vocab):
13481352yield output_string
13491353if i & (i - 1 )== 0 :
13501354# LOG every power of 2.
1351- tf .logging .info ("decoded{}: {}" . format ( i ,input_string ) )
1352- tf .logging .info (" ->{}" . format ( output_string ) )
1355+ tf .logging .info ("decoded%s: %s" , i ,input_string )
1356+ tf .logging .info (" ->%s" , output_string )
13531357
13541358
13551359@gin .configurable
@@ -1681,7 +1685,7 @@ def save_scores_to_tfrecords(
16811685inputs = [r .split (" " ,1 )[0 ]for r in inputs ]
16821686
16831687table_path = "{}_{}.tfrecord" .format (scores_filename ,shard_idx )
1684- tf .logging .info ("Saving results to{}" . format ( table_path ) )
1688+ tf .logging .info ("Saving results to%s" , table_path )
16851689
16861690with tf .io .TFRecordWriter (table_path )as file_writer :
16871691for input_ ,target ,score in zip (inputs ,targets ,scores ):
@@ -1769,12 +1773,10 @@ def score_with_estimator_lazy(
17691773num_shards = math .ceil (num_examples / num_examples_per_shard )
17701774else :
17711775num_shards = None
1772- tf .logging .info (
1773- "Scoring {} examples with {} shards at {} examples per shard" .format (
1774- num_examples ,num_shards ,num_examples_per_shard ))
1776+ tf .logging .info ("Scoring %s examples with %s shards at %s examples per shard" ,
1777+ num_examples ,num_shards ,num_examples_per_shard )
17751778
1776- checkpoint_path ,= get_checkpoint_iterator (
1777- eval_checkpoint_step ,model_dir )
1779+ checkpoint_path ,= get_checkpoint_iterator (eval_checkpoint_step ,model_dir )
17781780result_iter = estimator .predict (input_fn ,checkpoint_path = checkpoint_path )
17791781
17801782start = time .time ()
@@ -1794,9 +1796,8 @@ def score_with_estimator_lazy(
17941796score_postprocess_fn (results ,vocabulary ,shard_idx = shard_idx )
17951797
17961798elapsed = time .time ()- start
1797- tf .logging .info (
1798- "Scored {} results in {} s, {} examples/s for shard {}" .format (
1799- num_results ,elapsed ,num_results / elapsed ,shard_idx ))
1799+ tf .logging .info ("Scored %s results in %s s, %s examples/s for shard %s" ,
1800+ num_results ,elapsed ,num_results / elapsed ,shard_idx )
18001801
18011802results = []
18021803shard_idx += 1
@@ -2379,7 +2380,7 @@ def eval_model(estimator,
23792380
23802381checkpoint_paths = get_checkpoint_iterator (eval_checkpoint_step ,model_dir )
23812382for checkpoint_path in checkpoint_paths :
2382- tf .logging .info ("Checkpoint path %s" % checkpoint_path )
2383+ tf .logging .info ("Checkpoint path %s" , checkpoint_path )
23832384global_step = int (get_step_from_checkpoint_path (checkpoint_path ))
23842385if eval_with_score :
23852386outputs ,_ = score_with_estimator_fn (
@@ -2907,15 +2908,15 @@ def run(tpu_job_name,
29072908learning_rate_schedule = functools .partial (
29082909learning_rate_schedule ,total_train_steps = total_run_steps )
29092910
2910- tf .logging .info ("model_type=%s" % model_type ,)
2911- tf .logging .info ("mode=%s" % mode ,)
2912- tf .logging .info ("sequence_length=%s" % sequence_length ,)
2913- tf .logging .info ("batch_size=%s" % batch_size ,)
2914- tf .logging .info ("train_steps=%s" % train_steps ,)
2911+ tf .logging .info ("model_type=%s" , model_type ,)
2912+ tf .logging .info ("mode=%s" , mode ,)
2913+ tf .logging .info ("sequence_length=%s" , sequence_length ,)
2914+ tf .logging .info ("batch_size=%s" , batch_size ,)
2915+ tf .logging .info ("train_steps=%s" , train_steps ,)
29152916if total_run_steps is not None :
2916- tf .logging .info ("total_run_steps=%s" % total_run_steps ,)
2917- tf .logging .info ("mesh_shape=%s" % mesh_shape ,)
2918- tf .logging .info ("layout_rules=%s" % layout_rules ,)
2917+ tf .logging .info ("total_run_steps=%s" , total_run_steps ,)
2918+ tf .logging .info ("mesh_shape=%s" , mesh_shape ,)
2919+ tf .logging .info ("layout_rules=%s" , layout_rules ,)
29192920
29202921if mode == "train" and dataset_split != "train" :
29212922raise ValueError ("mode==\" train\" requires dataset_split==\" train\" " )
@@ -2929,9 +2930,7 @@ def run(tpu_job_name,
29292930cluster = tf .distribute .cluster_resolver .TPUClusterResolver (
29302931tpu ,zone = tpu_zone ,project = gcp_project )if tpu else None
29312932
2932- tf .logging .info (
2933- "Building TPUConfig with tpu_job_name={}" .format (tpu_job_name )
2934- )
2933+ tf .logging .info ("Building TPUConfig with tpu_job_name=%s" ,tpu_job_name )
29352934
29362935score_in_predict_mode = "score" in mode
29372936estimator_fn = functools .partial (