Commit cfc7a67

Hyperparticle

authored and

Mesh TensorFlow Team

committed

Add utility to save score predictions to TFRecords for scoring large datasets.

PiperOrigin-RevId: 396705745

1 parent f08b18e, commit cfc7a67 (Copy full SHA for cfc7a67)

File tree

1 file changed

+143

-9

lines changed

mesh_tensorflow/transformer
- utils.py

1 file changed

+143

-9

lines changed

`‎mesh_tensorflow/transformer/utils.py‎`

Lines changed: 143 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -26,9 +26,11 @@`
`26`	`26`
`27`	`27`	`importfunctools`
`28`	`28`	`importitertools`
	`29`	`+importmath`
`29`	`30`	`importos`
`30`	`31`	`importrandom`
`31`	`32`	`importre`
	`33`	`+importtime`
`32`	`34`
`33`	`35`	`importgin`
`34`	`36`	`importgin.tf`
`@@ -1654,6 +1656,54 @@ def get_sequence_length(tokens, pad_id=0):`
`1654`	`1656`	`returnscores`
`1655`	`1657`
`1656`	`1658`
	`1659`	`+@gin.configurable`
	`1660`	`+defsave_scores_to_tfrecords(`
	`1661`	`+results,vocabulary,scores_filename,shard_idx=0,save_ids_only=False):`
	`1662`	`+"""Processes results from scoring examples and saves them to tfrecords files.`
	`1663`	`+`
	`1664`	`+ Args:`
	`1665`	`+ results: list of dictionaries containing the results for each scored`
	`1666`	`+ example.`
	`1667`	`+ vocabulary: a function that that returns a tf.data.Dataset with examples`
	`1668`	`+ containing the string field 'targets' and optionally the field 'inputs'`
	`1669`	`+ scores_filename: a string (path of file to write scores to).`
	`1670`	`+ shard_idx: an integer indicating the current index of the file for sharding.`
	`1671`	`+ save_ids_only: if true, save the ID that is prepended to the inputs,`
	`1672`	`+ delimited by a space.`
	`1673`	`+ """`
	`1674`	`+results=_maybe_add_pretokenized_features(results,vocabulary)`
	`1675`	`+scores= [r.get("scores",0.0)forrinresults]`
	`1676`	`+targets= [r.get("targets_pretokenized",r["targets"])forrinresults]`
	`1677`	`+inputs= [r.get("targets_neg_pretokenized",`
	`1678`	`+r.get("inputs",""))forrinresults]`
	`1679`	`+`
	`1680`	`+ifsave_ids_only:`
	`1681`	`+inputs= [r.split(" ",1)[0]forrininputs]`
	`1682`	`+`
	`1683`	`+table_path="{}_{}.tfrecord".format(scores_filename,shard_idx)`
	`1684`	`+tf.logging.info("Saving results to {}".format(table_path))`
	`1685`	`+`
	`1686`	`+withtf.io.TFRecordWriter(table_path)asfile_writer:`
	`1687`	`+forinput_,target,scoreinzip(inputs,targets,scores):`
	`1688`	`+record_bytes=tf.train.Example(`
	`1689`	`+features=tf.train.Features(`
	`1690`	`+feature={`
	`1691`	`+"input":`
	`1692`	`+tf.train.Feature(`
	`1693`	`+bytes_list=tf.train.BytesList(`
	`1694`	`+value=[bytes(input_,"utf8")])),`
	`1695`	`+"target":`
	`1696`	`+tf.train.Feature(`
	`1697`	`+bytes_list=tf.train.BytesList(`
	`1698`	`+value=[bytes(target,"utf8")])),`
	`1699`	`+"score":`
	`1700`	`+tf.train.Feature(`
	`1701`	`+float_list=tf.train.FloatList(value=[score])),`
	`1702`	`+ })).SerializeToString()`
	`1703`	`+file_writer.write(record_bytes)`
	`1704`	`+`
	`1705`	`+`
	`1706`	`+@gin.configurable`
`1657`	`1707`	`defscore_with_estimator(estimator,input_fn,eval_checkpoint_step,model_dir,`
`1658`	`1708`	`vocabulary,score_postprocess_fn=save_scores,`
`1659`	`1709`	`num_examples=None):`
`@@ -1691,6 +1741,74 @@ def score_with_estimator(estimator, input_fn, eval_checkpoint_step, model_dir,`
`1691`	`1741`	`returnscore_postprocess_fn(results,vocabulary)`
`1692`	`1742`
`1693`	`1743`
	`1744`	`+@gin.configurable`
	`1745`	`+defscore_with_estimator_lazy(`
	`1746`	`+estimator,input_fn,eval_checkpoint_step,model_dir,`
	`1747`	`+vocabulary,score_postprocess_fn=save_scores_to_tfrecords,`
	`1748`	`+num_examples=None,num_examples_per_shard=100000):`
	`1749`	`+"""Score each example returned by input_fn lazily.`
	`1750`	`+`
	`1751`	`+ Args:`
	`1752`	`+ estimator: a TPUEstimator`
	`1753`	`+ input_fn: a function that that returns a tf.data.Dataset with examples`
	`1754`	`+ containing the string field 'targets' and optionally the field 'inputs'`
	`1755`	+ eval_checkpoint_step: int, list of ints, or None, see `eval_model`
	`1756`	`+ docstring.`
	`1757`	`+ model_dir: string, estimator model_dir`
	`1758`	`+ vocabulary: a vocabulary.Vocabulary or (inputs_vocabulary,`
	`1759`	`+ targets_vocabulary) tuple`
	`1760`	`+ score_postprocess_fn: a function that takes in model outputs`
	`1761`	`+ post-processes, and saves them.`
	`1762`	`+ num_examples: int, the total # of examples being scored, None if unknown`
	`1763`	`+ num_examples_per_shard: int, the number of examples per file shard.`
	`1764`	`+`
	`1765`	`+ Returns:`
	`1766`	`+ a list of floats`
	`1767`	`+ """`
	`1768`	`+ifnum_examplesisnotNone:`
	`1769`	`+num_shards=math.ceil(num_examples/num_examples_per_shard)`
	`1770`	`+else:`
	`1771`	`+num_shards=None`
	`1772`	`+tf.logging.info(`
	`1773`	`+"Scoring {} examples with {} shards at {} examples per shard".format(`
	`1774`	`+num_examples,num_shards,num_examples_per_shard))`
	`1775`	`+`
	`1776`	`+checkpoint_path,=get_checkpoint_iterator(`
	`1777`	`+eval_checkpoint_step,model_dir)`
	`1778`	`+result_iter=estimator.predict(input_fn,checkpoint_path=checkpoint_path)`
	`1779`	`+`
	`1780`	`+start=time.time()`
	`1781`	`+results= []`
	`1782`	`+shard_idx=0`
	`1783`	`+`
	`1784`	`+fori,resultinenumerate(result_iter):`
	`1785`	`+results.append(result)`
	`1786`	`+num_results=len(results)`
	`1787`	`+exceeded_examples_per_shard= (`
	`1788`	`+num_examples_per_shardisnotNone`
	`1789`	`+andnum_examples_per_shard>0`
	`1790`	`+andnum_results>=num_examples_per_shard)`
	`1791`	`+exceeded_num_examples=num_examplesisnotNoneandi>=num_examples`
	`1792`	`+`
	`1793`	`+ifexceeded_examples_per_shardorexceeded_num_examples:`
	`1794`	`+score_postprocess_fn(results,vocabulary,shard_idx=shard_idx)`
	`1795`	`+`
	`1796`	`+elapsed=time.time()-start`
	`1797`	`+tf.logging.info(`
	`1798`	`+"Scored {} results in {} s, {} examples/s for shard {}".format(`
	`1799`	`+num_results,elapsed,num_results/elapsed,shard_idx))`
	`1800`	`+`
	`1801`	`+results= []`
	`1802`	`+shard_idx+=1`
	`1803`	`+start=time.time()`
	`1804`	`+`
	`1805`	`+ifexceeded_num_examples:`
	`1806`	`+break`
	`1807`	`+`
	`1808`	`+ifresults:`
	`1809`	`+score_postprocess_fn(results,vocabulary,shard_idx=shard_idx)`
	`1810`	`+`
	`1811`	`+`
`1694`	`1812`	`def_maybe_add_pretokenized_features(examples,vocabulary):`
`1695`	`1813`	`"""Ensures decoded versions of "inputs" and "targets" exist in each example.`
`1696`	`1814`
`@@ -1712,9 +1830,19 @@ def _maybe_add_pretokenized_features(examples, vocabulary):`
`1712`	`1830`	`forexampleinexamples:`
`1713`	`1831`	`forfeature_namein ["inputs","targets"]:`
`1714`	`1832`	`pretokenized_feature_name=feature_name+"_pretokenized"`
	`1833`	`+neg_pretokenized_feature_name=feature_name+"_neg_pretokenized"`
`1715`	`1834`	`iffeature_nameinexampleandpretokenized_feature_namenotinexample:`
`1716`		`-s=vocabulary[feature_name].decode(example[feature_name].tolist())`
`1717`		`-example[pretokenized_feature_name]=s`
	`1835`	`+ids=example[feature_name].tolist()`
	`1836`	`+`
	`1837`	`+neg_ids= [abs(i)foriinidsifi<0]`
	`1838`	`+ids= [iforiinidsifi>0]`
	`1839`	`+`
	`1840`	`+decoded_string=vocabulary[feature_name].decode(ids)`
	`1841`	`+example[pretokenized_feature_name]=decoded_string`
	`1842`	`+`
	`1843`	`+ifneg_ids:`
	`1844`	`+neg_decoded_string=vocabulary[feature_name].decode(neg_ids)`
	`1845`	`+example[neg_pretokenized_feature_name]=neg_decoded_string`
`1718`	`1846`
`1719`	`1847`	`ifnotadded_pretokenized[feature_name]:`
`1720`	`1848`	`added_pretokenized[feature_name]=True`
`@@ -1730,7 +1858,8 @@ def score_from_strings(estimator, vocabulary, model_type, batch_size,`
`1730`	`1858`	`sequence_length,model_dir,eval_checkpoint_step,`
`1731`	`1859`	`inputs=gin.REQUIRED,targets=gin.REQUIRED,`
`1732`	`1860`	`score_postprocess_fn=gin.REQUIRED,eos_id=1,`
`1733`		`-score_eos=True):`
	`1861`	`+score_eos=True,`
	`1862`	`+score_with_estimator_fn=score_with_estimator):`
`1734`	`1863`	`"""Compute log likelihoods per example and write to a text file.`
`1735`	`1864`
`1736`	`1865`	`inputs & targets must either be the same length (in lines) or have inputs`
`@@ -1761,6 +1890,7 @@ def score_from_strings(estimator, vocabulary, model_type, batch_size,`
`1761`	`1890`	`score_eos: a boolean - whether to score the final eos token of each line`
`1762`	`1891`	`If this is set to false, the scores can be interpreted as prefix`
`1763`	`1892`	`log-likelihoods`
	`1893`	`+ score_with_estimator_fn: a function to run scoring with the estimator.`
`1764`	`1894`	`Returns:`
`1765`	`1895`	`a list of floats`
`1766`	`1896`	`"""`
`@@ -1806,7 +1936,7 @@ def input_fn(params):`
`1806`	`1936`	`dataset=dataset.batch(batch_size,drop_remainder=True)`
`1807`	`1937`	`returndataset.prefetch(tf.data.experimental.AUTOTUNE)`
`1808`	`1938`
`1809`		`-returnscore_with_estimator(`
	`1939`	`+returnscore_with_estimator_fn(`
`1810`	`1940`	`estimator,input_fn,eval_checkpoint_step,model_dir,`
`1811`	`1941`	`vocabulary,score_postprocess_fn,len(targets))`
`1812`	`1942`
`@@ -1815,7 +1945,8 @@ def input_fn(params):`
`1815`	`1945`	`defscore_from_dataset(estimator,vocabulary,batch_size,sequence_length,`
`1816`	`1946`	`model_dir,eval_checkpoint_step,dataset_split,`
`1817`	`1947`	`score_dataset_fn=None,`
`1818`		`-score_postprocess_fn=gin.REQUIRED):`
	`1948`	`+score_postprocess_fn=gin.REQUIRED,`
	`1949`	`+score_with_estimator_fn=score_with_estimator):`
`1819`	`1950`	`"""Compute log likelihoods per example and write to a text file.`
`1820`	`1951`
`1821`	`1952`	`The function returns a list of floats representing the log-likelihood of the`
`@@ -1837,6 +1968,7 @@ def score_from_dataset(estimator, vocabulary, batch_size, sequence_length,`
`1837`	`1968`	See `eval_dataset_fn` argument to `eval_model` for details.
`1838`	`1969`	`score_postprocess_fn: Function that takes in model outputs and`
`1839`	`1970`	`post-processes then returns then.`
	`1971`	`+ score_with_estimator_fn: a function to run scoring with the estimator.`
`1840`	`1972`
`1841`	`1973`	`Returns:`
`1842`	`1974`	`scores: a list of floats, the log likelihood scores`
`@@ -1850,9 +1982,9 @@ def score_from_dataset(estimator, vocabulary, batch_size, sequence_length,`
`1850`	`1982`	`input_fn=_get_combined_dataset_input_fn(`
`1851`	`1983`	`scoring_datasets,batch_size,sequence_length)`
`1852`	`1984`
`1853`		`-returnscore_with_estimator(`
	`1985`	`+returnscore_with_estimator_fn(`
`1854`	`1986`	`estimator,input_fn,eval_checkpoint_step,model_dir,`
`1855`		`-vocabulary,score_postprocess_fn,None)`
	`1987`	`+vocabulary,score_postprocess_fn)`
`1856`	`1988`
`1857`	`1989`
`1858`	`1990`	`defget_estimator(model_type,vocabulary,mesh_shape,`
`@@ -2093,7 +2225,8 @@ def eval_model(estimator,`
`2093`	`2225`	`eval_checkpoint_step,`
`2094`	`2226`	`eval_with_score=False,`
`2095`	`2227`	`output_eval_examples=True,`
`2096`		`-eval_dir_suffix=None):`
	`2228`	`+eval_dir_suffix=None,`
	`2229`	`+score_with_estimator_fn=score_with_estimator):`
`2097`	`2230`	`"""Eval a Mesh-TF model.`
`2098`	`2231`
`2099`	`2232`	`Args:`
`@@ -2137,6 +2270,7 @@ def eval_model(estimator,`
`2137`	`2270`	`of the eval examples in plaintext to eval_summary_dir.`
`2138`	`2271`	`eval_dir_suffix: string, if not None then will appended to the`
`2139`	`2272`	`eval_summary_dir.`
	`2273`	`+ score_with_estimator_fn: a function to run scoring with the estimator.`
`2140`	`2274`	`"""`
`2141`	`2275`	`ifeval_dataset_fnisNone:`
`2142`	`2276`	`raiseValueError("Must provide eval_dataset_fn through gin for eval.")`
`@@ -2248,7 +2382,7 @@ def eval_model(estimator,`
`2248`	`2382`	`tf.logging.info("Checkpoint path %s"%checkpoint_path)`
`2249`	`2383`	`global_step=int(get_step_from_checkpoint_path(checkpoint_path))`
`2250`	`2384`	`ifeval_with_score:`
`2251`		`-outputs,_=score_with_estimator(`
	`2385`	`+outputs,_=score_with_estimator_fn(`
`2252`	`2386`	`estimator,input_fn,global_step,model_dir,vocabulary,`
`2253`	`2387`	`num_examples=sum(len(cex)forcexincached_examples.values()))`
`2254`	`2388`	`else:`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitcfc7a67

File tree

1 file changed

1 file changed

`‎mesh_tensorflow/transformer/utils.py‎`

0 commit comments