googleapis/python-bigquery-dataframesPublic

NotificationsYou must be signed in to change notification settings
Fork63
Star273

Commit c62e553

authored

feat: Add bigframes.pandas.crosstab (#2231)

1 parent 44e9869 · commit c62e553 · Copy full SHA for c62e553

File tree

7 files changed

+261

-3

lines changed

bigframes
- core/reshape
  - api.py
  - pivot.py
- dataframe.py
- pandas
  - __init__.py
- session
  - __init__.py
tests/system/small
- test_pandas.py
third_party/bigframes_vendored/pandas/core/reshape
- pivot.py

7 files changed

+261

-3

lines changed

`‎bigframes/core/reshape/api.py‎`

Lines changed: 2 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@`
`15`	`15`	`frombigframes.core.reshape.concatimportconcat`
`16`	`16`	`frombigframes.core.reshape.encodingimportget_dummies`
`17`	`17`	`frombigframes.core.reshape.mergeimportmerge`
	`18`	`+frombigframes.core.reshape.pivotimportcrosstab`
`18`	`19`	`frombigframes.core.reshape.tileimportcut,qcut`
`19`	`20`
`20`		`-__all__= ["concat","get_dummies","merge","cut","qcut"]`
	`21`	`+__all__= ["concat","get_dummies","merge","cut","qcut","crosstab"]`

`‎bigframes/core/reshape/pivot.py‎`

Lines changed: 89 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,89 @@`
	`1`	`+# Copyright 2025 Google LLC`
	`2`	`+#`
	`3`	`+# Licensed under the Apache License, Version 2.0 (the "License");`
	`4`	`+# you may not use this file except in compliance with the License.`
	`5`	`+# You may obtain a copy of the License at`
	`6`	`+#`
	`7`	`+# http://www.apache.org/licenses/LICENSE-2.0`
	`8`	`+#`
	`9`	`+# Unless required by applicable law or agreed to in writing, software`
	`10`	`+# distributed under the License is distributed on an "AS IS" BASIS,`
	`11`	`+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
	`12`	`+# See the License for the specific language governing permissions and`
	`13`	`+# limitations under the License.`
	`14`	`+from __future__importannotations`
	`15`	`+`
	`16`	`+fromtypingimportOptional,TYPE_CHECKING`
	`17`	`+`
	`18`	`+importbigframes_vendored.pandas.core.reshape.pivotasvendored_pandas_pivot`
	`19`	`+importpandasaspd`
	`20`	`+`
	`21`	`+importbigframes`
	`22`	`+frombigframes.coreimportconvert,utils`
	`23`	`+frombigframes.core.reshapeimportconcat`
	`24`	`+frombigframes.dataframeimportDataFrame`
	`25`	`+`
	`26`	`+ifTYPE_CHECKING:`
	`27`	`+importbigframes.session`
	`28`	`+`
	`29`	`+`
	`30`	`+defcrosstab(`
	`31`	`+index,`
	`32`	`+columns,`
	`33`	`+values=None,`
	`34`	`+rownames=None,`
	`35`	`+colnames=None,`
	`36`	`+aggfunc=None,`
	`37`	`+*,`
	`38`	`+session:Optional[bigframes.session.Session]=None,`
	`39`	`+)->DataFrame:`
	`40`	`+if_is_list_of_lists(index):`
	`41`	`+index= [`
	`42`	`+convert.to_bf_series(subindex,default_index=None,session=session)`
	`43`	`+forsubindexinindex`
	`44`	`+ ]`
	`45`	`+else:`
	`46`	`+index= [convert.to_bf_series(index,default_index=None,session=session)]`
	`47`	`+if_is_list_of_lists(columns):`
	`48`	`+columns= [`
	`49`	`+convert.to_bf_series(subcol,default_index=None,session=session)`
	`50`	`+forsubcolincolumns`
	`51`	`+ ]`
	`52`	`+else:`
	`53`	`+columns= [convert.to_bf_series(columns,default_index=None,session=session)]`
	`54`	`+`
	`55`	`+df=concat.concat([index,columns],join="inner",axis=1)`
	`56`	`+# for uniqueness`
	`57`	`+tmp_index_names= [f"_crosstab_index_{i}"foriinrange(len(index))]`
	`58`	`+tmp_col_names= [f"_crosstab_columns_{i}"foriinrange(len(columns))]`
	`59`	`+df.columns=pd.Index([tmp_index_names,tmp_col_names])`
	`60`	`+`
	`61`	`+values= (`
	`62`	`+convert.to_bf_series(values,default_index=df.index,session=session)`
	`63`	`+ifvaluesisnotNone`
	`64`	`+else0`
	`65`	`+ )`
	`66`	`+`
	`67`	`+df["_crosstab_values"]=values`
	`68`	`+pivot_table=df.pivot_table(`
	`69`	`+values="_crosstab_values",`
	`70`	`+index=tmp_index_names,`
	`71`	`+columns=tmp_col_names,`
	`72`	`+aggfunc=aggfuncor"count",`
	`73`	`+sort=False,`
	`74`	`+ )`
	`75`	`+pivot_table.index.names=rownamesor [i.nameforiinindex]`
	`76`	`+pivot_table.columns.names=colnamesor [c.nameforcincolumns]`
	`77`	`+ifaggfuncisNone:`
	`78`	`+# TODO: Push this into pivot_table itself`
	`79`	`+pivot_table=pivot_table.fillna(0)`
	`80`	`+returnpivot_table`
	`81`	`+`
	`82`	`+`
	`83`	`+def_is_list_of_lists(item)->bool:`
	`84`	`+ifnotutils.is_list_like(item):`
	`85`	`+returnFalse`
	`86`	`+returnall(convert.can_convert_to_series(subitem)forsubiteminitem)`
	`87`	`+`
	`88`	`+`
# Reuse the docstring of the vendored pandas stub as the documentation of the
# real implementation above.
crosstab.__doc__ = vendored_pandas_pivot.crosstab.__doc__

`‎bigframes/dataframe.py‎`

Lines changed: 30 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -3479,7 +3479,34 @@ def pivot_table(`
`3479`	`3479`	`]=None,`
`3480`	`3480`	`columns:typing.Union[blocks.Label,Sequence[blocks.Label]]=None,`
`3481`	`3481`	`aggfunc:str="mean",`
	`3482`	`+fill_value=None,`
	`3483`	`+margins:bool=False,`
	`3484`	`+dropna:bool=True,`
	`3485`	`+margins_name:Hashable="All",`
	`3486`	`+observed:bool=False,`
	`3487`	`+sort:bool=True,`
`3482`	`3488`	`)->DataFrame:`
	`3489`	`+iffill_valueisnotNone:`
	`3490`	`+raiseNotImplementedError(`
	`3491`	`+"DataFrame.pivot_table fill_value arg not supported. {constants.FEEDBACK_LINK}"`
	`3492`	`+ )`
	`3493`	`+ifmargins:`
	`3494`	`+raiseNotImplementedError(`
	`3495`	`+"DataFrame.pivot_table margins arg not supported. {constants.FEEDBACK_LINK}"`
	`3496`	`+ )`
	`3497`	`+ifnotdropna:`
	`3498`	`+raiseNotImplementedError(`
	`3499`	`+"DataFrame.pivot_table dropna arg not supported. {constants.FEEDBACK_LINK}"`
	`3500`	`+ )`
	`3501`	`+ifmargins_name!="All":`
	`3502`	`+raiseNotImplementedError(`
	`3503`	`+"DataFrame.pivot_table margins_name arg not supported. {constants.FEEDBACK_LINK}"`
	`3504`	`+ )`
	`3505`	`+ifobserved:`
	`3506`	`+raiseNotImplementedError(`
	`3507`	`+"DataFrame.pivot_table observed arg not supported. {constants.FEEDBACK_LINK}"`
	`3508`	`+ )`
	`3509`	`+`
`3483`	`3510`	`ifisinstance(index,Iterable)andnot (`
`3484`	`3511`	`isinstance(index,blocks.Label)andindexinself.columns`
`3485`	`3512`	`):`
`@@ -3521,7 +3548,9 @@ def pivot_table(`
`3521`	`3548`	`columns=columns,`
`3522`	`3549`	`index=index,`
`3523`	`3550`	`values=valuesiflen(values)>1elseNone,`
`3524`		`- ).sort_index()`
	`3551`	`+ )`
	`3552`	`+ifsort:`
	`3553`	`+pivoted=pivoted.sort_index()`
`3525`	`3554`
`3526`	`3555`	`# TODO: Remove the reordering step once the issue is resolved.`
`3527`	`3556`	`# The pivot_table method results in multi-index columns that are always ordered.`

`‎bigframes/pandas/__init__.py‎`

Lines changed: 2 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@`
`31`	`31`	`importbigframes.core.blocks`
`32`	`32`	`importbigframes.core.global_sessionasglobal_session`
`33`	`33`	`importbigframes.core.indexes`
`34`		`-frombigframes.core.reshape.apiimportconcat,cut,get_dummies,merge,qcut`
	`34`	`+frombigframes.core.reshape.apiimportconcat,crosstab,cut,get_dummies,merge,qcut`
`35`	`35`	`importbigframes.core.tools`
`36`	`36`	`importbigframes.dataframe`
`37`	`37`	`importbigframes.enums`
`@@ -372,6 +372,7 @@ def reset_session():`
`372`	`372`	`_functions= [`
`373`	`373`	`clean_up_by_session_id,`
`374`	`374`	`concat,`
	`375`	`+crosstab,`
`375`	`376`	`cut,`
`376`	`377`	`deploy_remote_function,`
`377`	`378`	`deploy_udf,`

`‎bigframes/session/__init__.py‎`

Lines changed: 15 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -2312,6 +2312,21 @@ def cut(self, *args, **kwargs) -> bigframes.series.Series:`
`2312`	`2312`	`**kwargs,`
`2313`	`2313`	`)`
`2314`	`2314`
	`2315`	`+defcrosstab(self,args,*kwargs)->dataframe.DataFrame:`
	`2316`	`+"""Compute a simple cross tabulation of two (or more) factors.`
	`2317`	`+`
	`2318`	`+ Included for compatibility between bpd and Session.`
	`2319`	`+`
	`2320`	+ See :func:`bigframes.pandas.crosstab` for full documentation.
	`2321`	`+ """`
	`2322`	`+importbigframes.core.reshape.pivot`
	`2323`	`+`
	`2324`	`+returnbigframes.core.reshape.pivot.crosstab(`
	`2325`	`+*args,`
	`2326`	`+session=self,`
	`2327`	`+**kwargs,`
	`2328`	`+ )`
	`2329`	`+`
`2315`	`2330`	`defDataFrame(self,args,*kwargs):`
`2316`	`2331`	`"""Constructs a DataFrame.`
`2317`	`2332`

`‎tests/system/small/test_pandas.py‎`

Lines changed: 66 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -450,6 +450,72 @@ def test_merge_raises_error_when_left_right_on_set(scalars_dfs):`
`450`	`450`	`)`
`451`	`451`
`452`	`452`
	`453`	`+deftest_crosstab_aligned_series(scalars_dfs):`
	`454`	`+scalars_df,scalars_pandas_df=scalars_dfs`
	`455`	`+`
	`456`	`+pd_result=pd.crosstab(`
	`457`	`+scalars_pandas_df["int64_col"],scalars_pandas_df["int64_too"]`
	`458`	`+ )`
	`459`	`+bf_result=bpd.crosstab(`
	`460`	`+scalars_df["int64_col"],scalars_df["int64_too"]`
	`461`	`+ ).to_pandas()`
	`462`	`+`
	`463`	`+assert_pandas_df_equal(bf_result,pd_result,check_dtype=False)`
	`464`	`+`
	`465`	`+`
	`466`	`+deftest_crosstab_nondefault_func(scalars_dfs):`
	`467`	`+scalars_df,scalars_pandas_df=scalars_dfs`
	`468`	`+`
	`469`	`+pd_result=pd.crosstab(`
	`470`	`+scalars_pandas_df["int64_col"],`
	`471`	`+scalars_pandas_df["int64_too"],`
	`472`	`+values=scalars_pandas_df["float64_col"],`
	`473`	`+aggfunc="mean",`
	`474`	`+ )`
	`475`	`+bf_result=bpd.crosstab(`
	`476`	`+scalars_df["int64_col"],`
	`477`	`+scalars_df["int64_too"],`
	`478`	`+values=scalars_df["float64_col"],`
	`479`	`+aggfunc="mean",`
	`480`	`+ ).to_pandas()`
	`481`	`+`
	`482`	`+assert_pandas_df_equal(bf_result,pd_result,check_dtype=False)`
	`483`	`+`
	`484`	`+`
	`485`	`+deftest_crosstab_multi_cols(scalars_dfs):`
	`486`	`+scalars_df,scalars_pandas_df=scalars_dfs`
	`487`	`+`
	`488`	`+pd_result=pd.crosstab(`
	`489`	`+ [scalars_pandas_df["int64_col"],scalars_pandas_df["bool_col"]],`
	`490`	`+ [scalars_pandas_df["int64_too"],scalars_pandas_df["string_col"]],`
	`491`	`+rownames=["a","b"],`
	`492`	`+colnames=["c","d"],`
	`493`	`+ )`
	`494`	`+bf_result=bpd.crosstab(`
	`495`	`+ [scalars_df["int64_col"],scalars_df["bool_col"]],`
	`496`	`+ [scalars_df["int64_too"],scalars_df["string_col"]],`
	`497`	`+rownames=["a","b"],`
	`498`	`+colnames=["c","d"],`
	`499`	`+ ).to_pandas()`
	`500`	`+`
	`501`	`+assert_pandas_df_equal(bf_result,pd_result,check_dtype=False)`
	`502`	`+`
	`503`	`+`
	`504`	`+deftest_crosstab_unaligned_series(scalars_dfs,session):`
	`505`	`+scalars_df,scalars_pandas_df=scalars_dfs`
	`506`	`+other_pd_series=pd.Series(`
	`507`	`+ [10,20,10,30,10],index=[5,4,1,2,3],dtype="Int64",name="nums"`
	`508`	`+ )`
	`509`	`+other_bf_series=session.Series(`
	`510`	`+ [10,20,10,30,10],index=[5,4,1,2,3],name="nums"`
	`511`	`+ )`
	`512`	`+`
	`513`	`+pd_result=pd.crosstab(scalars_pandas_df["int64_col"],other_pd_series)`
	`514`	`+bf_result=bpd.crosstab(scalars_df["int64_col"],other_bf_series).to_pandas()`
	`515`	`+`
	`516`	`+assert_pandas_df_equal(bf_result,pd_result,check_dtype=False)`
	`517`	`+`
	`518`	`+`
`453`	`519`	`def_convert_pandas_category(pd_s:pd.Series):`
`454`	`520`	`"""`
`455`	`521`	`Transforms a pandas Series with Categorical dtype into a bigframes-compatible`

`‎third_party/bigframes_vendored/pandas/core/reshape/pivot.py‎`

Lines changed: 57 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,57 @@`
	`1`	`+# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/pivot.py`
	`2`	`+from __future__importannotations`
	`3`	`+`
	`4`	`+frombigframesimportconstants`
	`5`	`+`
	`6`	`+`
def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
):
    """
    Compute a simple cross tabulation of two (or more) factors.

    By default, computes a frequency table of the factors unless an
    array of values and an aggregation function are passed.

    Examples:
        >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
        ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
        >>> b = np.array(["one", "one", "one", "two", "one", "one",
        ...               "one", "two", "two", "two", "one"], dtype=object)
        >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
        ...               "shiny", "dull", "shiny", "shiny", "shiny"],
        ...              dtype=object)
        >>> bpd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
        b    one        two
        c   dull shiny dull shiny
        a
        bar    1     2    1     0
        foo    2     2    1     2
        <BLANKLINE>
        [2 rows x 4 columns]

    Args:
        index (array-like, Series, or list of arrays/Series):
            Values to group by in the rows.
        columns (array-like, Series, or list of arrays/Series):
            Values to group by in the columns.
        values (array-like, optional):
            Array of values to aggregate according to the factors.
            Requires `aggfunc` be specified.
        rownames (sequence, default None):
            If passed, must match number of row arrays passed.
        colnames (sequence, default None):
            If passed, must match number of column arrays passed.
        aggfunc (function, optional):
            If specified, requires `values` be specified as well.

    Returns:
        DataFrame:
            Cross tabulation of the data.
    """
    # NOTE(review): the column spacing of the example output above was
    # reconstructed from a whitespace-collapsed copy -- confirm it against the
    # actual repr before relying on it as a doctest.
    # This vendored stub only carries the signature and documentation; it
    # always raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitc62e553

File tree

7 files changed

7 files changed

`‎bigframes/core/reshape/api.py‎`

`‎bigframes/core/reshape/pivot.py‎`

`‎bigframes/dataframe.py‎`

`‎bigframes/pandas/__init__.py‎`

`‎bigframes/session/__init__.py‎`

`‎tests/system/small/test_pandas.py‎`

`‎third_party/bigframes_vendored/pandas/core/reshape/pivot.py‎`

0 commit comments

Movatterモバイル変換

File tree

7 files changed

7 files changed

‎bigframes/core/reshape/api.py‎

‎bigframes/core/reshape/pivot.py‎

‎bigframes/dataframe.py‎

‎bigframes/pandas/__init__.py‎

‎bigframes/session/__init__.py‎

‎tests/system/small/test_pandas.py‎

‎third_party/bigframes_vendored/pandas/core/reshape/pivot.py‎

0 commit comments

`‎bigframes/core/reshape/api.py‎`

`‎bigframes/core/reshape/pivot.py‎`

`‎bigframes/dataframe.py‎`

`‎bigframes/pandas/__init__.py‎`

`‎bigframes/session/__init__.py‎`

`‎tests/system/small/test_pandas.py‎`

`‎third_party/bigframes_vendored/pandas/core/reshape/pivot.py‎`