Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c62e553

Browse files
feat: Add bigframes.pandas.crosstab (#2231)
1 parent 44e9869 commit c62e553

File tree

7 files changed

+261
-3
lines changed

7 files changed

+261
-3
lines changed

‎bigframes/core/reshape/api.py‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
frombigframes.core.reshape.concatimportconcat
1616
frombigframes.core.reshape.encodingimportget_dummies
1717
frombigframes.core.reshape.mergeimportmerge
18+
frombigframes.core.reshape.pivotimportcrosstab
1819
frombigframes.core.reshape.tileimportcut,qcut
1920

20-
__all__= ["concat","get_dummies","merge","cut","qcut"]
21+
# Names exported by this module; each one is imported above from its reshape submodule.
__all__= ["concat","get_dummies","merge","cut","qcut","crosstab"]

‎bigframes/core/reshape/pivot.py‎

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from __future__importannotations
15+
16+
fromtypingimportOptional,TYPE_CHECKING
17+
18+
importbigframes_vendored.pandas.core.reshape.pivotasvendored_pandas_pivot
19+
importpandasaspd
20+
21+
importbigframes
22+
frombigframes.coreimportconvert,utils
23+
frombigframes.core.reshapeimportconcat
24+
frombigframes.dataframeimportDataFrame
25+
26+
ifTYPE_CHECKING:
27+
importbigframes.session
28+
29+
30+
# Cross-tabulate one or more row factors against one or more column factors.
#
# NOTE: the user-facing docstring is assigned at the bottom of this module from
# the vendored pandas stub, so implementation notes live in comments here.
#
# Args:
#     index / columns: a single array-like/Series, or a list of them.
#     values: optional array-like to aggregate; must be paired with ``aggfunc``.
#     rownames / colnames: optional names for the resulting index / column levels.
#     aggfunc: optional aggregation name; must be paired with ``values``.
#     session: optional session forwarded to ``convert.to_bf_series``.
#
# Returns:
#     DataFrame: the pivoted table (missing cells filled with 0 for plain counts).
#
# Raises:
#     ValueError: if exactly one of ``values`` / ``aggfunc`` is given.
def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
    *,
    session: Optional[bigframes.session.Session] = None,
) -> DataFrame:
    # The documented contract requires `values` and `aggfunc` to be given
    # together. Fail fast, before any conversion work, instead of silently
    # returning counts or aggregating a constant column.
    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")
    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    index = _crosstab_factors(index, session)
    columns = _crosstab_factors(columns, session)

    df = concat.concat([*index, *columns], join="inner", axis=1)
    # Temporary labels guarantee uniqueness even when the input Series share
    # a name (or have no name at all).
    tmp_index_names = [f"_crosstab_index_{i}" for i in range(len(index))]
    tmp_col_names = [f"_crosstab_columns_{i}" for i in range(len(columns))]
    df.columns = pd.Index([*tmp_index_names, *tmp_col_names])

    # Without explicit values, a constant column gives "count" something to count.
    values = (
        convert.to_bf_series(values, default_index=df.index, session=session)
        if values is not None
        else 0
    )

    df["_crosstab_values"] = values
    pivot_table = df.pivot_table(
        values="_crosstab_values",
        index=tmp_index_names,
        columns=tmp_col_names,
        aggfunc=aggfunc or "count",
        sort=False,
    )
    # Swap the temporary labels for the caller-supplied or original names.
    pivot_table.index.names = rownames or [i.name for i in index]
    pivot_table.columns.names = colnames or [c.name for c in columns]
    if aggfunc is None:
        # TODO: Push this into pivot_table itself
        pivot_table = pivot_table.fillna(0)
    return pivot_table


def _crosstab_factors(data, session) -> list:
    """Convert one factor, or a list of factors, into a list of BigFrames Series."""
    factors = data if _is_list_of_lists(data) else [data]
    return [
        convert.to_bf_series(factor, default_index=None, session=session)
        for factor in factors
    ]
81+
82+
83+
def _is_list_of_lists(item) -> bool:
    """Return True when ``item`` is list-like and every element can become a Series.

    Used to tell a single factor apart from a list of factors.
    """
    return (
        all(convert.can_convert_to_series(element) for element in item)
        if utils.is_list_like(item)
        else False
    )
87+
88+
89+
# Reuse the docstring of the vendored pandas crosstab stub for this implementation.
crosstab.__doc__=vendored_pandas_pivot.crosstab.__doc__

‎bigframes/dataframe.py‎

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3479,7 +3479,34 @@ def pivot_table(
34793479
]=None,
34803480
columns:typing.Union[blocks.Label,Sequence[blocks.Label]]=None,
34813481
aggfunc:str="mean",
3482+
fill_value=None,
3483+
margins:bool=False,
3484+
dropna:bool=True,
3485+
margins_name:Hashable="All",
3486+
observed:bool=False,
3487+
sort:bool=True,
34823488
)->DataFrame:
3489+
iffill_valueisnotNone:
3490+
raiseNotImplementedError(
3491+
"DataFrame.pivot_table fill_value arg not supported. {constants.FEEDBACK_LINK}"
3492+
)
3493+
ifmargins:
3494+
raiseNotImplementedError(
3495+
"DataFrame.pivot_table margins arg not supported. {constants.FEEDBACK_LINK}"
3496+
)
3497+
ifnotdropna:
3498+
raiseNotImplementedError(
3499+
"DataFrame.pivot_table dropna arg not supported. {constants.FEEDBACK_LINK}"
3500+
)
3501+
ifmargins_name!="All":
3502+
raiseNotImplementedError(
3503+
"DataFrame.pivot_table margins_name arg not supported. {constants.FEEDBACK_LINK}"
3504+
)
3505+
ifobserved:
3506+
raiseNotImplementedError(
3507+
"DataFrame.pivot_table observed arg not supported. {constants.FEEDBACK_LINK}"
3508+
)
3509+
34833510
ifisinstance(index,Iterable)andnot (
34843511
isinstance(index,blocks.Label)andindexinself.columns
34853512
):
@@ -3521,7 +3548,9 @@ def pivot_table(
35213548
columns=columns,
35223549
index=index,
35233550
values=valuesiflen(values)>1elseNone,
3524-
).sort_index()
3551+
)
3552+
ifsort:
3553+
pivoted=pivoted.sort_index()
35253554

35263555
# TODO: Remove the reordering step once the issue is resolved.
35273556
# The pivot_table method results in multi-index columns that are always ordered.

‎bigframes/pandas/__init__.py‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
importbigframes.core.blocks
3232
importbigframes.core.global_sessionasglobal_session
3333
importbigframes.core.indexes
34-
frombigframes.core.reshape.apiimportconcat,cut,get_dummies,merge,qcut
34+
frombigframes.core.reshape.apiimportconcat,crosstab,cut,get_dummies,merge,qcut
3535
importbigframes.core.tools
3636
importbigframes.dataframe
3737
importbigframes.enums
@@ -372,6 +372,7 @@ def reset_session():
372372
_functions= [
373373
clean_up_by_session_id,
374374
concat,
375+
crosstab,
375376
cut,
376377
deploy_remote_function,
377378
deploy_udf,

‎bigframes/session/__init__.py‎

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2312,6 +2312,21 @@ def cut(self, *args, **kwargs) -> bigframes.series.Series:
23122312
**kwargs,
23132313
)
23142314

2315+
def crosstab(self, *args, **kwargs) -> dataframe.DataFrame:
    """Compute a simple cross tabulation of two (or more) factors.

    Included for compatibility between bpd and Session.

    See :func:`bigframes.pandas.crosstab` for full documentation.
    """
    # Imported at call time rather than at module load.
    # NOTE(review): presumably to avoid an import cycle — confirm.
    import bigframes.core.reshape.pivot as pivot_module

    # Forward everything unchanged, binding the computation to this session.
    return pivot_module.crosstab(*args, session=self, **kwargs)
2329+
23152330
defDataFrame(self,*args,**kwargs):
23162331
"""Constructs a DataFrame.
23172332

‎tests/system/small/test_pandas.py‎

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,72 @@ def test_merge_raises_error_when_left_right_on_set(scalars_dfs):
450450
)
451451

452452

453+
def test_crosstab_aligned_series(scalars_dfs):
    """crosstab of two columns from the same frame matches pandas."""
    bf_df, pd_df = scalars_dfs

    expected = pd.crosstab(pd_df["int64_col"], pd_df["int64_too"])
    actual = bpd.crosstab(bf_df["int64_col"], bf_df["int64_too"]).to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
464+
465+
466+
def test_crosstab_nondefault_func(scalars_dfs):
    """crosstab with explicit values and a "mean" aggfunc matches pandas."""
    bf_df, pd_df = scalars_dfs

    expected = pd.crosstab(
        pd_df["int64_col"],
        pd_df["int64_too"],
        values=pd_df["float64_col"],
        aggfunc="mean",
    )
    actual = bpd.crosstab(
        bf_df["int64_col"],
        bf_df["int64_too"],
        values=bf_df["float64_col"],
        aggfunc="mean",
    ).to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
483+
484+
485+
def test_crosstab_multi_cols(scalars_dfs):
    """crosstab with two row factors, two column factors and custom names matches pandas."""
    bf_df, pd_df = scalars_dfs

    expected = pd.crosstab(
        [pd_df["int64_col"], pd_df["bool_col"]],
        [pd_df["int64_too"], pd_df["string_col"]],
        rownames=["a", "b"],
        colnames=["c", "d"],
    )
    actual = bpd.crosstab(
        [bf_df["int64_col"], bf_df["bool_col"]],
        [bf_df["int64_too"], bf_df["string_col"]],
        rownames=["a", "b"],
        colnames=["c", "d"],
    ).to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
502+
503+
504+
def test_crosstab_unaligned_series(scalars_dfs, session):
    """crosstab against a Series built with its own, differently ordered index matches pandas."""
    bf_df, pd_df = scalars_dfs

    # The second factor is constructed separately so its index is not the
    # scalars frame's index.
    pd_nums = pd.Series(
        [10, 20, 10, 30, 10], index=[5, 4, 1, 2, 3], dtype="Int64", name="nums"
    )
    bf_nums = session.Series([10, 20, 10, 30, 10], index=[5, 4, 1, 2, 3], name="nums")

    expected = pd.crosstab(pd_df["int64_col"], pd_nums)
    actual = bpd.crosstab(bf_df["int64_col"], bf_nums).to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
517+
518+
453519
def_convert_pandas_category(pd_s:pd.Series):
454520
"""
455521
Transforms a pandas Series with Categorical dtype into a bigframes-compatible
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/pivot.py
2+
from __future__importannotations
3+
4+
frombigframesimportconstants
5+
6+
7+
def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
):
    """
    Compute a simple cross tabulation of two (or more) factors.

    By default, computes a frequency table of the factors unless an
    array of values and an aggregation function are passed.

    **Examples:**

        >>> import numpy as np
        >>> import bigframes.pandas as bpd

        >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
        ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
        >>> b = np.array(["one", "one", "one", "two", "one", "one",
        ...               "one", "two", "two", "two", "one"], dtype=object)
        >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
        ...               "shiny", "dull", "shiny", "shiny", "shiny"],
        ...              dtype=object)
        >>> bpd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])  # doctest: +NORMALIZE_WHITESPACE
        b    one        two
        c   dull shiny dull shiny
        a
        bar    1     2    1     0
        foo    2     2    1     2
        <BLANKLINE>
        [2 rows x 4 columns]

    Args:
        index (array-like, Series, or list of arrays/Series):
            Values to group by in the rows.
        columns (array-like, Series, or list of arrays/Series):
            Values to group by in the columns.
        values (array-like, optional):
            Array of values to aggregate according to the factors.
            Requires `aggfunc` be specified.
        rownames (sequence, default None):
            If passed, must match number of row arrays passed.
        colnames (sequence, default None):
            If passed, must match number of column arrays passed.
        aggfunc (function, optional):
            If specified, requires `values` be specified as well.

    Returns:
        DataFrame:
            Cross tabulation of the data.
    """
    # Documentation-only stub: this function always raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp