import pandas as pd
df = pd.DataFrame(dict(A=[1,1,2,2,3], B=[4,4,5,5,6]))
def f(x):
    return x['A'] < 'taco'
df.groupby('A').apply(f)
Running this results in three exceptions stacked on top of each other (see below). This is particularly unpleasant for less experienced users because the bottom-most error is KeyError: 'A',
which is very confusing given that column 'A' clearly exists in df. The actually relevant exception is the first of the three: the TypeError raised by the invalid comparison inside f.
TypeError Traceback (most recent call last)
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/groupby/groupby.py:1567, in GroupBy.apply(self, func, *args, **kwargs)
1564 with rewrite_warning(
1565 old_msg, FutureWarning, new_msg
1566 ) if is_np_func else nullcontext():
-> 1567 result = self._python_apply_general(f, self._selected_obj)
1568 except TypeError:
1569 # gh-20949
1570 # try again, with .apply acting as a filtering
(...)
1574 # fails on *some* columns, e.g. a numeric operation
1575 # on a string grouper column
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/groupby/groupby.py:1629, in GroupBy._python_apply_general(self, f, data, not_indexed_same, is_transform, is_agg)
1601 """
1602 Apply function f in python space
1603
(...)
1627 data after applying f
1628 """
-> 1629 values, mutated = self.grouper.apply(f, data, self.axis)
1630 if not_indexed_same is None:
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/groupby/ops.py:839, in BaseGrouper.apply(self, f, data, axis)
838 group_axes = group.axes
--> 839 res = f(group)
840 if not mutated and not _is_indexed_like(res, group_axes, axis):
Cell In[29], line 4, in f(x)
3 def f(x):
----> 4 return x['A'] < 'taco'
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/ops/common.py:72, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
70 other = item_from_zerodim(other)
---> 72 return method(self, other)
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/arraylike.py:50, in OpsMixin.__lt__(self, other)
48 @unpack_zerodim_and_defer("__lt__")
49 def __lt__(self, other):
---> 50 return self._cmp_method(other, operator.lt)
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/series.py:6243, in Series._cmp_method(self, other, op)
6242 with np.errstate(all="ignore"):
-> 6243 res_values = ops.comparison_op(lvalues, rvalues, op)
6245 return self._construct_result(res_values, name=res_name)
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/ops/array_ops.py:284, in comparison_op(left, right, op)
282 elif is_numeric_v_string_like(lvalues, rvalues):
283 # GH#36377 going through the numexpr path would incorrectly raise
--> 284 return invalid_comparison(lvalues, rvalues, op)
286 elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str):
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/ops/invalid.py:36, in invalid_comparison(left, right, op)
35 typ = type(right).__name__
---> 36 raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
37 return res_values
TypeError: Invalid comparison between dtype=int64 and str
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/_libs/index.pyx:138, in pandas._libs.index.IndexEngine.get_loc()
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/_libs/index.pyx:165, in pandas._libs.index.IndexEngine.get_loc()
File pandas/_libs/hashtable_class_helper.pxi:5745, in pandas._libs.hashtable.PyObjectHashTable.get_item()
File pandas/_libs/hashtable_class_helper.pxi:5753, in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'A'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In[29], line 5
3 def f(x):
4 return x['A'] < 'taco'
----> 5 df.groupby('A').apply(f)
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/groupby/groupby.py:1588, in GroupBy.apply(self, func, *args, **kwargs)
1580 new_msg = (
1581 f"The operation {orig_func} failed on a column. If any error "
1582 f"is raised, this will raise an exception in a future version "
1583 f"of pandas. Drop these columns to avoid this warning."
1584 )
1585 with rewrite_warning(
1586 old_msg, FutureWarning, new_msg
1587 ) if is_np_func else nullcontext():
-> 1588 return self._python_apply_general(f, self._selected_obj)
1590 return result
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/groupby/groupby.py:1629, in GroupBy._python_apply_general(self, f, data, not_indexed_same, is_transform, is_agg)
1592 @final
1593 def _python_apply_general(
1594 self,
(...)
1599 is_agg: bool = False,
1600 ) -> NDFrameT:
1601 """
1602 Apply function f in python space
1603
(...)
1627 data after applying f
1628 """
-> 1629 values, mutated = self.grouper.apply(f, data, self.axis)
1630 if not_indexed_same is None:
1631 not_indexed_same = mutated or self.mutated
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/groupby/ops.py:839, in BaseGrouper.apply(self, f, data, axis)
837 # group might be modified
838 group_axes = group.axes
--> 839 res = f(group)
840 if not mutated and not _is_indexed_like(res, group_axes, axis):
841 mutated = True
Cell In[29], line 4, in f(x)
3 def f(x):
----> 4 return x['A'] < 'taco'
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/frame.py:3807, in DataFrame.__getitem__(self, key)
3805 if self.columns.nlevels > 1:
3806 return self._getitem_multilevel(key)
-> 3807 indexer = self.columns.get_loc(key)
3808 if is_integer(indexer):
3809 indexer = [indexer]
File ~/.mambaforge/envs/confirm/lib/python3.10/site-packages/pandas/core/indexes/base.py:3804, in Index.get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will raise
3807 # InvalidIndexError. Otherwise we fall through and re-raise
3808 # the TypeError.
3809 self._check_indexing_error(key)
KeyError: 'A'
This seems to happen because the `except TypeError:` clause in GroupBy.apply (https://github.com/pandas-dev/pandas/blob/dba96f97abc96712946067efb63a587e47786caf/pandas/core/groupby/groupby.py#L1437) catches every TypeError, including ones raised inside the user-supplied function.
Thanks for the report! In the code you highlighted, apply first tries the operation with the groupings included and then, when that fails, tries it again without them. (That retry is where the KeyError comes from: the grouping column 'A' is no longer part of the group passed to f, so x['A'] fails.) In my mind this is undesirable behavior, but changing it will need to go through deprecation. I don't recall which issues / PRs this was in, but in the past there was also concern that users might find including the groupings useful (most of groupby does not include them, and I think we should be consistent). So there might not be a clear way forward here. Tangentially related: #49543
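In the meantime, I agree the traceback is undesirable. Perhaps it would be better to do something like the following, so that the retry runs outside the except block and any error it raises is no longer chained onto the original TypeError:
raised_typeerror = False
try:
    ...
except TypeError:
    raised_typeerror = True
if raised_typeerror:
    ...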
I think this could be done without any deprecation. Further suggestions and PRs to fix are welcome!