I have a data-frame with multiple columns containing list items.
I eventually want to explode the list columns into rows. This will only work if all lists have the same length (per row).
df.explode("a", "b")
For each row, to match the number of elements in the lists, I would like to insert dummy items into the lists.
def generate_dummy(c1, c2):
return pl.lit([""] * (pl.col(c1).cast(pl.Int32) - pl.col(c2).cast(pl.Int32)), dtype=pl.List(pl.String))
# Original dataframe
df = pl.DataFrame({"a": [[1, 2], [3], [4, 5], [1]], "b": [[4, 5, 7], [6], [4, 5], [3, 2]]})
# Collect the list lengths in each column.
df = df.with_columns(alens=pl.col("a").list.len(), blens=pl.col("b").list.len())
### ERROR STEP ###
# Add dummy element [""] where the length is shorter.
df = df.with_columns(
pl.when(pl.col("alens") > pl.col("blens"))
.then(pl.col("b").list.concat(generate_dummy("alens", "blens")))
.otherwise(pl.col("a").list.concat(generate_dummy("blens", "alens")))
)
But I am stuck when counting #'s of dummy
elements to be added.
The error I get,
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[304], line 3
1 df.with_columns(
2 pl.col("a").list.concat(
----> 3 pl.lit(
4 [""] * (pl.col("alens").cast(pl.Int32) - pl.col("blens").cast(pl.Int32)), dtype=pl.List(pl.String)
5 )
6 )
7 )
File ~/data/opt/mambaforge/envs/orbital_py395/lib/python3.9/site-packages/polars/functions/lit.py:130, in lit(value, dtype, allow_object)
127 return lit(pl.Series("literal", [value], dtype=dtype))
129 if dtype:
--> 130 return wrap_expr(plr.lit(value, allow_object)).cast(dtype)
132 try:
133 # numpy literals like np.float32(0) have item/dtype
134 item = value.item()
TypeError: cannot create expression literal for value of type Expr: <Expr ['[(Series[literal]) * ([(col("a…'] at 0x15007DCF7370>
Hint: Pass `allow_object=True` to accept any value and create a literal of type Object.
I tried with kwargs allow_object=True
and end up with the error
---------------------------------------------------------------------------
ComputeError Traceback (most recent call last)
Cell In[305], line 1
----> 1 df.with_columns(
2 pl.col("a").list.concat(
3 pl.lit(
4 [""] * (pl.col("alens").cast(pl.Int32) - pl.col("blens").cast(pl.Int32)), dtype=pl.List(pl.String), allow_object=True
5 )
6 )
7 )
File ~/data/opt/mambaforge/envs/orbital_py395/lib/python3.9/site-packages/polars/dataframe/frame.py:8310, in DataFrame.with_columns(self, *exprs, **named_exprs)
8164 def with_columns(
8165 self,
8166 *exprs: IntoExpr | Iterable[IntoExpr],
8167 **named_exprs: IntoExpr,
8168 ) -> DataFrame:
8169 """
8170 Add columns to this DataFrame.
8171
(...)
8308 └─────┴──────┴─────────────┘
8309 """
-> 8310 return self.lazy().with_columns(*exprs, **named_exprs).collect(_eager=True)
File ~/data/opt/mambaforge/envs/orbital_py395/lib/python3.9/site-packages/polars/lazyframe/frame.py:1816, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, no_optimization, streaming, background, _eager, **_kwargs)
1813 # Only for testing purposes atm.
1814 callback = _kwargs.get("post_opt_callback")
-> 1816 return wrap_df(ldf.collect(callback))
ComputeError: cannot cast 'Object' type