Your answers are all too complicated. Actually there is a very simple method.
# Cargo manifest for the example crate.
[package]
name = "pol"
version = "0.1.0"
edition = "2021"
# The three polars crates below are all pinned to the same release (0.43.0).
[dependencies]
# Feature flags enabled on the main `polars` crate.
polars = {version="0.43.0",features=["mode","polars-io","csv","polars-ops","lazy","docs-selection","streaming","regex","temporal","is_unique","is_between","dtype-date","dtype-datetime","dtype-time","dtype-duration","dtype-categorical","rows","is_in","pivot"]}
polars-io = "0.43.0"
polars-lazy = "0.43.0"
For a `DataFrame` (eager API):
Just call `df.group_by(["date"])?.apply(user_defined_function)?`, as in this example:
// Sample data: one score per employee for each of four months (12 rows).
// The frame is only read afterwards, so the binding does not need `mut`.
let employee_df: DataFrame = df!(
    "Name" => ["老李", "老李", "老李", "老李", "老张", "老张", "老张", "老张", "老王", "老王", "老王", "老王"],
    "employee_ID" => ["员工01", "员工01", "员工01", "员工01", "员工02", "员工02", "员工02", "员工02", "员工03", "员工03", "员工03", "员工03"],
    "date" => ["8月", "9月", "10月", "11月", "8月", "9月", "10月", "11月", "8月", "9月", "10月", "11月"],
    "score" => [83, 24, 86, 74, 89, 59, 48, 79, 51, 71, 44, 90]
)?;
// Per-group UDF for `group_by(["date"])?.apply(...)`.
//
// Receives the rows of one `date` group as a `DataFrame` and returns a
// `DataFrame` for that group. The result may have several rows and columns,
// not just a single aggregated value, but every group must return the same
// schema (column order, names and dtypes).
//
// Errors: propagates any `PolarsError` from column lookup, and returns a
// `ComputeError` when the group's `date` key is null.
let user_defined_function = |x: DataFrame| -> Result<DataFrame, PolarsError> {
    // Columns available to the real computation; unused in this placeholder.
    let _col1: &Series = x.column("Name")?;
    let _col2: &Series = x.column("employee_ID")?;
    let _col3: &Series = x.column("score")?;

    // Every row of the group shares the same key, so the first value identifies it.
    // A null key is reported as an error instead of panicking.
    let group_id = x
        .column("date")?
        .str()?
        .get(0)
        .ok_or_else(|| PolarsError::ComputeError("group key `date` is null".into()))?;

    // Please do something here; the placeholder results below stand in for it.
    let group_field = Series::new("group".into(), vec![group_id; 3]);
    let res_field1 = Series::new("field1".into(), vec!["a1,1", "a2,1", "a3,1"]);
    let res_field2 = Series::new("field2".into(), vec!["a1,2", "a2,2", "a3,2"]);
    let res_field3 = Series::new("field3".into(), vec!["a1,3", "a2,3", "a3,3"]);

    DataFrame::new(vec![group_field, res_field1, res_field2, res_field3])
};
// Run the UDF once per `date` group. Each call returns several rows and
// columns, and the per-group frames end up in the single frame printed here.
let per_date = employee_df.group_by(["date"])?;
let res = per_date.apply(user_defined_function)?;
println!("{res}");
The output:
shape: (12, 4)
┌───────┬────────┬────────┬────────┐
│ group ┆ field1 ┆ field2 ┆ field3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str │
╞═══════╪════════╪════════╪════════╡
│ 8月 ┆ a1,1 ┆ a1,2 ┆ a1,3 │
│ 8月 ┆ a2,1 ┆ a2,2 ┆ a2,3 │
│ 8月 ┆ a3,1 ┆ a3,2 ┆ a3,3 │
│ 9月 ┆ a1,1 ┆ a1,2 ┆ a1,3 │
│ 9月 ┆ a2,1 ┆ a2,2 ┆ a2,3 │
│ … ┆ … ┆ … ┆ … │
│ 10月 ┆ a2,1 ┆ a2,2 ┆ a2,3 │
│ 10月 ┆ a3,1 ┆ a3,2 ┆ a3,3 │
│ 11月 ┆ a1,1 ┆ a1,2 ┆ a1,3 │
│ 11月 ┆ a2,1 ┆ a2,2 ┆ a2,3 │
│ 11月 ┆ a3,1 ┆ a3,2 ┆ a3,3 │
└───────┴────────┴────────┴────────┘
For a `LazyFrame`:
In the `lazy().group_by(...).agg(...)` expression context,
just use `col("score").apply_many(...)`:
use polars::prelude::*;
// Sample data: one score per employee for each of four months (12 rows).
// The frame is only consumed by `.lazy()`, so the binding does not need `mut`.
let employee_df: DataFrame = df!(
    "Name" => ["老李", "老李", "老李", "老李", "老张", "老张", "老张", "老张", "老王", "老王", "老王", "老王"],
    "employee_ID" => ["员工01", "员工01", "员工01", "员工01", "员工02", "员工02", "员工02", "员工02", "员工03", "员工03", "员工03", "员工03"],
    "date" => ["8月", "9月", "10月", "11月", "8月", "9月", "10月", "11月", "8月", "9月", "10月", "11月"],
    "score" => [83, 24, 86, 74, 89, 59, 48, 79, 51, 71, 44, 90]
)?;
// UDF passed to `apply_many` inside a lazy `group_by(...).agg(...)`.
//
// Takes the input `Series` of one group and returns one `Series` for it.
// A two-dimensional per-group result (several rows and columns) has to be
// nested in a `StructChunked` so that it fits into that one `Series`.
// Every group must produce the same struct schema (field order, names and dtypes).
let user_defined_function = |inputs: &mut [Series]| -> Result<Option<Series>, PolarsError> {
    // The first three inputs, available to the real computation; unused in
    // this placeholder.
    let _first = &inputs[0];
    let _second = &inputs[1];
    let _third = &inputs[2];

    // Please do something here; the placeholder columns below stand in for it.
    let fields = [
        Series::new("rank".into(), vec!["field1,row[10]", "row[11]", "row[12]"]),
        Series::new("rank2".into(), vec!["field2,row[20]", "row[21]", "row[22]"]),
        Series::new("rank3".into(), vec![1, 2, 3]),
    ];

    // Pack the columns into a single struct-typed Series.
    let packed = StructChunked::from_series("res".into(), &fields)?.into_series();
    println!("res = {packed}");
    Ok(Some(packed))
};
// Dtype of the value the UDF returns for each group: a struct whose fields
// mirror the `StructChunked` built in `user_defined_function`
// (`rank`: str, `rank2`: str, `rank3`: i32).
// The original author observed that `collect()` also succeeds with an
// unrelated dtype such as `Boolean`. Declaring the real dtype keeps the
// declared output type in agreement with the data actually produced.
let sc = DataType::Struct(vec![
    Field::new("rank".into(), DataType::String),
    Field::new("rank2".into(), DataType::String),
    Field::new("rank3".into(), DataType::Int32),
]);
let output_type = GetOutput::from_type(sc);

// Group by `date` and run the UDF once per group.
let res = employee_df
    .lazy()
    .group_by([col("date")])
    .agg([col("score").apply_many(
        user_defined_function,
        &[col("Name"), col("employee_ID"), col("score")],
        output_type,
    )])
    .collect()?;

// `explode` and then `unnest` unpack the per-group struct values in the
// `score` column back into ordinary rows and columns.
println!("{}", res.explode(["score"])?.unnest(["score"])?);
The output:
shape: (12, 4)
┌──────┬────────────────┬────────────────┬───────┐
│ date ┆ rank ┆ rank2 ┆ rank3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ i32 │
╞══════╪════════════════╪════════════════╪═══════╡
│ 10月 ┆ field1,row[10] ┆ field2,row[20] ┆ 1 │
│ 10月 ┆ row[11] ┆ row[21] ┆ 2 │
│ 10月 ┆ row[12] ┆ row[22] ┆ 3 │
│ 8月 ┆ field1,row[10] ┆ field2,row[20] ┆ 1 │
│ 8月 ┆ row[11] ┆ row[21] ┆ 2 │
│ … ┆ … ┆ … ┆ … │
│ 11月 ┆ row[11] ┆ row[21] ┆ 2 │
│ 11月 ┆ row[12] ┆ row[22] ┆ 3 │
│ 9月 ┆ field1,row[10] ┆ field2,row[20] ┆ 1 │
│ 9月 ┆ row[11] ┆ row[21] ┆ 2 │
│ 9月 ┆ row[12] ┆ row[22] ┆ 3 │
└──────┴────────────────┴────────────────┴───────┘