For Spark 3.1+, there is a Column method, withField, that can be used to update struct fields.
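As a quick, minimal sketch of what the method does on its own (the column s and its fields a/b below are made up purely for illustration):
import pyspark.sql.functions as F
# one-row dataframe with a struct column s containing two string fields
sdf = spark.createDataFrame([(("x", "y"),)], "s struct<a:string,b:string>")
# withField replaces (or adds) a single field inside the struct without rebuilding it
sdf = sdf.withColumn("s", F.col("s").withField("b", F.lit("z")))  # field b is replaced, a is untouched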
Assuming this is your input dataframe (corresponding to the schema you provided):
from pyspark.sql import Row
df = spark.createDataFrame([
    Row(abc=Row(xyz=Row(abc123="value123", services=[Row(type="type1", subtype="subtype1")])))
])
df.show(truncate=False)
#+---------------------------------+
#|abc |
#+---------------------------------+
#|{{value123, [{type1, subtype1}]}}|
#+---------------------------------+
You can achieve that using transform on the array services to hash the field type for each struct element (here I used the xxhash64 function to illustrate), like this:
import pyspark.sql.functions as F
df2 = df.withColumn(
    "abc",
    F.col("abc").withField(
        "xyz",
        F.col("abc.xyz").withField(
            "services",
            F.expr("transform(abc.xyz.services, x -> struct(xxhash64(x.type) as type, x.subtype))")
        )
    )
)
df2.show(truncate=False)
#+-----------------------------------------------+
#|abc |
#+-----------------------------------------------+
#|{{value123, [{2134479862461603894, subtype1}]}}|
#+-----------------------------------------------+
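Note that since Spark 3.1 both transform and xxhash64 are also exposed in the Python API (pyspark.sql.functions.transform / pyspark.sql.functions.xxhash64), so if you'd rather avoid the SQL expression string, the same update can be sketched with a lambda, equivalent to df2 above:
df2 = df.withColumn(
    "abc",
    F.col("abc").withField(
        "xyz",
        F.col("abc.xyz").withField(
            "services",
            F.transform(
                F.col("abc.xyz.services"),
                # x is a Column bound to each struct element of the array
                lambda x: F.struct(
                    F.xxhash64(x["type"]).alias("type"),
                    x["subtype"].alias("subtype"),
                ),
            ),
        ),
    ),
)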
For older Spark versions, you'll need to recreate the whole struct in order to update its fields, which makes it tedious when there are many nested fields. In your case it would be like this:
df2 = df.withColumn(
    "abc",
    F.struct(
        F.struct(
            F.col("abc.xyz.abc123"),
            F.expr(
                "transform(abc.xyz.services, x -> struct(xxhash64(x.type) as type, x.subtype))"
            ).alias("services")
        ).alias("xyz")
    )
)
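In both cases, a quick sanity check on the result helps confirm that only type was changed (it should now be a long, since xxhash64 produces a 64-bit hash) and that no field was dropped or renamed along the way:
df2.printSchema()
df2.select("abc.xyz.services").show(truncate=False)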