Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Build a DataFrame of (name, business-array, total) rows, spread each group's
# summed total evenly across its businesses when there is more than one, then
# explode the business array into one row per business (keeping empty arrays).
from pyspark.sql.types import (
    StructType, StructField, StringType, ArrayType, IntegerType,
)
import pyspark.sql.functions as F

schema = StructType([
    StructField('name', StringType(), nullable=False),
    StructField('business', ArrayType(StringType())),
    StructField('total', IntegerType(), nullable=False),
])

# Sample rows: must be a list of row-lists (the original built a bare tuple of
# rows, which is not what createDataFrame expects for row data).
test_list = [
    ['a', ['Hello', 'world'], 200],
    ['a', ['I', 'am', 'fine'], 300],
    ['b', [], 500],
]

# NOTE(review): assumes a SparkSession named `spark` exists in scope
# (spark-shell / notebook environment) — confirm before running standalone.
df = spark.createDataFrame(test_list, schema=schema)

# Per (name, business-array) group: divide the summed total across the array's
# entries when it holds more than one element; otherwise keep the plain sum.
# (F.sum / F.size — the uppercase F.SUM in the pasted original does not exist.)
df1 = df.groupBy('name', 'business').agg(
    F.when(F.size('business') > 1, F.sum('total') / F.size('business'))
     .otherwise(F.sum('total'))
     .alias('total')
)

# explode_outer keeps rows whose array is empty (business becomes NULL)
# instead of dropping them, unlike plain explode.
df1 = df1.withColumn('business', F.explode_outer('business'))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement