Advertisement
hivefans

Untitled

Nov 26th, 2018
3,425
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
SPARQL 0.60 KB | None | 0 0
  # Build a small (name, business, total) DataFrame, aggregate `total` per
  # (name, business) group, and then emit one output row per business entry.
  #
  # NOTE: `spark` is not defined in this snippet; it is assumed to be an
  # existing SparkSession (for example the one provided by the pyspark shell
  # or a notebook) -- confirm before running this as a standalone script.
  from pyspark.sql.types import (
      ArrayType,
      IntegerType,
      StringType,
      StructField,
      StructType,
  )
  import pyspark.sql.functions as F

  # `name` and `total` are required; `business` is an array of strings and
  # keeps StructField's default nullability.
  schema = StructType([
      StructField('name', StringType(), nullable=False),
      StructField('business', ArrayType(StringType())),
      StructField('total', IntegerType(), nullable=False),
  ])

  # One tuple per row, in schema order: (name, business, total).
  test_list = [
      ('a', ['Hello', 'world'], 200),
      ('a', ['I', 'am', 'fine'], 300),
      ('b', [], 500),
  ]

  df = spark.createDataFrame(test_list, schema=schema)

  # `business` is a grouping column, so it may be referenced inside agg().
  business_count = F.size('business')
  group_total = F.sum('total')

  # When a group has more than one business entry, divide the summed total
  # evenly between the entries; otherwise keep the summed total unchanged.
  total_per_business = (
      F.when(business_count > 1, group_total / business_count)
      .otherwise(group_total)
      .alias('total')
  )

  df1 = df.groupBy('name', 'business').agg(total_per_business)

  # explode_outer yields one row per array element and, unlike explode, keeps
  # rows whose array is empty or null (with `business` set to null).
  df1 = df1.withColumn('business', F.explode_outer('business'))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement