Advertisement
PeachLemonade

partition

Mar 14th, 2024
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.75 KB | None | 0 0
  1. import sys
  2.  
  3. from pyspark import SparkContext, SparkConf
  4. from pyspark.sql import SQLContext
  5. import pyspark.sql.functions as F
  6.  
  7. def main():
  8.         date = sys.argv[1]
  9.         base_input_path = sys.argv[2]
  10.         base_output_path = sys.argv[3]
  11.  
  12.         conf = SparkConf().setAppName(f"EventsPartitioningJob-{date}")
  13.         sc = SparkContext(conf=conf)
  14.         sql = SQLContext(sc)
  15.  
  16.  # Напишите директорию чтения в общем виде
  17.         events = sql.read.json(f"{base_input_path}/date={date}")
  18.  
  19. # Напишите директорию записи
  20.         events.write.partitionBy('event.event_type')\
  21.         .format('parquet').save(f'{base_output_path}/date={date}')
  22.  
  23.  
  24. if __name__ == "__main__":
  25.         main()
  26.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement