Advertisement
PeachLemonade

dag_spark

Mar 14th, 2024
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.41 KB | None | 0 0
  # Airflow DAG definition: a single task that hands the PySpark application
  # /lessons/partition.py to spark-submit via the 'yarn_spark' connection.
  # The DAG has no schedule (schedule_interval=None).
  import os
  from datetime import datetime

  from airflow import DAG
  from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

  # Exported into the process environment every time this file is parsed.
  # NOTE(review): presumably these are the paths spark-submit needs on the
  # worker host -- confirm they match the target cluster layout.
  os.environ.update(
      {
          'HADOOP_CONF_DIR': '/etc/hadoop/conf',
          'YARN_CONF_DIR': '/etc/hadoop/conf',
          'JAVA_HOME': '/usr',
          'SPARK_HOME': '/usr/lib/spark',
          'PYTHONPATH': '/usr/local/lib/python3.8',
      }
  )

  # Defaults passed to the DAG below via ``default_args``.
  default_args = dict(
      owner='airflow',
      start_date=datetime(2020, 1, 1),
  )

  dag_spark = DAG(
      dag_id="sparkoperator_demo",
      default_args=default_args,
      schedule_interval=None,
  )

  # Declare the task using SparkSubmitOperator.
  spark_submit_local = SparkSubmitOperator(
      task_id='spark_submit_task',
      dag=dag_spark,
      application='/lessons/partition.py',
      conn_id='yarn_spark',
      # Positional arguments forwarded to the application; their meaning is
      # defined by partition.py (looks like a date followed by two HDFS
      # paths -- TODO confirm against the script).
      application_args=[
          "2020-05-01",
          '/user/master/data/events',
          '/user/kotlyarovb/data/events',
      ],
      conf={"spark.driver.maxResultSize": "20g"},
      executor_cores=2,
      executor_memory='2g',
  )

  # Only one task exists, so there is no dependency chain to declare; this
  # bare reference has no effect.
  spark_submit_local
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement