Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf'
- os.environ['YARN_CONF_DIR'] = '/etc/hadoop/conf'
- import sys
- from datetime import datetime as dt
- import datetime
- import findspark
- findspark.init()
- findspark.find()
- import pyspark
- from pyspark.sql import SparkSession
- from pyspark.context import SparkContext
- import pyspark.sql.functions as F
- from pyspark import SparkContext, SparkConf
- from pyspark.sql import SQLContext
- from pyspark.sql.window import Window
# Create (or reuse) a YARN-backed SparkSession for this comparison job.
# Driver is sized small (2 cores / 2 GB) — the job only runs anti-joins
# between pairs of parquet datasets. NOTE(review): executor resources are
# left at cluster defaults; confirm that is intended.
spark = SparkSession \
    .builder \
    .master("yarn") \
    .config("spark.driver.cores", "2") \
    .config("spark.driver.memory", "2g") \
    .appName("MadAxell_7.5.3.2.1") \
    .getOrCreate()
def compare_df(original, student):
    """Return True when every row of `original` has a user_id match in `student`.

    Performs a left anti-join: the rows of `original` whose `user_id` has no
    match in `student`. If that result is empty, the student dataset covers
    the original one.

    NOTE(review): the check is one-directional — extra rows present only in
    `student` are not detected.

    Args:
        original: reference DataFrame with a `user_id` column.
        student: DataFrame to validate, also keyed by `user_id`.

    Returns:
        bool: True if no rows of `original` are missing from `student`.
    """
    missing = original.join(student, student.user_id == original.user_id, "anti")
    # Bug fix: the original code wrote `result.count > 0`, comparing the
    # bound method object itself to an int — a TypeError in Python 3.
    # `count` must be called.
    return missing.count() == 0
# Reference datasets produced by the course examples.
D7_Original = spark.read.parquet("/user/examples/data/interests_d7")
D28_Original = spark.read.parquet("/user/examples/data/interests_d28")

# Student-produced datasets to validate against the reference.
# Bug fix: the d7 path contained a doubled slash ("analytics//interests_d7").
D7_Student = spark.read.parquet("/user/madaxell/data/analytics/interests_d7")
D28_Student = spark.read.parquet("/user/madaxell/data/analytics/interests_d28")

# Bug fix: the original compared only the d7 pair and discarded the result;
# the d28 datasets were loaded but never used. Compare both windows and
# surface the outcome.
print("d7 match:", compare_df(D7_Original, D7_Student))
print("d28 match:", compare_df(D28_Original, D28_Student))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement