Advertisement
Mad_Axell

Untitled

Feb 28th, 2023
46
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.24 KB | None | 0 0
  import os

  # Both Hadoop and YARN client configuration are read from the same directory.
  # Set before the findspark / pyspark lines below, as in the original order.
  os.environ.update({
      'HADOOP_CONF_DIR': '/etc/hadoop/conf',
      'YARN_CONF_DIR': '/etc/hadoop/conf',
  })

  import sys
  from datetime import datetime as dt
  import datetime

  import findspark

  # findspark is initialised before any pyspark import; keep this ordering.
  findspark.init()
  findspark.find()  # return value is not used

  import pyspark
  from pyspark.sql import SparkSession
  from pyspark.context import SparkContext
  import pyspark.sql.functions as F

  from pyspark import SparkContext, SparkConf
  from pyspark.sql import SQLContext
  from pyspark.sql.window import Window


  # Session on the YARN master with a 2-core / 2 GB driver; getOrCreate()
  # hands back an already-running session if one exists.
  spark = (
      SparkSession.builder
      .master("yarn")
      .config("spark.driver.cores", "2")
      .config("spark.driver.memory", "2g")
      .appName("MadAxell_7.5.3.2.1")
      .getOrCreate()
  )
  30.  
  def compare_df(original, student):
      """Return True when every ``user_id`` of *original* also appears in *student*.

      The check is an anti join of ``original`` against ``student`` on
      ``user_id``: it keeps the rows of ``original`` that have no matching
      ``user_id`` in ``student``. If no such row remains the frames are
      considered equal.

      Note that the comparison is one-directional and looks at ``user_id``
      only: extra rows in ``student`` and differences in other columns are
      not detected.

      Args:
          original: reference DataFrame exposing a ``user_id`` column.
          student: DataFrame under test exposing a ``user_id`` column.

      Returns:
          bool: ``True`` if no ``user_id`` from ``original`` is missing in
          ``student``, ``False`` otherwise.
      """
      missing = original.join(student, student.user_id == original.user_id, "anti")
      # count is a method: it has to be called. Comparing the bound method
      # itself with an int raises TypeError on Python 3.
      return missing.count() == 0
  37.  
  # Reference datasets.
  D7_Original = spark.read.parquet("/user/examples/data/interests_d7")
  D28_Original = spark.read.parquet("/user/examples/data/interests_d28")

  # Datasets produced by the student job.
  D7_Student = spark.read.parquet("/user/madaxell/data/analytics//interests_d7")
  D28_Student = spark.read.parquet("/user/madaxell/data/analytics/interests_d28")

  # NOTE(review): only the d7 pair is compared here and the boolean result is
  # not stored or printed; the d28 frames are loaded but not used in the
  # visible code. Confirm whether that is intended.
  compare_df(D7_Original, D7_Student)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement