Advertisement
huutho_96

run.py

Nov 6th, 2018
284
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.07 KB | None | 0 0
  1. from pyspark.context import SparkContext
  2. from pyspark.conf import SparkConf
  3.  
  4. import json
  5. from datetime import datetime
  6. import crawl
  7.  
  8. def char(c):
  9.     print (c)
  10.     return True
  11.  
  12. if __name__ == "__main__":
  13.     conf = SparkConf()
  14.     conf.setMaster("spark://192.168.1.104:7077")
  15.     conf.setAppName("Crawl 3000 Urls")
  16.     conf.set("spark.executor.memory", "6g")
  17.     # conf.set("")
  18.  
  19.     sc = SparkContext(conf=conf)
  20.  
  21.     sc.addFile("/home/louis/Desktop/Spark/crawl.py")
  22.     arr = json.load(open("/home/louis/Downloads/urls.json", "r"))
  23.    
  24.     print (len(arr))
  25.     size = 300
  26.     params = [arr[i * size : (i + 1) * size] for i in range(10)]
  27.     print (len (params))
  28.     urlData = sc.parallelize(params, numSlices=10)
  29.  
  30.     print ("Number of partitions: {}".format(urlData.getNumPartitions()))
  31.     print ("Partitioner: {}".format(urlData.partitioner))
  32.     print ("Partitions structure: {}".format(urlData.glom().collect()))
  33.  
  34.  
  35.  
  36.     print (datetime.now())
  37.     ret = urlData.map(lambda url: crawl.run(url)).collect()
  38.     print (ret)
  39.     print (datetime.now())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement