from pyspark.context import SparkContext
from pyspark.conf import SparkConf
import json
from datetime import datetime
import crawl


def char(c):
    # Debug helper: print a value and return True (currently unused).
    print(c)
    return True


if __name__ == "__main__":
    # Configure the connection to the Spark master and the executor memory.
    conf = SparkConf()
    conf.setMaster("spark://192.168.1.104:7077")
    conf.setAppName("Crawl 3000 Urls")
    conf.set("spark.executor.memory", "6g")
    sc = SparkContext(conf=conf)

    # Ship the crawl module to every executor so it can be imported remotely.
    sc.addFile("/home/louis/Desktop/Spark/crawl.py")

    # Load the full URL list and split it into 10 batches of 300 URLs each.
    with open("/home/louis/Downloads/urls.json", "r") as f:
        arr = json.load(f)
    print(len(arr))

    size = 300
    params = [arr[i * size : (i + 1) * size] for i in range(10)]
    print(len(params))

    # Distribute the batches across 10 partitions, one batch per partition.
    urlData = sc.parallelize(params, numSlices=10)
    print("Number of partitions: {}".format(urlData.getNumPartitions()))
    print("Partitioner: {}".format(urlData.partitioner))
    print("Partitions structure: {}".format(urlData.glom().collect()))

    # Crawl each batch in parallel and collect the results on the driver,
    # timing the whole job.
    print(datetime.now())
    ret = urlData.map(lambda urls: crawl.run(urls)).collect()
    print(ret)
    print(datetime.now())
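The script depends on a local crawl.py that is shipped to the executors with sc.addFile and called as crawl.run on each batch of URLs, but that module is not included in the paste. Below is a minimal sketch of what such a module could look like, assuming crawl.run receives a list of URL strings, fetches each one with the standard-library urllib, and returns (url, status) pairs; the real module's interface is not shown here, so treat this only as an illustration.

# crawl.py -- hypothetical sketch; the actual module used by the paste is not shown.
from urllib.request import urlopen


def run(urls):
    # Fetch every URL in the batch and record its HTTP status code,
    # or the error message if the request fails.
    results = []
    for url in urls:
        try:
            with urlopen(url, timeout=10) as resp:
                results.append((url, resp.getcode()))
        except Exception as e:
            results.append((url, str(e)))
    return results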