Advertisement
VladNitu

VladFastPersist

Jun 4th, 2023
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.20 KB | None | 0 0
  def save(self):
      """Persist this document and register its URL under each fingerprint.

      First writes ``{'_id': self.url, 'fingerprints': self.fingerprints}``
      to ``db.news_collection``. Then ``db.hashes_collection`` is brought up
      to date in batches rather than one round-trip per fingerprint:

      * fingerprints that already have a document get ``self.url`` added to
        their ``urls`` array with ``$addToSet``;
      * all remaining fingerprints are inserted as new
        ``{"_id": fingerprint, "urls": [self.url]}`` documents.

      Returns ``None``. Exceptions raised by the ``db`` calls are not caught
      here and propagate to the caller.
      """
      db.news_collection.insert_one({
          '_id': self.url,
          'fingerprints': self.fingerprints,
      })

      # De-duplicate: a fingerprint may occur more than once in one document.
      unique_fps = set(self.fingerprints)
      if not unique_fps:
          return  # nothing to index in hashes_collection

      # Look up only the fingerprints this document has, instead of pulling
      # every _id stored in the collection over to the client.
      lookup = {"_id": {"$in": list(unique_fps)}}
      existing_fps = {
          row["_id"] for row in db.hashes_collection.find(lookup, {"_id": 1})
      }

      # Already-known fingerprints: append this URL (no-op if already listed).
      if existing_fps:
          db.hashes_collection.update_many(
              {"_id": {"$in": list(existing_fps)}},
              {"$addToSet": {"urls": self.url}},
          )

      # Unknown fingerprints: create their documents in a single batch.
      # NOTE(review): the lookup and this insert are separate operations, so
      # a concurrent writer could create one of these _ids in between --
      # confirm whether save() can run concurrently.
      to_insert = [
          {"_id": fp, "urls": [self.url]} for fp in unique_fps - existing_fps
      ]
      if to_insert:
          db.hashes_collection.insert_many(to_insert)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement