Advertisement
hoangreal

Cluster near duplicate Union Find

Oct 21st, 2024 (edited)
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.84 KB | None | 0 0
  1. """
  2. We have a near duplicate pipeline at Pinterest,
  3. which produces a mapping from every image to a list of up to k near-duplicate images, such as:
  4. near_dups = {
  5. "A": ["B", "I", "K"],
  6. "B": ["A", "D"],
  7. "C": ["E"],
  8. "D": [],
  9. "E": [],
  10. "F": [],
  11. "G": ["K"],
  12. "I": [],
  13. "K": [],
  14. }
  15.  
  16. Given a mapping such as this one, form neardup clusters: collections of almost identical images.
  17. In the example above, we would form the following groups: (A, B, D, I, G, K), (C, E), and (F).**
  18. """
  19.  
  20. class UnionFind:
  21.     """
  22.    Union-Find (Disjoint Set Union) class to group elements into clusters.
  23.    
  24.    Time Complexity: O(α(n)), where α is the Inverse Ackermann function, very slow-growing.
  25.    Space Complexity: O(n), where n is the number of elements.
  26.    """
  27.  
  28.     def __init__(self):
  29.         self.parent = {}
  30.         self.rank = {}
  31.  
  32.     def find(self, item):
  33.         """Find the root representative of the item."""
  34.         if self.parent[item] != item:
  35.             self.parent[item] = self.find(self.parent[item])  # Path compression
  36.         return self.parent[item]
  37.  
  38.     def union(self, item1, item2):
  39.         """Union the sets that contain item1 and item2."""
  40.         root1 = self.find(item1)
  41.         root2 = self.find(item2)
  42.         if root1 != root2:
  43.             # Union by rank
  44.             if self.rank[root1] > self.rank[root2]:
  45.                 self.parent[root2] = root1
  46.             elif self.rank[root1] < self.rank[root2]:
  47.                 self.parent[root1] = root2
  48.             else:
  49.                 self.parent[root2] = root1
  50.                 self.rank[root1] += 1
  51.  
  52.     def add(self, item):
  53.         """Add an item to the union-find structure."""
  54.         if item not in self.parent:
  55.             self.parent[item] = item
  56.             self.rank[item] = 0
  57.            
  58. def form_near_dup_clusters(near_dups):
  59.     """
  60.    Form near-duplicate clusters from the given mapping.
  61.  
  62.    Time Complexity: O(len(near_dups) * average_size(near_dups.values()))
  63.    Space Complexity: O(len(near_dups))
  64.    """
  65.     uf = UnionFind()
  66.    
  67.     # Add all images to the Union-Find structure
  68.     for image in near_dups.keys():
  69.         uf.add(image)
  70.  
  71.     # Union images that are near duplicates
  72.     for image, duplicates in near_dups.items():
  73.         for dup in duplicates:
  74.             uf.union(image, dup)
  75.  
  76.     # Collect clusters
  77.     clusters = {}
  78.     for image in near_dups.keys():
  79.         root = uf.find(image)
  80.         if root not in clusters:
  81.             clusters[root] = []
  82.         clusters[root].append(image)
  83.  
  84.     return list(clusters.values())
  85.  
  86. near_dups = {
  87.     "A": ["B", "I", "K"],
  88.     "B": ["A", "D"],
  89.     "C": ["E"],
  90.     "D": [],
  91.     "E": [],
  92.     "F": [],
  93.     "G": ["K"],
  94.     "I": [],
  95.     "K": [],
  96. }
  97.  
  98. clusters = form_near_dup_clusters(near_dups)
  99. print(clusters)  # Example output: [['A', 'B', 'D', 'I', 'G', 'K'], ['C', 'E'], ['F']]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement