Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- We have a near duplicate pipeline at Pinterest,
- which produces a mapping from every image to a list of up to k near-duplicate images, such as:
- near_dups = {
- "A": ["B", "I", "K"],
- "B": ["A", "D"],
- "C": ["E"],
- "D": [],
- "E": [],
- "F": [],
- "G": ["K"],
- "I": [],
- "K": [],
- }
- Given a mapping such as this one, form neardup clusters: collections of almost identical images.
- In the example above, we would form the following groups: (A, B, D, I, G, K), (C, E), and (F).**
- """
- class UnionFind:
- """
- Union-Find (Disjoint Set Union) class to group elements into clusters.
- Time Complexity: O(α(n)), where α is the Inverse Ackermann function, very slow-growing.
- Space Complexity: O(n), where n is the number of elements.
- """
- def __init__(self):
- self.parent = {}
- self.rank = {}
- def find(self, item):
- """Find the root representative of the item."""
- if self.parent[item] != item:
- self.parent[item] = self.find(self.parent[item]) # Path compression
- return self.parent[item]
- def union(self, item1, item2):
- """Union the sets that contain item1 and item2."""
- root1 = self.find(item1)
- root2 = self.find(item2)
- if root1 != root2:
- # Union by rank
- if self.rank[root1] > self.rank[root2]:
- self.parent[root2] = root1
- elif self.rank[root1] < self.rank[root2]:
- self.parent[root1] = root2
- else:
- self.parent[root2] = root1
- self.rank[root1] += 1
- def add(self, item):
- """Add an item to the union-find structure."""
- if item not in self.parent:
- self.parent[item] = item
- self.rank[item] = 0
- def form_near_dup_clusters(near_dups):
- """
- Form near-duplicate clusters from the given mapping.
- Time Complexity: O(len(near_dups) * average_size(near_dups.values()))
- Space Complexity: O(len(near_dups))
- """
- uf = UnionFind()
- # Add all images to the Union-Find structure
- for image in near_dups.keys():
- uf.add(image)
- # Union images that are near duplicates
- for image, duplicates in near_dups.items():
- for dup in duplicates:
- uf.union(image, dup)
- # Collect clusters
- clusters = {}
- for image in near_dups.keys():
- root = uf.find(image)
- if root not in clusters:
- clusters[root] = []
- clusters[root].append(image)
- return list(clusters.values())
- near_dups = {
- "A": ["B", "I", "K"],
- "B": ["A", "D"],
- "C": ["E"],
- "D": [],
- "E": [],
- "F": [],
- "G": ["K"],
- "I": [],
- "K": [],
- }
- clusters = form_near_dup_clusters(near_dups)
- print(clusters) # Example output: [['A', 'B', 'D', 'I', 'G', 'K'], ['C', 'E'], ['F']]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement