Advertisement
fkudinov

Proof-of-concept for a more space-efficient, faster-looping dictionary

Apr 13th, 2024
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.29 KB | Source Code | 0 0
  1. import array
  2. import collections
  3. import itertools
  4.  
  5. # Placeholder constants
  6. FREE = -1
  7. DUMMY = -2
  8.  
  9. class Dict(collections.MutableMapping):
  10.     'Space efficient dictionary with fast iteration and cheap resizes.'
  11.  
  12.     def _lookup(self, key, hashvalue):
  13.         'Same lookup logic as currently used in real dicts'
  14.         assert self.filled < len(self.indices)   # At least one open slot
  15.         freeslot = None
  16.         PERTURB_SHIFT = 5
  17.         if hashvalue < 0:
  18.             perturb = -hashvalue
  19.         else:
  20.             perturb = hashvalue
  21.         n = len(self.indices)
  22.         i = perturb & (n-1)
  23.         while True:
  24.             index = self.indices[i]
  25.             if index == FREE:
  26.                 return (FREE, i) if freeslot is None else (DUMMY, freeslot)
  27.             elif index == DUMMY:
  28.                 freeslot = i
  29.             elif (self.keylist[index] is key or
  30.                   self.hashlist[index] == hashvalue
  31.                   and self.keylist[index] == key):
  32.                     return (index, i)
  33.             i = 5 * i + perturb + 1
  34.             i = i & (n-1)
  35.             perturb >>= PERTURB_SHIFT
  36.  
  37.     @staticmethod
  38.     def _make_index(n):
  39.         'New sequence of indices using the smallest possible datatype'
  40.         if n <= 2**7: return array.array('b', [FREE]) * n       # signed char
  41.         if n <= 2**15: return array.array('h', [FREE]) * n      # signed short
  42.         if n <= 2**31: return array.array('l', [FREE]) * n      # signed long
  43.         return [FREE] * n                                       # python integers
  44.  
  45.     def _resize(self, n):
  46.         '''Reindex the existing hash/key/value entries.
  47.           Entries do not get moved, they only get new indices.
  48.           No calls are made to hash() or __eq__().
  49.  
  50.        '''
  51.         n = 2 ** n.bit_length()                     # round-up to power-of-two
  52.         self.indices = self._make_index(n)
  53.         PERTURB_SHIFT = 5
  54.         for index, hashvalue in enumerate(self.hashlist):
  55.             if hashvalue < 0:
  56.                 perturb = -hashvalue
  57.             else:
  58.                 perturb = hashvalue
  59.             i = hashvalue & (n-1)
  60.             while True:
  61.                 if self.indices[i] == FREE:
  62.                     break
  63.                 i = 5 * i + perturb + 1
  64.                 i = i & (n-1)
  65.                 perturb >>= PERTURB_SHIFT
  66.             self.indices[i] = index
  67.         self.filled = self.used
  68.  
  69.     def clear(self):
  70.         self.indices = self._make_index(8)
  71.         self.hashlist = []
  72.         self.keylist = []
  73.         self.valuelist = []
  74.         self.used = 0
  75.         self.filled = 0                                         # used + dummies
  76.  
  77.     def __init__(self, *args, **kwds):
  78.         try:
  79.             self.keylist
  80.         except AttributeError:
  81.             self.clear()
  82.         self.update(*args, **kwds)
  83.  
  84.     def __getitem__(self, key):
  85.         hashvalue = hash(key)
  86.         index, i = self._lookup(key, hashvalue)
  87.         if index < 0:
  88.             raise KeyError(key)
  89.         return self.valuelist[index]
  90.  
  91.     def __setitem__(self, key, value):
  92.         hashvalue = hash(key)
  93.         index, i = self._lookup(key, hashvalue)
  94.         if index < 0:
  95.             self.indices[i] = self.used
  96.             self.hashlist.append(hashvalue)
  97.             self.keylist.append(key)
  98.             self.valuelist.append(value)
  99.             self.used += 1
  100.             if index == FREE:
  101.                 self.filled += 1
  102.                 if self.filled * 3 > len(self.indices) * 2:
  103.                     self._resize(4 * len(self))
  104.         else:
  105.             self.valuelist[index] = value
  106.  
  107.     def __delitem__(self, key):
  108.         hashvalue = hash(key)
  109.         index, i = self._lookup(key, hashvalue)
  110.         if index < 0:
  111.             raise KeyError(key)
  112.         self.indices[i] = DUMMY
  113.         self.used -= 1
  114.         # If needed, swap with the lastmost entry to avoid leaving a "hole"
  115.         if index != self.used:
  116.             lasthash = self.hashlist[-1]
  117.             lastkey = self.keylist[-1]
  118.             lastvalue = self.valuelist[-1]
  119.             lastindex, j = self._lookup(lastkey, lasthash)
  120.             assert lastindex >= 0 and i != j
  121.             self.indices[j] = index
  122.             self.hashlist[index] = lasthash
  123.             self.keylist[index] = lastkey
  124.             self.valuelist[index] = lastvalue
  125.         # Remove the lastmost entry
  126.         self.hashlist.pop()
  127.         self.keylist.pop()
  128.         self.valuelist.pop()
  129.  
  130.     def __len__(self):
  131.         return self.used
  132.  
  133.     def __iter__(self):
  134.         return iter(self.keylist)
  135.  
  136.     def iterkeys(self):
  137.         return iter(self.keylist)
  138.  
  139.     def keys(self):
  140.         return list(self.keylist)
  141.  
  142.     def itervalues(self):
  143.         return iter(self.valuelist)
  144.  
  145.     def values(self):
  146.         return list(self.valuelist)
  147.  
  148.     def iteritems(self):
  149.         return itertools.izip(self.keylist, self.valuelist)
  150.  
  151.     def items(self):
  152.         return zip(self.keylist, self.valuelist)
  153.  
  154.     def __contains__(self, key):
  155.         index, i = self._lookup(key, hash(key))
  156.         return index >= 0
  157.  
  158.     def get(self, key, default=None):
  159.         'D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None.'
  160.         index, i = self._lookup(key, hash(key))
  161.         return self.valuelist[index] if index >= 0 else default
  162.  
  163.     def popitem(self):
  164.         ''' D.popitem() -> (k, v), remove and return some (key, value) pair as a
  165.            2-tuple; but raise KeyError if D is empty.
  166.        '''
  167.         try:
  168.             key = self.keylist[-1]
  169.             value = self.valuelist[-1]
  170.         except IndexError:
  171.             raise KeyError( 'popitem(): dictionary is empty')
  172.         del self[key]
  173.         return key, value
  174.  
  175.     def __repr__(self):
  176.         return 'Dict(%r)' % self.items()
  177.  
  178.     def show_structure(self):
  179.         'Diagnostic method.  Not part of the API.'
  180.         print '=' * 50
  181.         print self
  182.         print 'Indices:', self.indices
  183.         for i, row in enumerate(zip(self.hashlist, self.keylist, self.valuelist)):
  184.             print i, row
  185.         print '-' * 50
  186.  
  187.  
  188. if __name__ == '__main__':
  189.     import sys
  190.     def f():
  191.         if len(sys.argv) > 1:
  192.             d = {}
  193.         else:
  194.             d = Dict()
  195.         class A(object):
  196.             pass
  197.         for i in range(10000000):
  198.             d[i] = A()
  199.     f()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement