Advertisement
Skytrias

Bitfield Trie and CTrie

Oct 28th, 2022
1,455
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Go 6.96 KB | Source Code | 0 0
  1. package art
  2.  
  3. import "core:os"
  4. import "core:mem"
  5. import "core:math/bits"
  6. import "core:fmt"
  7.  
  8. ALPHABET_SIZE :: 26
  9.  
  10. // treats nil trie as non existant node
  11. Trie :: struct {
  12.     array: [ALPHABET_SIZE]^Trie,
  13. }
  14.  
  15. /*
  16.     CTrie (104B)
  17.         uses smallest index possible to reduce size constraints
  18.         data allocated in array -> easy to free/clear/delete
  19.         should be the same speed of the default Trie
  20. */
  21.  
  22. // treating 0 as nil as 0 is the root
  23. CTrie :: struct {
  24.     array: [ALPHABET_SIZE]u32,
  25. }
  26.  
  27. ctrie_data: [dynamic]CTrie
  28.  
  29. ctrie_init :: proc(cap: int) {
  30.     ctrie_data = make([dynamic]CTrie, 0, cap)
  31.     append(&ctrie_data, CTrie {})
  32. }
  33.  
  34. ctrie_destroy :: proc() {
  35.     delete(ctrie_data)
  36. }
  37.  
  38. // push a node to the ctrie array and return its index
  39. ctrie_push :: proc() -> u32 {
  40.     append(&ctrie_data, CTrie {})
  41.     return u32(len(ctrie_data) - 1)
  42. }
  43.  
  44. // simple helper
  45. ctrie_get :: #force_inline proc(index: u32) -> ^CTrie {
  46.     return &ctrie_data[index]
  47. }
  48.  
  49. // get the root of the ctrie tree
  50. ctrie_root :: #force_inline proc() -> ^CTrie {
  51.     return &ctrie_data[0]
  52. }
  53.  
  54. // insert a key
  55. ctrie_insert :: proc(key: string) {
  56.     t := ctrie_root()
  57.  
  58.     for b in key {
  59.         idx := b - 'a'
  60.  
  61.         if t.array[idx] == 0 {
  62.             t.array[idx] = ctrie_push()
  63.         }
  64.  
  65.         t = ctrie_get(t.array[idx])
  66.     }  
  67. }
  68.  
  69. // print the ctrie tree
  70. ctrie_print :: proc() {
  71.     depth: int
  72.     ctrie_print_recursive(ctrie_root(), &depth, 0)
  73. }
  74.  
  75. // print the ctrie tree recursively by nodes
  76. ctrie_print_recursive :: proc(t: ^CTrie, depth: ^int, b: u8) {
  77.     if depth^ != 0 {
  78.         for i in 0..<depth^ - 1 {
  79.             fmt.eprint('\t')
  80.         }
  81.         codepoint := rune(b + 'a')
  82.         fmt.eprint(codepoint, '\n')
  83.     }
  84.  
  85.     for i in 0..<ALPHABET_SIZE {
  86.         if t.array[i] != 0 {
  87.             depth^ += 1
  88.             ctrie_print_recursive(ctrie_get(t.array[i]), depth, u8(i))
  89.             depth^ -= 1
  90.         }
  91.     }
  92. }
  93.  
  94. // DEBUG print the ctrie tree size
  95. ctrie_print_size :: proc() {
  96.     size := len(ctrie_data) * size_of(CTrie)
  97.     fmt.eprintf("SIZE in %dB %dKB %dMB for CTrie\n", size, size / 1024, size / 1024 / 1024)
  98. }
  99.  
  100. /*
  101.     Compressed trie in flat memory
  102.         uses the least amount of memory possible while still being accessible
  103.         bitfield with 32 bits describing 26 wanted bits (a-z)
  104. */
  105.  
  106. // dynamic byte array containing flexible compressed trie data
  107. comp: []byte
  108. comp_index: int
  109.  
  110. // init the comp to some cap
  111. comp_init :: proc(cap := mem.Megabyte) {
  112.     comp = make([]byte, cap)
  113. }
  114.  
  115. // destroy the comp data
  116. comp_destroy :: proc() {
  117.     delete(comp)   
  118. }
  119.  
  120. // push u32 alphabet bits data
  121. comp_push_bits :: proc() -> (res: ^u32) {
  122.     old := comp_index
  123.     res = cast(^u32) &comp[old]
  124.     comp_index += size_of(u32)
  125.     return
  126. }
  127.  
  128. // push trie children nodes as indexes
  129. comp_push_data :: proc(count: int) -> (res: []u32) {
  130.     old := comp_index
  131.     res = mem.slice_ptr(cast(^u32) &comp[old], count)
  132.     comp_index += size_of(u32) * count
  133.     return
  134. }
  135.  
  136. // push a ctrie bitfield and its dynamic data
  137. comp_push_ctrie :: proc(t: ^CTrie, previous_data: ^u32) {
  138.     if previous_data != nil {
  139.         previous_data^ = u32(comp_index)
  140.     }
  141.  
  142.     alphabet_bits := comp_push_bits()
  143.  
  144.     for i in 0..<ALPHABET_SIZE {
  145.         if t.array[i] != 0 {
  146.             alphabet_bits^ = bits.bitfield_insert_u32(alphabet_bits^, 1, uint(i), 1)
  147.         }
  148.     }
  149.  
  150.     if alphabet_bits^ != 0 {
  151.         ones := bits.count_ones(alphabet_bits^)
  152.         data := comp_push_data(int(ones))
  153.         index: int
  154.  
  155.         // TODO optimize to store actual used characters to iterate?
  156.         for i in 0..<ALPHABET_SIZE {
  157.             if t.array[i] != 0 {
  158.                 comp_push_ctrie(ctrie_get(t.array[i]), &data[index])
  159.                 index += 1
  160.             }
  161.         }
  162.     }
  163. }
  164.  
  165. // DEBUG print the trie tree
  166. comp_print :: proc() {
  167.     assert(comp_index != 0)
  168.     depth: int
  169.     comp_print_recursive(cast(^u32) &comp[0], &depth)
  170. }
  171.  
  172. // DEBUG print the trie tree recursively
  173. comp_print_recursive :: proc(alphabet_bits: ^u32, depth: ^int) {
  174.     b := alphabet_bits^
  175.  
  176.     if b == 0 {
  177.         return
  178.     }
  179.  
  180.     index: int
  181.     mask: u32
  182.     for i in 0..<u32(ALPHABET_SIZE) {
  183.         mask = (1 << i)
  184.        
  185.         // search for matching bits
  186.         if b & mask == mask {
  187.             depth^ += 1
  188.             for i in 0..<depth^ - 1 {
  189.                 fmt.eprint('\t')
  190.             }
  191.             codepoint := rune(i + 'a')
  192.             fmt.eprint(codepoint, '\n')
  193.  
  194.             next := mem.ptr_offset(alphabet_bits, index + 1)
  195.             comp_print_recursive(cast(^u32) &comp[next^], depth)
  196.             depth^ -= 1
  197.             index += 1
  198.         }
  199.     }  
  200. }
  201.  
  202. // comp_bits_contains_byte :: proc(bits: u32, b: u8) -> bool {
  203. //  mask := u32(1 << u32(b - 'a'))
  204. //  return bits & mask == mask
  205. // }
  206.  
  207. // true when the idx exists in the bitfield
  208. comp_bits_contains_index :: proc(bits: u32, idx: u32) -> bool {
  209.     mask := u32(1 << idx)
  210.     return bits & mask == mask
  211. }
  212.  
  213. // print logical size of the compressed trie
  214. comp_print_size :: proc() {
  215.     size := comp_index
  216.     fmt.eprintf("SIZE in %dB %dKB %dMB for compressed\n", size, size / 1024, size / 1024 / 1024)
  217. }
  218.  
  219. // converts alphabetic byte index into the remapped bitfield space
  220. comp_bits_index_to_counted_one :: proc(bits: u32, idx: u32) -> (res: u32, ok: bool) {
  221.     mask: u32
  222.     for i in 0..<u32(ALPHABET_SIZE) {
  223.         mask = u32(1 << i)
  224.        
  225.         if bits & mask == mask {
  226.             if idx == i {
  227.                 ok = true
  228.                 return
  229.             }
  230.  
  231.             res += 1
  232.         }
  233.     }
  234.  
  235.     return
  236. }
  237.  
  238. // prints used characters in the bitset
  239. comp_print_characters :: proc(bits: u32) {
  240.     if bits == 0 {
  241.         fmt.eprintln("EMPTY BITS")
  242.         return
  243.     }
  244.  
  245.     for i in 0..<ALPHABET_SIZE {
  246.         if comp_bits_contains_index(bits, u32(i)) {
  247.             fmt.eprint(rune(i + 'a'), ' ')
  248.         }
  249.     }
  250.  
  251.     fmt.eprintln()
  252. }
  253.  
  254. // TODO escape on utf8 byte?
  255. // lowercase valid alpha
  256. ascii_check_lower :: proc(b: u8) -> u8 {
  257.     if 'A' <= b && b <= 'Z' {
  258.         return b + 32
  259.     } else {
  260.         return b
  261.     }
  262. }
  263.  
  264. // wether the byte is a letter
  265. ascii_is_letter :: #force_inline proc(b: u8) -> bool {
  266.     return 'a' <= b && b <= 'z'
  267. }
  268.  
  269. // // searches the compressed trie for the wanted word
  270. // comp_search :: proc(key: string) -> bool {
  271. //  alphabet_bits := cast(^u32) &comp[0]
  272.  
  273. //  for i in 0..<len(key) {
  274. //      b := ascii_check_lower(key[i])
  275.  
  276. //      if ascii_is_letter(b) {
  277. //          idx := b - 'a'
  278. //          // comp_print_characters(alphabet_bits^)
  279.            
  280. //          if res, ok := comp_bits_index_to_counted_one(alphabet_bits^, u32(idx)); ok {
  281. //              next := mem.ptr_offset(alphabet_bits, res + 1)
  282. //              alphabet_bits = cast(^u32) &comp[next^]
  283. //          } else {
  284. //              return false
  285. //          }
  286. //      } else {
  287. //          return false
  288. //      }
  289. //  }
  290.  
  291. //  return true
  292. // }
  293.  
  294. // less pointer offseting sent by Jeroen :)
  295. comp_search :: proc(key: string) -> bool {
  296.     alphabet := mem.slice_data_cast([]u32, comp)
  297.     next := u32(0)
  298.  
  299.     for i in 0..<len(key) {
  300.         b := ascii_check_lower(key[i])
  301.  
  302.         if ascii_is_letter(b) {
  303.             letter := b - 'a'
  304.  
  305.             if res, ok := comp_bits_index_to_counted_one(alphabet[next], u32(letter)); ok {
  306.                 next = alphabet[next + res + 1] / size_of(u32)
  307.             } else {
  308.                 return false
  309.             }
  310.         } else {
  311.             return false
  312.         }
  313.     }
  314.    
  315.     return true
  316. }
  317.  
  318. comp_write_to_file :: proc(path: string) -> bool {
  319.     return os.write_entire_file(path, comp[:])
  320. }
  321.  
  322. comp_read_from_file :: proc(path: string) {
  323.     content, ok := os.read_entire_file(path)
  324.  
  325.     if ok {
  326.         comp = content
  327.         comp_index = len(content)
  328.     }
  329. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement