;distance - integer distance between two points, computed from the coordinate differences

distance:
;;Computes the integer distance sqrt(B*B + C*C), rounded down.
;;Inputs: B is the difference between X coordinates
;;        C is the difference between Y coordinates
;;        Both are multiplied as unsigned 8-bit values, so pass the magnitudes
;;        of the differences (a two's-complement negative would be read as 128..255).
;;Outputs: A is the distance, or 255 if B*B + C*C does not fit in 16 bits
;;Destroys: C, D, E, H, L, flags (B is read but never written)
; HL = B*B, kept on the stack while the second square is computed.
    ld h,b \ ld e,b \ call H_Times_E \ push hl
; H_Times_E only writes H, L and D, so C is still intact here.
; HL = C*C, DE = B*B
    ld h,c \ ld e,c \ call H_Times_E \ pop de
    add hl,de
; A carry out of the addition means the sum needs 17 bits. sqrt16 only takes a
; 16-bit input and the true root (256 or more) would not fit in A anyway, so
; saturate at 255 instead of taking the root of the truncated sum.
    jr nc,sqrt16
    ld a,255
    ret
sqrt16:
;;Computes the integer square root (rounded down) of an unsigned 16-bit value.
;;The routine is fully unrolled: 8 iterations, each consuming the next two input bits
;;(from the top) and producing one bit of the result.
;;Inputs: HL is the number to find the square root of
;;Outputs: A is the square root
;;Destroys: C, D, E, H, L, flags (B is never written)
;;111 bytes
;;555 t-states worst case
;;NOTE: every JR/JP below uses a $-relative target, so the byte size of the
;;instructions between a jump and its destination must not change.
;zero some registers
   xor a
   ld c,a
   ld d,a

;move the LSB of the input into E for later use, then move the MSB of the input into L and load H with 0.
;H will be a carry register, where the bits in L are rotated in
   ld e,l
   ld l,h
   ld h,c

;Iteration 1 is optimised
; C is treated as the accumulator
; A and C are both 0 here, so the accumulator does not need to be shifted:
; after the two shifts H holds the top two input bits, and 0-H borrows exactly when H is non-zero.
   add hl,hl
   add hl,hl
   sub h
; No borrow: this result bit is 0, skip the three 1-byte instructions that follow.
   jr nc,$+5
   inc c
   cpl
   ld h,a

;Iteration 2
; rotate in 2 more bits from the MSB of the input into H
   add hl,hl
   add hl,hl
; shift the accumulator
   rl c
   ld a,c
   rla
; A is now double the shifted accumulator
   sub h
; doubles as a comparison of the carry register (H) to double the accumulator
   jr nc,$+5
; If the carry is > 2*accumulator, the bit in the accumulator needs to be 1:
   inc c
; We need to perform H-(2C+1), but A=2C-H.
; We could do NEG to get A=H-2C, then DEC A, but NEG = CPL \ INC A
; NEG \ DEC A  =  CPL \ INC A \ DEC A
; So just use CPL, saving 8 t-states, 1 byte
   cpl
   ld h,a

;Iteration 3
; Same steps as iteration 2.
   add hl,hl
   add hl,hl
   rl c
   ld a,c
   rla
   sub h
   jr nc,$+5
   inc c
   cpl
   ld h,a

;Iteration 4
; Same steps as iteration 2. This uses up the last bits of the input's MSB.
   add hl,hl
   add hl,hl
   rl c
   ld a,c
   rla
   sub h
   jr nc,$+5
   inc c
   cpl
   ld h,a

;L is 0, H is the current carry
;E is the lower 8 bits
; Load the next set of bits (LSB of input) into L so that they can be rotated into H
   ld l,e

;Iteration 5
; Same steps as iteration 2, now consuming the input's LSB.
   add hl,hl
   add hl,hl
   rl c
   ld a,c
   rla
   sub h
   jr nc,$+5
   inc c
   cpl
   ld h,a

;Iteration 6
; Same steps as iteration 2.
   add hl,hl
   add hl,hl
   rl c
   ld a,c
   rla
   sub h
   jr nc,$+5
   inc c
   cpl
   ld h,a

;Iteration 7
; Now we need to start worrying about 8 bit overflow.
; In particular, the carry register, H should be ideally 9 bits for this iteration, 10 for the last.
; The accumulator, C, is 8 bits, but we need to compare H to 2*C, and 2*C is up to 9 bits on the last iteration.
; The accumulator is shifted and doubled into A first, so that the carry flag left by the
; second ADD HL,HL (the 9th bit of the carry register) is still intact when it is tested.
;l has 4 more bits to rotate into h

   sla c \ ld a,c \ add a,a
   add hl,hl
   add hl,hl
; No 9th bit: skip ahead to the ordinary 8-bit comparison (the second SUB H).
   jr nc,$+6
; 9th bit set: the carry value is larger than the 8-bit A, so the result bit is 1.
; Subtract, then jump over the comparison straight to INC C.
   sub h \ jp $+6
   sub h
   jr nc,$+5
   inc c
   cpl
   ld h,a

;Iteration 8
; D is 0, from way back at the beginning
; H (the carry) is copied to E so that DE can hold the potentially 10-bit carry value
; L, whose top two bits are the last two bits of the input, is copied to H
; C, the accumulator, is copied to L and to A
; Each ADD HL,HL \ RL E \ RL D shifts one input bit out of H and into DE, and doubles the accumulator in L
; So 2 shifts of HL into DE results in DE holding the carry, HL holding 4*accumulated result!
   ld e,h
   ld h,l
   ld l,c
      ld a,l
   add hl,hl \ rl e \ rl d
   add hl,hl \ rl e \ rl d
; The carry flag is clear going into SBC: the last RL D shifted a 0 out of the top of D.
; The subtraction borrows exactly when DE > HL, i.e. when the last result bit is 1.
   sbc hl,de
;the c flag now has the state of the last bit of the result, HL does not need to be restored.
; RLA shifts that bit into the bottom of A, completing the 8-bit result.
   rla
   ret
H_Times_E:
;;Multiplies two unsigned 8-bit values using an unrolled shift-and-add.
;;Inputs: H and E are the two factors
;;Outputs: HL is the 16-bit product H*E
;;Destroys: D (set to 0), flags. E is preserved; A, B and C are not used.
; DE = E (the multiplicand), L = 0 (the running product lives in the low bits of HL).
    ld l,0 \ ld d,l
; Bit 7 of the multiplier: shift it out of H; if it was set the running product starts at E.
    sla h \ jr nc,$+3 \ ld l,e
; Bits 6..0: ADD HL,HL doubles the running product and, in the same instruction, shifts the
; next multiplier bit out of the top of H into the carry flag. If that bit was set, the
; multiplicand is added in. JR NC,$+3 skips the 1-byte ADD HL,DE.
; (The doubling step must be ADD HL,HL; using ADD HL,DE there never shifts anything.)
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
; Last bit: return directly instead of jumping over the final add.
    add hl,hl \ ret nc \ add hl,de \ ret