; Implementation of periodic transaction scheduler for USB.
; Bandwidth dedicated to periodic transactions is limited, so
; different pipes should be scheduled as uniformly as possible.

; USB1 scheduler.
; Algorithm is simple:
; when adding a pipe, optimize the following quantity:
;  * for every millisecond, take all bandwidth scheduled to periodic transfers,
;  * calculate maximum over all milliseconds,
;  * select a variant which minimizes that maximum;
; when removing a pipe, do nothing (except for bookkeeping).

; The caller must provide CONTROLLER_NAME define.
macro define_controller_name name
{
_hci_static_ep.SoftwarePart = name # _static_ep.SoftwarePart
_hci_static_ep.NextList = name # _static_ep.NextList
sizeof._hci_static_ep = sizeof. # name # _static_ep
}

; Select a list for a new pipe.
; in: esi -> usb_controller, maxpacket, type, interval can be found in the stack
; in: ecx = 2 * maximal interval = total number of periodic lists + 1
; in: edx -> {u|o}hci_static_ep for the first list
; in: eax -> byte past {u|o}hci_static_ep for the last list in the first group
; out: edx -> usb_static_ep for the selected list or zero if failed
proc usb1_select_interrupt_list
; inherit some variables from usb_open_pipe
virtual at ebp-12
.speed          db      ?
                rb      3
.bandwidth      dd      ?
.target         dd      ?
                dd      ?
                dd      ?
.config_pipe    dd      ?
.endpoint       dd      ?
.maxpacket      dd      ?
.type           dd      ?
.interval       dd      ?
end virtual
        push    ebx edi         ; save used registers to be stdcall
        push    eax             ; save eax for checks in step 3
; 1. Only intervals 2^k ms can be supported.
; The core specification says that the real interval should not be greater
; than the interval given by the endpoint descriptor, but can be less.
; Determine the actual interval as 2^k ms.
        mov     eax, ecx
; 1a. Set [.interval] to 1 if it was zero; leave it as is otherwise
        cmp     [.interval], 1
        adc     [.interval], 0
; 1b. Divide ecx by two while it is strictly greater than [.interval].
@@:
        shr     ecx, 1
        cmp     [.interval], ecx
        jb      @b
; ecx = the actual interval
;
; For example, let ecx = 8, eax = 64.
; The scheduler space is 32 milliseconds,
; we need to schedule something every 8 ms;
; there are 8 variants: schedule at times 0,8,16,24,
; schedule at times 1,9,17,25,..., schedule at times 7,15,23,31.
; Now concentrate: there are three nested loops,
; * the innermost loop calculates the total periodic bandwidth scheduled
;   in the given millisecond,
; * the intermediate loop calculates the maximum over all milliseconds
;   in the given variant, that is the quantity we're trying to minimize,
; * the outermost loop checks all variants.
; 2. Calculate offset between the first list and the first list for the
; selected interval, in bytes; save in the stack for step 4.
        sub     eax, ecx
        sub     eax, ecx
        imul    eax, sizeof._hci_static_ep
        push    eax
        imul    ebx, ecx, sizeof._hci_static_ep
; 3. Select the best variant.
; 3a. The outermost loop.
; Prepare for the loop: set the current optimal bandwidth to maximum
; possible value (so that any variant will pass the first comparison),
; calculate delta for the intermediate loop.
        or      [.bandwidth], -1
.varloop:
; 3b. The intermediate loop.
; Prepare for the loop: set the maximum to be calculated to zero,
; save counter of the outermost loop.
        xor     edi, edi
        push    edx
virtual at esp
.cur_variant    dd      ?       ; step 3b
.result_delta   dd      ?       ; step 2
.group1_limit   dd      ?       ; function prolog
end virtual
.calc_max_bandwidth:
; 3c. The innermost loop. Sum over all lists.
        xor     eax, eax
        push    edx
.calc_bandwidth:
        add     eax, [edx+_hci_static_ep.SoftwarePart+usb_static_ep.Bandwidth]
        mov     edx, [edx+_hci_static_ep.NextList]
        test    edx, edx
        jnz     .calc_bandwidth
        pop     edx
; 3d. The intermediate loop continued: update maximum.
        cmp     eax, edi
        jb      @f
        mov     edi, eax
@@:
; 3e. The intermediate loop continued: advance counter.
        add     edx, ebx
        cmp     edx, [.group1_limit]
        jb      .calc_max_bandwidth
; 3e. The intermediate loop done: restore counter of the outermost loop.
        pop     edx
; 3f. The outermost loop continued: if the current variant is
; better (maybe not strictly) then the previous optimum, update
; the optimal bandwidth and resulting list.
        cmp     edi, [.bandwidth]
        ja      @f
        mov     [.bandwidth], edi
        mov     [.target], edx
@@:
; 3g. The outermost loop continued: advance counter.
        add     edx, sizeof._hci_static_ep
        dec     ecx
        jnz     .varloop
; 4. Calculate bandwidth for the new pipe.
        mov     eax, [.maxpacket]
        mov     cl, [.speed]
        mov     ch, byte [.endpoint]
        and     ch, 80h
        call    calc_usb1_bandwidth
; 5. Get the pointer to the best list.
        pop     edx             ; restore value from step 2
        pop     ecx             ; purge stack var from prolog
        add     edx, [.target]
; 6. Check that bandwidth for the new pipe plus old bandwidth
; still fits to maximum allowed by the core specification, 90% of 12000 bits.
        mov     ecx, eax
        add     ecx, [.bandwidth]
        cmp     ecx, 10800
        ja      .no_bandwidth
; 7. Convert {o|u}hci_static_ep to usb_static_ep, update bandwidth and return.
        add     edx, _hci_static_ep.SoftwarePart
        add     [edx+usb_static_ep.Bandwidth], eax
        pop     edi ebx         ; restore used registers to be stdcall
        ret
.no_bandwidth:
        dbgstr 'Periodic bandwidth limit reached'
        xor     edx, edx
        pop     edi ebx
        ret
endp

; Pipe is removing, update the corresponding lists.
; We do not reorder anything, so just update book-keeping variable
; in the list header.
proc usb1_interrupt_list_unlink
virtual at esp
                dd      ?       ; return address
.maxpacket      dd      ?
.lowspeed       db      ?
.direction      db      ?
                rb      2
end virtual
; calculate bandwidth on the bus
        mov     eax, [.maxpacket]
        mov     ecx, dword [.lowspeed]
        call    calc_usb1_bandwidth
        mov     edx, [ebx+usb_pipe.BaseList]
; subtract pipe bandwidth
        sub     [edx+usb_static_ep.Bandwidth], eax
        ret     8
endp

; Helper procedure for USB1 scheduler: calculate bandwidth on the bus.
; in: low 11 bits of eax = payload size in bytes
; in: cl = 0 - full-speed, nonzero - high-speed
; in: ch = 0 - OUT, nonzero - IN
; out: eax = maximal bandwidth in FS-bits
proc calc_usb1_bandwidth
        and     eax, (1 shl 11) - 1     ; get payload for one transaction
        add     eax, 3  ; add 3 bytes for other fields in data packet, PID+CRC16
        test    cl, cl
        jnz     .low_speed
; Multiply by 8 for bytes -> bits, by 7/6 to accomodate bit stuffing
; and by 401/400 for IN transfers to accomodate timers difference
; 9+107/300 for IN transfers, 9+1/3 for OUT transfers
; For 0 <= eax < 09249355h, floor(eax * 107/300) = floor(eax * 5B4E81B5h / 2^32).
; For 0 <= eax < 80000000h, floor(eax / 3) = floor(eax * 55555556h / 2^32).
        mov     edx, 55555556h
        test    ch, ch
        jz      @f
        mov     edx, 5B4E81B5h
@@:
        lea     ecx, [eax*9]
        mul     edx
; Add 93 extra bits: 39 bits for Token packet (8 for SYNC, 24 for token+address,
; 4 extra bits for possible bit stuffing in token+address, 3 for EOP),
; 18 bits for bus turn-around, 11 bits for SYNC+EOP in Data packet plus 1 bit
; for possible timers difference, 2 bits for inter-packet delay, 20 bits for
; Handshake packet, 2 bits for another inter-packet delay.
        lea     eax, [ecx+edx+93]
        ret
.low_speed:
; Multiply by 8 for bytes -> bits, by 7/6 to accomodate bit stuffing,
; by 8 for LS -> FS and by 406/50 for IN transfers to accomodate timers difference.
; 75+59/75 for IN transfers, 74+2/3 for OUT transfers.
        mov     edx, 0AAAAAABh
        test    ch, ch
        mov     ecx, 74
        jz      @f
        mov     edx, 0C962FC97h
        inc     ecx
@@:
        imul    ecx, eax
        mul     edx
; Add 778 extra bits:
; 16 bits for PRE packet, 4 bits for hub delay, 8*39 bits for Token packet
; 8*18 bits for bus turn-around
; (406/50)*11 bits for SYNC+EOP in Data packet,
; 8*2 bits for inter-packet delay,
; 16 bits for PRE packet, 4 bits for hub delay, 8*20 bits for Handshake packet,
; 8*2 bits for another inter-packet delay.
        lea     eax, [ecx+edx+778]
        ret
endp