Enable xsave/xrstor, attempt 2.

git-svn-id: svn://kolibrios.org@7276 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Ivan Baravy 2018-05-13 00:16:10 +00:00
parent 06eafb0c92
commit db8eddbd53
6 changed files with 79 additions and 25 deletions

View File

@ -13,26 +13,35 @@ init_fpu:
fninit
bt [cpu_caps+(CAPS_XSAVE/32)*4], CAPS_XSAVE mod 32
jmp .no_xsave ; not ready to be jnc so far
jnc .no_xsave
mov ecx, cr4
or ecx, CR4_OSXSAVE
mov cr4, ecx
; don't call cpuid again
bts [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
mov eax, 0x0d
; zero xsave header
mov ecx, 64/4
xor eax, eax
mov edi, fpu_data + 512 ; skip legacy region
rep stosd
mov eax, 0x0d ; extended state enumeration main leaf
xor ecx, ecx
cpuid
mov ebx, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512
and ebx, eax
xor ecx, ecx
xgetbv
or eax, ebx
and eax, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512
xor edx, edx
mov [xsave_eax], eax
mov [xsave_edx], edx
xor ecx, ecx
xsetbv
mov eax, 0x0d
xor ecx, ecx
cpuid
add ebx, 63
and ebx, NOT 63
mov [xsave_area_size], ebx
cmp ebx, fpu_data_size
ja $
@ -40,18 +49,26 @@ init_fpu:
test eax, XCR0_AVX512
jz @f
call init_avx512
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [fpu_data]
ret
@@:
test eax, XCR0_AVX
jz @f
call init_avx
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [fpu_data]
ret
@@:
test eax, XCR0_SSE
jnz .sse
jmp .fpu_mmx
jz $
call init_sse
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [fpu_data]
ret
.no_xsave:
mov [xsave_area_size], 512 ; enough for FPU/MMX and SSE
bt [cpu_caps], CAPS_SSE
@ -75,7 +92,7 @@ init_fpu_mmx:
init_sse:
mov ebx, cr4
mov ecx, cr0
or ebx, CR4_OSFXSR+CR4_OSXMMEXPT
or ebx, CR4_OSFXSR + CR4_OSXMMEXPT
mov cr4, ebx
and ecx, not (CR0_EM + CR0_MP)
@ -186,6 +203,7 @@ avx_save_size:
; param
; eax= avx_save_size() bytes memory area aligned on a 64-byte boundary
align 4
avx_save:
push ecx
push esi
@ -230,7 +248,12 @@ align 4
; save_context: store the current FPU/SIMD state into the buffer addressed by eax.
; in:  eax = address of the save area (used as the xsave memory operand)
; out: eax and edx are restored on the xsave path; ecx is overwritten there
;      and is not saved by this routine
save_context:
; CAPS_OSXSAVE is set by init_fpu after CR4_OSXSAVE has been enabled;
; when the bit is clear, branch to the save_fpu_context path instead
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc save_fpu_context
; NOTE(review): this form runs xsave without loading a feature mask into
; edx:eax first. It appears to be the pre-change line of this hunk (the hunk
; grows from 7 to 12 lines), superseded by the masked sequence below — confirm.
xsave [eax]
push eax edx
; xsave takes its requested-feature mask in edx:eax, so the buffer pointer
; has to move to another register
mov ecx, eax
; mask = the value init_fpu stored after computing what it writes with xsetbv
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [ecx]
pop edx eax
ret
save_fpu_context:
bt [cpu_caps], CAPS_SSE
@ -284,6 +307,7 @@ fpu_restore:
pop ecx
ret
align 4
avx_restore:
push ecx
push esi
@ -301,7 +325,11 @@ avx_restore:
clts
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc .no_xsave
push edx
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xrstor [esi]
pop edx
popfd
pop esi
pop ecx
@ -351,12 +379,15 @@ except_7: ;#NM exception handler
mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state]
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc .no_xsave
xsave [eax]
mov ecx, eax
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [ecx]
mov ebx, [CURRENT_TASK]
mov [fpu_owner], ebx
shl ebx, 8
mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state]
xrstor [eax]
mov ecx, [ebx+SLOT_BASE+APPDATA.fpu_state]
xrstor [ecx]
.exit:
restore_ring3_context
iret

View File

@ -132,6 +132,20 @@ do_change_task:
; set gs selector unconditionally
Mov ax, graph_data
Mov gs, ax
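; TS flag is not triggered by AVX* instructions, therefore
; we have to xsave/xrstor SIMD registers on each task switch
; NOTE(review): the Intel SDM lists #NM (CR0.TS=1) for VEX-encoded
; instructions as well — confirm the premise above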
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc .no_xsave
mov ecx, [esi+APPDATA.fpu_state]
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [ecx]
mov ecx, [CURRENT_TASK]
mov [fpu_owner], ecx
mov ecx, [current_slot]
mov ecx, [ecx+APPDATA.fpu_state]
xrstor [ecx]
.no_xsave:
; set CR0.TS
cmp bh, byte[fpu_owner] ;bh == incoming task (new)
clts ;clear a task switch flag

View File

@ -929,10 +929,8 @@ proc set_app_params stdcall,slot:dword, params:dword, flags:dword
shr ecx, 2
rep movsd
cmp ebx, [TASK_COUNT]
jle .noinc
inc dword [TASK_COUNT] ;update number of processes
.noinc:
cmp [TASK_COUNT], ebx
adc dword [TASK_COUNT], 0 ; update number of processes
shl ebx, 8
lea edx, [ebx+SLOT_BASE+APP_EV_OFFSET]
mov [SLOT_BASE+APPDATA.fd_ev+ebx], edx

View File

@ -335,8 +335,9 @@ diff16 "end of .data segment",0,$
align 16
cur_saved_data:
rb 4096
; Kernel-owned FPU/SIMD save area: init_fpu zeroes 64 bytes at fpu_data + 512
; and executes xsave [fpu_data] on it.
; The xsave memory operand must be 64-byte aligned, hence align 64.
align 64
fpu_data:
; NOTE(review): with both rb lines present the area is 1024 + 0xa80 bytes;
; rb 1024 looks like the pre-change size shown next to its replacement — confirm
rb 1024
rb 0xa80 ; 0xa80 = 2688 bytes; presumably the xsave area size Bochs reports with AVX-512 enabled (TODO confirm)
; init_fpu compares the 64-byte-rounded xsave area size against this value
; and spins in place (ja $) when the area would not fit
fpu_data_size = $ - fpu_data
draw_data:
rb 32*256
@ -434,6 +435,8 @@ cpu_info dd ?
; CPU capability bit array, 4 dwords; a capability CAPS_x is tested as
; bit (CAPS_x mod 32) of the dword at cpu_caps+(CAPS_x/32)*4
cpu_caps rd 4
; size in bytes of one FPU/SIMD save area: rounded up to a multiple of 64 by
; init_fpu on the xsave path, or set to 512 on its .no_xsave path
xsave_area_size dd ?
; low and high dwords of the feature mask that init_fpu passes to xsetbv;
; they are reloaded into edx:eax before xsave/xrstor
xsave_eax dd ?
xsave_edx dd ?
pg_data PG_DATA
heap_test dd ?

View File

@ -434,7 +434,10 @@ high_code:
;lidt [idtreg]
call init_kernel_heap
stdcall kernel_alloc, (RING0_STACK_SIZE+512) * 2
call init_fpu
mov eax, [xsave_area_size]
lea eax, [eax*2 + RING0_STACK_SIZE*2]
stdcall kernel_alloc, eax
mov [os_stack_seg], eax
lea esp, [eax+RING0_STACK_SIZE]
@ -469,7 +472,6 @@ high_code:
mov [LFBAddress], LFB_BASE
mov ecx, bios_fb
call set_framebuffer
call init_fpu
call init_malloc
stdcall alloc_kernel_space, 0x50000 ; FIXME check size
@ -590,7 +592,8 @@ high_code:
mov edx, SLOT_BASE+256*1
mov ebx, [os_stack_seg]
add ebx, 0x2000
add ebx, RING0_STACK_SIZE
add ebx, [xsave_area_size]
call setup_os_slot
mov dword [edx], 'IDLE'
sub [edx+APPDATA.saved_esp], 4

View File

@ -7,7 +7,8 @@
; Optimized for KolibriOS, By Diamond
; Assemble with
; c:fasm firework.asm firework.kex
; NOTE: Needs MMX & SSE, optionally AVX
; NOTE: Needs MMX & SSE,
; optionally AVX, AVX2, AVX512
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
use32
org 0x0
@ -23,8 +24,12 @@ use32
include '../../../macros.inc'
SCREEN_WIDTH = 320
SCREEN_HEIGHT = 200
SIMD equ SSE
SIMD_BYTES = 8
SIMD equ AVX
SIMD_BYTES = 16
; SIMD_BYTES value for each SIMD setting:
;   SSE      8
;   AVX     16
;   AVX2    32
;   AVX512  64
assert SCREEN_WIDTH mod SIMD_BYTES = 0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Global defines