Enable xsave/xrstor, attempt 2.

git-svn-id: svn://kolibrios.org@7276 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Ivan Baravy 2018-05-13 00:16:10 +00:00
parent 06eafb0c92
commit db8eddbd53
6 changed files with 79 additions and 25 deletions

View File

@ -13,26 +13,35 @@ init_fpu:
fninit fninit
bt [cpu_caps+(CAPS_XSAVE/32)*4], CAPS_XSAVE mod 32 bt [cpu_caps+(CAPS_XSAVE/32)*4], CAPS_XSAVE mod 32
jmp .no_xsave ; not ready to be jnc so far jnc .no_xsave
mov ecx, cr4 mov ecx, cr4
or ecx, CR4_OSXSAVE or ecx, CR4_OSXSAVE
mov cr4, ecx mov cr4, ecx
; set CAPS_OSXSAVE in cpu_caps directly instead of calling cpuid again
bts [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
mov eax, 0x0d ; zero xsave header
mov ecx, 64/4
xor eax, eax
mov edi, fpu_data + 512 ; skip legacy region
rep stosd
mov eax, 0x0d ; extended state enumeration main leaf
xor ecx, ecx xor ecx, ecx
cpuid cpuid
mov ebx, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512 and eax, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512
and ebx, eax xor edx, edx
xor ecx, ecx mov [xsave_eax], eax
xgetbv mov [xsave_edx], edx
or eax, ebx
xor ecx, ecx xor ecx, ecx
xsetbv xsetbv
mov eax, 0x0d mov eax, 0x0d
xor ecx, ecx xor ecx, ecx
cpuid cpuid
add ebx, 63
and ebx, NOT 63
mov [xsave_area_size], ebx mov [xsave_area_size], ebx
cmp ebx, fpu_data_size cmp ebx, fpu_data_size
ja $ ja $
@ -40,18 +49,26 @@ init_fpu:
test eax, XCR0_AVX512 test eax, XCR0_AVX512
jz @f jz @f
call init_avx512 call init_avx512
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [fpu_data] xsave [fpu_data]
ret ret
@@: @@:
test eax, XCR0_AVX test eax, XCR0_AVX
jz @f jz @f
call init_avx call init_avx
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [fpu_data] xsave [fpu_data]
ret ret
@@: @@:
test eax, XCR0_SSE test eax, XCR0_SSE
jnz .sse jz $
jmp .fpu_mmx call init_sse
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [fpu_data]
ret
.no_xsave: .no_xsave:
mov [xsave_area_size], 512 ; enough for FPU/MMX and SSE mov [xsave_area_size], 512 ; enough for FPU/MMX and SSE
bt [cpu_caps], CAPS_SSE bt [cpu_caps], CAPS_SSE
@ -75,7 +92,7 @@ init_fpu_mmx:
init_sse: init_sse:
mov ebx, cr4 mov ebx, cr4
mov ecx, cr0 mov ecx, cr0
or ebx, CR4_OSFXSR+CR4_OSXMMEXPT or ebx, CR4_OSFXSR + CR4_OSXMMEXPT
mov cr4, ebx mov cr4, ebx
and ecx, not (CR0_EM + CR0_MP) and ecx, not (CR0_EM + CR0_MP)
@ -186,6 +203,7 @@ avx_save_size:
; param ; param
; eax= avx_save_size() bytes memory area aligned on a 64-byte boundary ; eax= avx_save_size() bytes memory area aligned on a 64-byte boundary
align 4
avx_save: avx_save:
push ecx push ecx
push esi push esi
@ -230,7 +248,12 @@ align 4
save_context: save_context:
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc save_fpu_context jnc save_fpu_context
xsave [eax] push eax edx
mov ecx, eax
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [ecx]
pop edx eax
ret ret
save_fpu_context: save_fpu_context:
bt [cpu_caps], CAPS_SSE bt [cpu_caps], CAPS_SSE
@ -284,6 +307,7 @@ fpu_restore:
pop ecx pop ecx
ret ret
align 4
avx_restore: avx_restore:
push ecx push ecx
push esi push esi
@ -301,7 +325,11 @@ avx_restore:
clts clts
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc .no_xsave jnc .no_xsave
push edx
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xrstor [esi] xrstor [esi]
pop edx
popfd popfd
pop esi pop esi
pop ecx pop ecx
@ -351,12 +379,15 @@ except_7: ;#NM exception handler
mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state] mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state]
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc .no_xsave jnc .no_xsave
xsave [eax] mov ecx, eax
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [ecx]
mov ebx, [CURRENT_TASK] mov ebx, [CURRENT_TASK]
mov [fpu_owner], ebx mov [fpu_owner], ebx
shl ebx, 8 shl ebx, 8
mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state] mov ecx, [ebx+SLOT_BASE+APPDATA.fpu_state]
xrstor [eax] xrstor [ecx]
.exit: .exit:
restore_ring3_context restore_ring3_context
iret iret

View File

@ -132,6 +132,20 @@ do_change_task:
; set gs selector unconditionally ; set gs selector unconditionally
Mov ax, graph_data Mov ax, graph_data
Mov gs, ax Mov gs, ax
; The TS flag is not triggered by AVX* instructions; therefore
; we have to xsave/xrstor SIMD registers on each task change
bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32
jnc .no_xsave
mov ecx, [esi+APPDATA.fpu_state]
mov eax, [xsave_eax]
mov edx, [xsave_edx]
xsave [ecx]
mov ecx, [CURRENT_TASK]
mov [fpu_owner], ecx
mov ecx, [current_slot]
mov ecx, [ecx+APPDATA.fpu_state]
xrstor [ecx]
.no_xsave:
; set CR0.TS ; set CR0.TS
cmp bh, byte[fpu_owner] ;bh == incoming task (new) cmp bh, byte[fpu_owner] ;bh == incoming task (new)
clts ;clear a task switch flag clts ;clear a task switch flag

View File

@ -929,10 +929,8 @@ proc set_app_params stdcall,slot:dword, params:dword, flags:dword
shr ecx, 2 shr ecx, 2
rep movsd rep movsd
cmp ebx, [TASK_COUNT] cmp [TASK_COUNT], ebx
jle .noinc adc dword [TASK_COUNT], 0 ; update number of processes
inc dword [TASK_COUNT] ;update number of processes
.noinc:
shl ebx, 8 shl ebx, 8
lea edx, [ebx+SLOT_BASE+APP_EV_OFFSET] lea edx, [ebx+SLOT_BASE+APP_EV_OFFSET]
mov [SLOT_BASE+APPDATA.fd_ev+ebx], edx mov [SLOT_BASE+APPDATA.fd_ev+ebx], edx

View File

@ -335,8 +335,9 @@ diff16 "end of .data segment",0,$
align 16 align 16
cur_saved_data: cur_saved_data:
rb 4096 rb 4096
align 64
fpu_data: fpu_data:
rb 1024 rb 0xa80 ; bochs avx512
fpu_data_size = $ - fpu_data fpu_data_size = $ - fpu_data
draw_data: draw_data:
rb 32*256 rb 32*256
@ -434,6 +435,8 @@ cpu_info dd ?
cpu_caps rd 4 cpu_caps rd 4
xsave_area_size dd ? xsave_area_size dd ?
xsave_eax dd ?
xsave_edx dd ?
pg_data PG_DATA pg_data PG_DATA
heap_test dd ? heap_test dd ?

View File

@ -434,7 +434,10 @@ high_code:
;lidt [idtreg] ;lidt [idtreg]
call init_kernel_heap call init_kernel_heap
stdcall kernel_alloc, (RING0_STACK_SIZE+512) * 2 call init_fpu
mov eax, [xsave_area_size]
lea eax, [eax*2 + RING0_STACK_SIZE*2]
stdcall kernel_alloc, eax
mov [os_stack_seg], eax mov [os_stack_seg], eax
lea esp, [eax+RING0_STACK_SIZE] lea esp, [eax+RING0_STACK_SIZE]
@ -469,7 +472,6 @@ high_code:
mov [LFBAddress], LFB_BASE mov [LFBAddress], LFB_BASE
mov ecx, bios_fb mov ecx, bios_fb
call set_framebuffer call set_framebuffer
call init_fpu
call init_malloc call init_malloc
stdcall alloc_kernel_space, 0x50000 ; FIXME check size stdcall alloc_kernel_space, 0x50000 ; FIXME check size
@ -590,7 +592,8 @@ high_code:
mov edx, SLOT_BASE+256*1 mov edx, SLOT_BASE+256*1
mov ebx, [os_stack_seg] mov ebx, [os_stack_seg]
add ebx, 0x2000 add ebx, RING0_STACK_SIZE
add ebx, [xsave_area_size]
call setup_os_slot call setup_os_slot
mov dword [edx], 'IDLE' mov dword [edx], 'IDLE'
sub [edx+APPDATA.saved_esp], 4 sub [edx+APPDATA.saved_esp], 4

View File

@ -7,7 +7,8 @@
; Optimized for KolibriOS, By Diamond ; Optimized for KolibriOS, By Diamond
; Assemble with ; Assemble with
; c:fasm firework.asm firework.kex ; c:fasm firework.asm firework.kex
; NOTE: Needs MMX & SSE, optionally AVX ; NOTE: Needs MMX & SSE,
; optionally AVX, AVX2, AVX512
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
use32 use32
org 0x0 org 0x0
@ -23,8 +24,12 @@ use32
include '../../../macros.inc' include '../../../macros.inc'
SCREEN_WIDTH = 320 SCREEN_WIDTH = 320
SCREEN_HEIGHT = 200 SCREEN_HEIGHT = 200
SIMD equ SSE SIMD equ AVX
SIMD_BYTES = 8 SIMD_BYTES = 16
; SSE 8
; AVX 16
; AVX2 32
; AVX512 64
assert SCREEN_WIDTH mod SIMD_BYTES = 0 assert SCREEN_WIDTH mod SIMD_BYTES = 0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Global defines ; Global defines