diff --git a/kernel/trunk/core/fpu.inc b/kernel/trunk/core/fpu.inc index a3e6970884..096bc58cee 100644 --- a/kernel/trunk/core/fpu.inc +++ b/kernel/trunk/core/fpu.inc @@ -13,26 +13,35 @@ init_fpu: fninit bt [cpu_caps+(CAPS_XSAVE/32)*4], CAPS_XSAVE mod 32 - jmp .no_xsave ; not ready to be jnc so far + jnc .no_xsave mov ecx, cr4 or ecx, CR4_OSXSAVE mov cr4, ecx + ; don't call cpuid again + bts [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 - mov eax, 0x0d + ; zero xsave header + mov ecx, 64/4 + xor eax, eax + mov edi, fpu_data + 512 ; skip legacy region + rep stosd + + mov eax, 0x0d ; extended state enumeration main leaf xor ecx, ecx cpuid - mov ebx, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512 - and ebx, eax - xor ecx, ecx - xgetbv - or eax, ebx + and eax, XCR0_FPU_MMX + XCR0_SSE + XCR0_AVX + XCR0_AVX512 + xor edx, edx + mov [xsave_eax], eax + mov [xsave_edx], edx xor ecx, ecx xsetbv mov eax, 0x0d xor ecx, ecx cpuid + add ebx, 63 + and ebx, NOT 63 mov [xsave_area_size], ebx cmp ebx, fpu_data_size ja $ @@ -40,18 +49,26 @@ init_fpu: test eax, XCR0_AVX512 jz @f call init_avx512 + mov eax, [xsave_eax] + mov edx, [xsave_edx] xsave [fpu_data] ret @@: test eax, XCR0_AVX jz @f call init_avx + mov eax, [xsave_eax] + mov edx, [xsave_edx] xsave [fpu_data] ret @@: test eax, XCR0_SSE - jnz .sse - jmp .fpu_mmx + jz $ + call init_sse + mov eax, [xsave_eax] + mov edx, [xsave_edx] + xsave [fpu_data] + ret .no_xsave: mov [xsave_area_size], 512 ; enough for FPU/MMX and SSE bt [cpu_caps], CAPS_SSE @@ -75,7 +92,7 @@ init_fpu_mmx: init_sse: mov ebx, cr4 mov ecx, cr0 - or ebx, CR4_OSFXSR+CR4_OSXMMEXPT + or ebx, CR4_OSFXSR + CR4_OSXMMEXPT mov cr4, ebx and ecx, not (CR0_EM + CR0_MP) @@ -186,6 +203,7 @@ avx_save_size: ; param ; eax= avx_save_size() bytes memory area aligned on a 64-byte boundary +align 4 avx_save: push ecx push esi @@ -230,7 +248,12 @@ align 4 save_context: bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 jnc save_fpu_context - xsave [eax] + push eax edx + mov ecx, eax + mov eax, [xsave_eax] + mov edx, [xsave_edx] + xsave [ecx] + pop edx eax ret save_fpu_context: bt [cpu_caps], CAPS_SSE @@ -284,6 +307,7 @@ fpu_restore: pop ecx ret +align 4 avx_restore: push ecx push esi @@ -301,7 +325,11 @@ avx_restore: clts bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 jnc .no_xsave + push edx + mov eax, [xsave_eax] + mov edx, [xsave_edx] xrstor [esi] + pop edx popfd pop esi pop ecx @@ -351,12 +379,15 @@ except_7: ;#NM exception handler mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state] bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 jnc .no_xsave - xsave [eax] + mov ecx, eax + mov eax, [xsave_eax] + mov edx, [xsave_edx] + xsave [ecx] mov ebx, [CURRENT_TASK] mov [fpu_owner], ebx shl ebx, 8 - mov eax, [ebx+SLOT_BASE+APPDATA.fpu_state] - xrstor [eax] + mov ecx, [ebx+SLOT_BASE+APPDATA.fpu_state] + xrstor [ecx] .exit: restore_ring3_context iret diff --git a/kernel/trunk/core/sched.inc b/kernel/trunk/core/sched.inc index 7227f0ce05..955c5e42cd 100644 --- a/kernel/trunk/core/sched.inc +++ b/kernel/trunk/core/sched.inc @@ -132,6 +132,20 @@ do_change_task: ; set gs selector unconditionally Mov ax, graph_data Mov gs, ax + ; TS flag is not triggered by AVX* instructions, therefore + ; we have to xsave/xrstor SIMD registers each task change + bt [cpu_caps+(CAPS_OSXSAVE/32)*4], CAPS_OSXSAVE mod 32 + jnc .no_xsave + mov ecx, [esi+APPDATA.fpu_state] + mov eax, [xsave_eax] + mov edx, [xsave_edx] + xsave [ecx] + mov ecx, [CURRENT_TASK] + mov [fpu_owner], ecx + mov ecx, [current_slot] + mov ecx, [ecx+APPDATA.fpu_state] + xrstor [ecx] + .no_xsave: ; set CR0.TS cmp bh, byte[fpu_owner] ;bh == incoming task (new) clts ;clear a task switch flag diff --git a/kernel/trunk/core/taskman.inc b/kernel/trunk/core/taskman.inc index 73e1d6b85f..21ae9705f9 100644 --- a/kernel/trunk/core/taskman.inc +++ b/kernel/trunk/core/taskman.inc @@ -929,10 +929,8 @@ proc set_app_params stdcall,slot:dword, params:dword, flags:dword shr ecx, 2 rep movsd - cmp ebx, [TASK_COUNT] - jle .noinc - inc dword [TASK_COUNT] ;update number of processes -.noinc: + cmp [TASK_COUNT], ebx + adc dword [TASK_COUNT], 0 ; update number of processes shl ebx, 8 lea edx, [ebx+SLOT_BASE+APP_EV_OFFSET] mov [SLOT_BASE+APPDATA.fd_ev+ebx], edx diff --git a/kernel/trunk/data32.inc b/kernel/trunk/data32.inc index 2ac5ae4e9b..2fa23dd1d5 100644 --- a/kernel/trunk/data32.inc +++ b/kernel/trunk/data32.inc @@ -335,8 +335,9 @@ diff16 "end of .data segment",0,$ align 16 cur_saved_data: rb 4096 +align 64 fpu_data: - rb 1024 + rb 0xa80 ; bochs avx512 fpu_data_size = $ - fpu_data draw_data: rb 32*256 @@ -434,6 +435,8 @@ cpu_info dd ? cpu_caps rd 4 xsave_area_size dd ? +xsave_eax dd ? +xsave_edx dd ? pg_data PG_DATA heap_test dd ? diff --git a/kernel/trunk/kernel.asm b/kernel/trunk/kernel.asm index 3aaa66b68d..bac536640d 100644 --- a/kernel/trunk/kernel.asm +++ b/kernel/trunk/kernel.asm @@ -434,7 +434,10 @@ high_code: ;lidt [idtreg] call init_kernel_heap - stdcall kernel_alloc, (RING0_STACK_SIZE+512) * 2 + call init_fpu + mov eax, [xsave_area_size] + lea eax, [eax*2 + RING0_STACK_SIZE*2] + stdcall kernel_alloc, eax mov [os_stack_seg], eax lea esp, [eax+RING0_STACK_SIZE] @@ -469,7 +472,6 @@ high_code: mov [LFBAddress], LFB_BASE mov ecx, bios_fb call set_framebuffer - call init_fpu call init_malloc stdcall alloc_kernel_space, 0x50000 ; FIXME check size @@ -590,7 +592,8 @@ high_code: mov edx, SLOT_BASE+256*1 mov ebx, [os_stack_seg] - add ebx, 0x2000 + add ebx, RING0_STACK_SIZE + add ebx, [xsave_area_size] call setup_os_slot mov dword [edx], 'IDLE' sub [edx+APPDATA.saved_esp], 4 diff --git a/programs/demos/firework/trunk/firework.asm b/programs/demos/firework/trunk/firework.asm index 35244be70a..57ba19c024 100644 --- a/programs/demos/firework/trunk/firework.asm +++ b/programs/demos/firework/trunk/firework.asm @@ -7,7 +7,8 @@ ; Optimized for KolibriOS, By Diamond ; Assemble with ; c:fasm firework.asm firework.kex -; NOTE: Needs MMX & SSE, optionally AVX +; NOTE: Needs MMX & SSE, +; optionally AVX, AVX2, AVX512 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; use32 org 0x0 @@ -23,8 +24,12 @@ use32 include '../../../macros.inc' SCREEN_WIDTH = 320 SCREEN_HEIGHT = 200 -SIMD equ SSE -SIMD_BYTES = 8 +SIMD equ AVX +SIMD_BYTES = 16 +; SSE 8 +; AVX 16 +; AVX2 32 +; AVX512 64 assert SCREEN_WIDTH mod SIMD_BYTES = 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Global defines