command line parser for the system library

git-svn-id: svn://kolibrios.org@6614 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
CleverMouse 2016-10-24 18:44:58 +00:00
parent 31a4eb5247
commit 45f221c5f5
6 changed files with 660 additions and 34 deletions

View File

@ -0,0 +1,229 @@
; Splits command line to argv array.
; Uses standard Windows rules:
; * in normal mode, arguments are separated with spaces and tabs,
; duplicate spaces and tabs are ignored
; (two sequential spaces are the same as one);
; * unescaped quote " in normal mode starts quoted mode,
; it does not end the current argument, it is not included in the argument;
; * spaces and tabs in quoted mode are included in the argument as is;
; * unescaped quote " in quoted mode returns to normal mode,
; it does not end the current argument, it is not included in the argument;
; * quotes can be escaped with backslashes \ in both modes
; (the recommended way), \" means copying " to the argument
; without switching modes;
; * backslashes not before a quote are just regular characters,
; backslashes before a quote should be escaped by another backslash:
; " means unescaped quote
; \" means character "
; \\" means character \ plus unescaped quote
; \\\" means characters \"
; and so on;
; * quotes in quoted mode can also be escaped by doubling them, ""
; (the confusing way); note that in normal mode "" means empty argument.
; For example, the command line
; begin"quoted mode"end\ \"escaped" "quotes" "1\" "" """escaped quotes 2"""
; has 4 arguments:
; 1) beginquoted modeend\
; 2) "escaped quotes 1"
; 3)
; 4) "escaped quotes 2"
; The recommended way to create a command line with the same arguments:
; "beginquoted modeend"\ "\"escaped quotes 1\"" "\"escaped quotes 2\"".
;
; in: esi -> command line
; in: edi -> data for arguments, maybe null
; in: edx -> pointers to arguments, maybe null
; out: ebx = argument count
;
; There are too many branches and labels in the parser,
; so some of the repeated steps are isolated into macros.
macro start_arg
; Increments argument count;
; if arguments are tracked, stores the current address.
; in: edx = zero or pointer to the next slot of the pointer array
; in: edi = current output data pointer (stored as the argument start)
; out: ebx incremented; if edx was nonzero, [edx] = edi and edx advanced by 4
{
local .label
; edx = 0 means the caller only wants the count, nothing is stored.
test edx, edx
jz .label
mov [edx], edi
add edx, 4
.label:
inc ebx
}
; In typical cases decoded arguments and input line have large chunks in common.
; When going through the input string, we do not copy arguments immediately,
; but track size of last chunk that should be copied instead.
; This macro copies the last chunk of data if arguments are tracked.
; If arguments are tracked, ecx is reset to zero;
; otherwise, we do not care about ecx.
; in: esi = pointer past the current character; the chunk is the ecx bytes
;     that end right before the current character
; in: edi = zero or pointer to the next output data
; out: esi unchanged; if edi was nonzero, edi advanced by the chunk size
macro copy_arg_data
{
local .label
test edi, edi
jz .label
; Step back over the current character and over the chunk itself,
; so that esi points to the start of the chunk.
dec esi
sub esi, ecx
rep movsb
; rep movsb left esi at the current character; point past it again.
inc esi
.label:
}
; Process backslash.
; in: al = '\', esi = pointer past that backslash
; out: al = first character that has not been consumed by this macro,
;      esi = pointer past that character;
;      the backslashes (and the escaped quote, if any) that belong
;      to the argument have been copied with copy_arg_data.
macro process_slash
{
; 1. Count number of backslashes.
local .label1, .label2
xor ecx, ecx
.label1:
inc ecx
mov al, byte [esi]
inc esi
cmp al, '\'
jz .label1
; 2. If the next character is not ", backslash is a regular character;
; copy all of them.
cmp al, '"'
jnz .label2
; 3. If the next character is ", then only half of backslashes
; should be copied, other are escaping characters.
; If number of backslashes is odd, include " to copied chunk
; and advance to the next character.
; shr leaves the lowest bit of the count in CF: CF=0 means an even count,
; the quote stays unescaped and remains the current character.
shr ecx, 1
jnc .label2
; Odd count: the quote is escaped. Make it the last byte of the chunk
; (inc ecx) and load the character after it as the current one.
mov al, byte [esi]
inc esi
inc ecx
.label2:
copy_arg_data
}
; Parser procedure.
; in: esi -> zero-terminated command line
; in: edi = zero or pointer to the buffer for argument data
; in: edx = zero or pointer to the array for argument pointers
; out: ebx = argument count
; out: esi -> byte after the terminating zero of the command line
; out: edi, edx (if nonzero on input) advanced past the stored data/pointers
; eax and ecx are destroyed.
; The callers visible in this source run it twice: first with edi = edx = 0
; to get the count and the input length, then with real buffers.
proc parse_cmdline
; Registers:
; ebx = argc = argument count
; ecx = size of last chunk if edi is nonzero, garbage otherwise
; al = current input character = [esi-1]
; esi = pointer to input past the current character
; edi = zero or pointer to the next output data
; edx = zero or pointer to the next output pointer
xor ebx, ebx
xor ecx, ecx
; There are two large blocks of code for normal and quoted modes.
; We start in normal mode.
; 1. Processing in normal mode.
; 1a. Skip initial spaces and tabs.
.skip_spaces:
mov al, byte [esi]
inc esi
cmp al, ' '
jz .skip_spaces
cmp al, 9
jz .skip_spaces
; 1b. If the command line has ended, exit.
test al, al
jz .done
; 1c. Any character in this state starts a new argument.
start_arg
; 1d. Loop over the input string, watching for one of:
; (space), (tab), (terminator), ", \
; All other characters should be copied as is.
; The first character here cannot be (space), (tab) or (terminator),
; but " and \ are possible. For these, skip 1e, because we have nothing
; to copy yet, and go directly where 1f would direct us.
cmp al, '"'
jz .enter_quoted_mode
cmp al, '\'
jz .slash_normal
; .normal_mode: the current character is a regular one, add it to the chunk.
; .enter_normal_mode: fetch the next character without growing the chunk.
; .reenter_normal_mode: classify a character that is already in al.
.normal_mode:
inc ecx
.enter_normal_mode:
mov al, byte [esi]
inc esi
.reenter_normal_mode:
cmp al, ' '
jz .copydata
cmp al, 9
jz .copydata
test al, al
jz .copydata
cmp al, '\'
jz .copydata
cmp al, '"'
jnz .normal_mode
.copydata:
; 1e. Copy the found chunk.
copy_arg_data
; 1f. One of (space), (tab), (terminator), ", \ is found.
; For terminator, end the current argument and exit.
; For \, go to 1h.
; For ", switch to quoted mode.
test al, al
jz .done_termarg
cmp al, '\'
jz .slash_normal
cmp al, '"'
jz .enter_quoted_mode
; 1g. If we are here, (space) or (tab) has occurred in 1d.
; End the current argument and restart processing from 1a.
; The terminating zero is written only when argument data are tracked.
test edi, edi
jz .skip_spaces
mov byte [edi], 0
inc edi
jmp .skip_spaces
; End of input inside an argument: terminate the argument data (if tracked)
; and return. This label is shared by normal and quoted modes.
.done_termarg:
test edi, edi
jz .done
mov byte [edi], 0
inc edi
.done:
ret
.slash_normal:
; 1h. Process chunk of slashes with possible ending " if escaped
; as described in process_slash macro.
; After that, return to loop in 1d; note that the next character can be space.
process_slash
jmp .reenter_normal_mode
; 2. Processing in quoted mode.
; This block is simpler because the current argument never ends in quoted mode,
; except when the input ends.
; 2a. Loop over the input string, watching for one of:
; (terminator), ", \.
; The three entry labels mirror those of the normal mode:
; grow the chunk / fetch the next character / classify al.
.quoted_mode:
inc ecx
.enter_quoted_mode:
mov al, byte [esi]
inc esi
.reenter_quoted_mode:
test al, al
jz .copydata2
cmp al, '\'
jz .copydata2
cmp al, '"'
jnz .quoted_mode
.copydata2:
; 2b. Copy the found chunk.
copy_arg_data
; 2c. One of (terminator), ", \ is found.
; For terminator, end the current argument and exit.
; For \, go to 2d.
test al, al
jz .done_termarg
cmp al, '\'
jz .slash_quoted
; For ", check whether the next character is also ":
; for a single quote, switch to the normal mode 1d,
; for a double quote, skip the first quote
; and start a new chunk from the second one.
cmp byte [esi], '"'
jnz .enter_normal_mode
.double_quote:
; Make the second quote the current character;
; .quoted_mode then counts it as the first byte of the new chunk.
inc esi
jmp .quoted_mode
.slash_quoted:
; 2d. Process chunk of slashes with possible ending " if escaped
; as described in process_slash macro.
; After that, return to loop in 2a.
process_slash
jmp .reenter_quoted_mode
endp
purge start_arg
purge copy_arg_data
purge process_slash

View File

@ -0,0 +1,121 @@
; Just a test of cmdline.inc.
; Checks that parsing of some predefined command lines
; gives some predefined command arguments.
; Nothing to see here.
format PE console 4.0
entry start
include 'win32a.inc'
include '../../struct.inc'
include '../../proc32.inc'
; Test entry point: runs every predefined command line through run_test.
; Each call passes a pointer to the command line and a pointer to the
; expected result (argument count followed by the argument strings).
; run_test does not return on a mismatch, so reaching the end means success;
; the program then returns zero in eax.
start:
stdcall run_test, empty_cmdline, empty_args
stdcall run_test, spaces_tabs_cmdline, empty_args
stdcall run_test, fancy_quotes_cmdline, fancy_quotes_args
stdcall run_test, fancy_slashes_cmdline, fancy_slashes_args
stdcall run_test, unmatched_quote_cmdline, unmatched_quote_args
xor eax, eax
ret
; Checks parse_cmdline against one test vector.
; in: [esp+4] -> zero-terminated command line
; in: [esp+8] -> expected result: dword argument count followed by
;                the expected arguments as consecutive zero-terminated strings
; The arguments are removed from the stack on return (ret 8).
; On any mismatch the procedure loads an error code to eax
; (1 = argc, 2 = edx, 3 = esi, 4 = argv/data), executes int3 and hangs.
; eax, ebx, ecx, edx, esi, edi are not preserved.
proc run_test
; Pass 1: count-only mode, no output buffers.
mov esi, [esp+4]
xor edi, edi
xor edx, edx
call parse_cmdline
mov eax, [esp+8]
cmp ebx, [eax]
jnz .invalid_argc
; edx was zero on input and must stay zero.
test edx, edx
jnz .invalid_edx
; esi must point right after the terminating zero of the command line.
mov eax, [esp+4]
@@:
inc eax
cmp byte [eax-1], 0
jnz @b
cmp esi, eax
jnz .invalid_esi
; Pass 2: the same command line with real output buffers.
mov esi, [esp+4]
mov edi, data_area
mov edx, argv_area
call parse_cmdline
mov eax, [esp+4]
@@:
inc eax
cmp byte [eax-1], 0
jnz @b
cmp esi, eax
jnz .invalid_esi
mov eax, [esp+8]
cmp ebx, [eax]
jnz .invalid_argc
; edx must have advanced by exactly one dword per argument.
lea ecx, [argv_area+ebx*4]
cmp edx, ecx
jnz .invalid_edx
; Compare every stored argument with the expected one:
; esi walks the expected strings, edi walks the produced data,
; edx walks the produced pointer array.
lea esi, [eax+4]
mov edi, data_area
mov edx, argv_area
test ebx, ebx
jz .args_done
.args_check:
; The pointer of the current argument must be the current data position,
; i.e. arguments are stored back to back.
cmp [edx], edi
jnz .invalid_argv
add edx, 4
@@:
cmpsb
jnz .invalid_argv
; [esi-1] is the expected byte just compared; zero ends the argument.
cmp byte [esi-1], 0
jnz @b
dec ebx
jnz .args_check
.args_done:
ret 8
.invalid_argc:
mov eax, 1
int3
jmp $
.invalid_edx:
mov eax, 2
int3
jmp $
.invalid_esi:
mov eax, 3
int3
jmp $
.invalid_argv:
mov eax, 4
int3
jmp $
endp
include 'cmdline.inc'
empty_cmdline db 0
spaces_tabs_cmdline db ' ',9,' ',9,0
empty_args dd 0
fancy_quotes_cmdline db 'begin"quoted mode"end\ \"escaped" "quotes" "1\" "" """escaped quotes 2"""',0
fancy_quotes_args dd 4
db 'beginquoted modeend\',0
db '"escaped quotes 1"',0
db 0
db '"escaped quotes 2"',0
fancy_slashes_cmdline db 'arg\\" "1\\x "arg 2\\x" arg3\" arg4\\\"',9,'"arg 5\"" "arg6\\\"" "arg 7\\"',0
fancy_slashes_args dd 7
db 'arg\ 1\\x',0
db 'arg 2\\x',0
db 'arg3"',0
db 'arg4\"',0
db 'arg 5"',0
db 'arg6\"',0
db 'arg 7\',0
unmatched_quote_cmdline db 'some string"test',0
unmatched_quote_args dd 2
db 'some',0
db 'stringtest',0
align 4
data_area rb 1024
argv_area rd 256

View File

@ -10,8 +10,12 @@ local loc,regcount
regcount = regcount+1
push reg
\}
parmbase@proc equ esp+4+regcount*4
localbase@proc equ esp-localbytes
if loc
sub esp, loc
end if
parmbase@proc equ esp+4+loc+regcount*4
localbase@proc equ esp
fpo_localsize = loc
}
macro fpo_epilogue procname,flag,parmbytes,localbytes,reglist
{

View File

@ -37,15 +37,36 @@ stack_base dd ?
stack_size dd ?
exe_path dd ?
command_line dd ?
environment dd ?
ends
include 'malloc.inc'
include 'peloader.inc'
include 'cmdline.inc'
; System call stub that enters the kernel with the software interrupt 0x40.
; This is the first entry of the syscall_methods table.
; The stub itself does not touch any register.
proc syscall_int40
int 0x40
ret
endp
; System call stub that enters the kernel with the sysenter instruction.
; Before sysenter the stub saves ebp on the stack, sets ebp = esp and pushes
; the address of the label that follows sysenter.
; NOTE(review): presumably the kernel resumes execution at that label and
; the two pops restore edx and ecx saved by the kernel side - confirm
; against the KolibriOS sysenter calling convention.
proc syscall_sysenter
push ebp
mov ebp, esp
push @f
sysenter
@@:
pop edx
pop ecx
ret
endp
; System call stub that enters the kernel with the syscall instruction.
; ecx is saved on the stack around the instruction and restored afterwards;
; the syscall instruction itself overwrites ecx with the return address.
; NOTE(review): presumably the kernel reads the caller's ecx from the stack -
; confirm against the KolibriOS syscall calling convention.
proc syscall_syscall
push ecx
syscall
pop ecx
ret
endp
; Generic system call entry: jumps to the stub whose address is stored
; in FS_SYSCALL_PTR, so the selected stub returns directly to the caller.
proc kercall
jmp FS_SYSCALL_PTR
endp
@ -54,27 +75,52 @@ prologue@proc equ fpo_prologue
epilogue@proc equ fpo_epilogue
proc start stdcall, dll_base, reason, reserved
locals
exe_base dd ?
exe_path_size dd ?
endl
; 1. Do nothing unless called by the kernel for DLL_PROCESS_ATTACH.
cmp [reason], DLL_PROCESS_ATTACH
jnz .nothing
; 2. Validate version of the init struct.
; 2. Initialize process.
; 2a. Validate version of the init struct.
; If not known, say a debug message and die.
mov ebp, [reserved]
mov esi, [dll_base]
cmp [ebp+kernel_init_data.version], 1
jnz .version_mismatch
; 3. Setup common data based on the init struct.
mov eax, [ebp+kernel_init_data.stack_base]
mov FS_STACK_MIN, eax
add eax, [ebp+kernel_init_data.stack_size]
mov FS_STACK_MAX, eax
; 2b. Get the system call code.
; Note: relocations have not been fixed yet,
; so we cannot use absolute addresses, only RVAs.
mov eax, [ebp+kernel_init_data.syscall_method]
cmp eax, 0x10000
jae .syscall_absolute
dec eax
mov edx, rva syscall_int40
cmp eax, num_syscall_methods
jae @f
mov eax, syscall_int40
mov edx, [esi+eax*4+rva syscall_methods]
@@:
lea eax, [edx+esi]
.syscall_absolute:
mov FS_SYSCALL_PTR, eax
; 4. Initialize the process heap.
; 2c. Fixup relocations so that we can use absolute offsets instead of RVAs
; in rest of code.
; Note: this uses syscalls, so this step should be done after
; configuring FS_SYSCALL_PTR at step 2b.
push kolibri_dll
call fixup_pe_relocations
pop ecx
jc .die
; 2d. Allocate process data.
mov eax, 68
mov ebx, 12
mov ecx, 0x1000
call FS_SYSCALL_PTR
mov FS_PROCESS_DATA, eax
; 2e. Initialize process heap.
mov eax, [ebp+kernel_init_data.exe_base]
mov [exe_base], eax
mov edx, [eax+STRIPPED_PE_HEADER.SizeOfHeapReserve]
cmp word [eax], 'MZ'
jnz @f
@ -82,39 +128,106 @@ proc start stdcall, dll_base, reason, reserved
mov edx, [eax+IMAGE_NT_HEADERS.OptionalHeader.SizeOfHeapReserve]
@@:
malloc_init
; ...TBD...
; Call exe entry point.
mov eax, [ebp+kernel_init_data.exe_base]
mov edx, [eax+STRIPPED_PE_HEADER.AddressOfEntryPoint]
cmp word [eax], 'MZ'
; 2f. Copy rest of init struct and free memory.
; Parse command line to argc/argv here and move arguments to the heap
; in order to save memory: init struct and heap use different pages,
; but typically data from init struct are far from the entire page,
; so moving it to heap does not increase actual physical heap size
; and allows to free init struct.
mov eax, [ebp+kernel_init_data.stack_base]
mov FS_STACK_MIN, eax
add eax, [ebp+kernel_init_data.stack_size]
mov FS_STACK_MAX, eax
; Length of the exe path including the terminating zero.
mov eax, [ebp+kernel_init_data.exe_path]
@@:
inc eax
cmp byte [eax-1], 0
jnz @b
sub eax, [ebp+kernel_init_data.exe_path]
mov [exe_path_size], eax
; First pass over the command line: no output buffers,
; only count arguments (ebx) and find the end of the line (esi).
mov esi, [ebp+kernel_init_data.command_line]
xor edx, edx
xor edi, edi
call parse_cmdline
inc ebx ; argv[0] = exe path
; The incoming parameters are no longer needed, reuse their stack slots.
.argc equ dll_base
.argv equ reason
.envp equ reserved
mov [.argc], ebx
; One heap block holds everything:
;   (argc+1) pointers: argv[0..argc-1] and the terminating NULL,
;   then the exe path string,
;   then the decoded arguments (never longer than the command line itself).
sub esi, [ebp+kernel_init_data.command_line]
lea esi, [esi+(ebx+1)*4]
add esi, [exe_path_size]
stdcall malloc, esi
mov [.argv], eax
mov edx, eax
; String data must start after all argc+1 pointer slots.
; Starting it at eax+argc*4 would put the exe path on top of the argv[argc]
; slot, and storing the NULL terminator below would destroy
; the first four bytes of argv[0].
lea edi, [eax+(ebx+1)*4]
mov esi, [ebp+kernel_init_data.exe_path]
mov [edx], edi
add edx, 4
mov ecx, [exe_path_size]
rep movsb
; Second pass: store argument pointers at edx and argument data at edi.
mov esi, [ebp+kernel_init_data.command_line]
call parse_cmdline
and dword [edx], 0 ; argv[argc] = NULL
and [.envp], 0
; Everything needed has been copied, release the init struct.
mov eax, 68
mov ebx, 13
mov ecx, ebp
call FS_SYSCALL_PTR
; 3. Configure modules: main EXE and possible statically linked DLLs.
mov esi, [exe_base]
mov eax, [.argv]
pushd [eax]
call fixup_pe_relocations
pop ecx
jc .die
; 4. Call exe entry point.
mov edx, [esi+STRIPPED_PE_HEADER.AddressOfEntryPoint]
cmp word [esi], 'MZ'
jnz @f
mov ecx, [eax+IMAGE_DOS_HEADER.e_lfanew]
add ecx, eax
mov ecx, [esi+IMAGE_DOS_HEADER.e_lfanew]
add ecx, esi
mov edx, [ecx+IMAGE_NT_HEADERS.OptionalHeader.AddressOfEntryPoint]
@@:
add edx, eax
add edx, esi
add esp, fpo_localsize+4
call edx
; If exe entry point has returned control, die.
mov eax, -1
call FS_SYSCALL_PTR
jmp .die
.version_mismatch:
mov esi, version_mismatch_msg
mov eax, 63
mov ebx, 1
@@:
mov cl, [esi]
test cl, cl
jz @f
int 0x40 ; can't use FS_SYSCALL_PTR here, it has not yet been set
inc esi
jmp @b
@@:
mov eax, -1
int 0x40
; Relocations have not been fixed at this point, so only RVAs can be used;
; esi = dll_base. Select int 0x40 as the system call method for the message.
lea eax, [esi + rva syscall_int40]
mov FS_SYSCALL_PTR, eax
; sys_msg_board_str takes the string pointer in ecx, not in esi.
lea ecx, [esi + rva msg_version_mismatch]
call sys_msg_board_str
.die:
or eax, -1
call FS_SYSCALL_PTR
.nothing:
ret
endp
; Sends a zero-terminated string to the debug board,
; one character per system call (eax = 63, ebx = 1, cl = character).
; in: ecx -> zero-terminated string
; out: ecx -> terminating zero of the string
; eax and ebx are preserved.
proc sys_msg_board_str
        push    eax
        push    ebx
.next_char:
; cl is about to be replaced by the character itself,
; so keep the string pointer on the stack for the duration of the call.
        push    ecx
        mov     cl, [ecx]
        test    cl, cl
        jz      .finish
        mov     eax, 63
        mov     ebx, 1
        call    FS_SYSCALL_PTR
        pop     ecx
        inc     ecx
        jmp     .next_char
.finish:
        pop     ecx
        pop     ebx
        pop     eax
        ret
endp
align 4
syscall_methods dd rva syscall_int40, rva syscall_sysenter, rva syscall_syscall
num_syscall_methods = ($ - syscall_methods) / 4
align 4
data export
export 'kolibri.dll' \
@ -136,7 +249,13 @@ export 'kolibri.dll' \
end data
version_mismatch_msg db 'Version mismatch between kernel and kolibri.dll',13,10,0
kolibri_dll db 'kolibri.dll',0
msg_version_mismatch db 'S : Version mismatch between kernel and kolibri.dll',13,10,0
msg_bad_relocation1 db 'S : Bad relocation type in ',0
msg_newline db 13,10,0
msg_relocated1 db 'S : fixups for ',0
msg_relocated2 db ' applied',13,10,0
if FOOTERS
section '.data' data readable writable

View File

@ -59,6 +59,9 @@ struct IMAGE_OPTIONAL_HEADER32
DataDirectory IMAGE_DATA_DIRECTORY ?
Directories rb sizeof.IMAGE_DATA_DIRECTORY*15
ends
IMAGE_DIRECTORY_ENTRY_EXPORT = 0
IMAGE_DIRECTORY_ENTRY_IMPORT = 1
IMAGE_DIRECTORY_ENTRY_BASERELOC = 5
struct IMAGE_FILE_HEADER
Machine dw ?
@ -69,6 +72,8 @@ struct IMAGE_FILE_HEADER
SizeOfOptionalHeader dw ?
Characteristics dw ?
ends
IMAGE_FILE_RELOCS_STRIPPED = 1
IMAGE_FILE_DLL = 0x2000
struct IMAGE_NT_HEADERS
Signature dd ?
@ -98,6 +103,13 @@ struct IMAGE_IMPORT_DIRECTORY
FirstThunk dd ?
ends
; Header of one block of the base relocation table.
; The header is followed by an array of 16-bit relocation entries.
struct IMAGE_BASE_RELOCATION
; RVA of the page the entries of this block refer to.
VirtualAddress dd ?
; Size of the block in bytes, including this header.
SizeOfBlock dd ?
ends
IMAGE_REL_BASED_ABSOLUTE = 0
IMAGE_REL_BASED_HIGHLOW = 3
struct IMAGE_DOS_HEADER
e_magic dw ?
e_cblp dw ?

View File

@ -0,0 +1,141 @@
; Check whether PE module has been loaded at preferred address.
; If not, relocate the module.
;
; in: esi = PE base address
; in: [esp+4] = module name for debug print;
;     the procedure does not remove it from the stack
; out: CF=1 - fail
; out: CF=0 - the module has been relocated or no relocation is required
; esi, edi, ebp are preserved; eax, ebx, ecx, edx may be destroyed.
;
; Stack note: `uses edi ebp` pushes two registers on entry, so inside
; the body the module name is at [esp+4+2*4] whenever nothing else
; is on the stack.
proc fixup_pe_relocations uses edi ebp
; 1. Fetch some data from PE header or stripped PE header.
; We need:
; * ImageBase - preferred address, compare with esi = actual load address;
;   ebp will keep the delta
; * RVA and size of fixups directory
; * flag IMAGE_FILE_RELOCS_STRIPPED from Characteristics
; If the actual address equals the preferred address, do nothing.
; If fixups directory is present, proceed to 2.
; If there is no fixups directory, there are two options:
; * either the directory has not been created
; * or the module has no fixups (data-only module, for example).
; In the first case, IMAGE_FILE_RELOCS_STRIPPED is set, and this is an error.
; In the second case, IMAGE_FILE_RELOCS_STRIPPED is not set; do nothing.
        mov     ebp, esi
        cmp     word [esi], 'MZ'
        jz      .parse_mz
        sub     ebp, [esi+STRIPPED_PE_HEADER.ImageBase]
        jnz     @f
.nothing:
; CF is clear on every path that comes here:
; after sub with equal operands and after test.
        ret
@@:
        mov     dl, byte [esi+STRIPPED_PE_HEADER.Characteristics]
        lea     eax, [esi+sizeof.STRIPPED_PE_HEADER+SPE_DIRECTORY_BASERELOC*sizeof.IMAGE_DATA_DIRECTORY]
        cmp     [esi+STRIPPED_PE_HEADER.NumberOfRvaAndSizes], SPE_DIRECTORY_BASERELOC
        ja      .common
.norelocs:
; Nothing may be left on the stack when coming here:
; both exits below return through the plain epilogue.
        test    dl, IMAGE_FILE_RELOCS_STRIPPED
        jz      .nothing
        stc
        ret
.parse_mz:
        mov     eax, [esi+IMAGE_DOS_HEADER.e_lfanew]
        add     eax, esi
; eax -> IMAGE_NT_HEADERS; all fields below must be read through eax.
        sub     ebp, [eax+IMAGE_NT_HEADERS.OptionalHeader.ImageBase]
        jz      .nothing
        mov     dl, byte [eax+IMAGE_NT_HEADERS.FileHeader.Characteristics]
        cmp     [eax+IMAGE_NT_HEADERS.OptionalHeader.NumberOfDirectories], IMAGE_DIRECTORY_ENTRY_BASERELOC
        jbe     .norelocs
        add     eax, IMAGE_NT_HEADERS.OptionalHeader.DataDirectory+IMAGE_DIRECTORY_ENTRY_BASERELOC*sizeof.IMAGE_DATA_DIRECTORY
.common:
; eax -> IMAGE_DATA_DIRECTORY of the fixups directory.
; Check the size before anything is pushed,
; so that .norelocs is reached with a clean stack.
        mov     edi, [eax+IMAGE_DATA_DIRECTORY.VirtualAddress]
        mov     eax, [eax+IMAGE_DATA_DIRECTORY.isize]
        test    eax, eax
        jz      .norelocs
        push    eax
virtual at esp
.sizeleft       dd      ?
end virtual
        add     edi, esi
; 2. We need to relocate and we have the relocation table.
; esi = PE base address
; edi = pointer to current data of relocation table
; 2a. Relocation table is organized into blocks describing every page.
; End of table is defined from table size fetched from the header.
; Loop 2b-2g over all blocks until no more data is left.
.pageloop:
; 2b. Load the header of the current block: address and size.
; Advance total size.
; Size in the block includes size of the header, subtract it.
; If there is no data in this block, go to 2g.
        mov     edx, [edi+IMAGE_BASE_RELOCATION.VirtualAddress]
        mov     ecx, [edi+IMAGE_BASE_RELOCATION.SizeOfBlock]
        sub     [.sizeleft], ecx
        add     edi, sizeof.IMAGE_BASE_RELOCATION
        sub     ecx, sizeof.IMAGE_BASE_RELOCATION
        jbe     .pagedone
; 2c. We are going to modify data, so mprotect the current page to be writable.
; Save the old protection, we will restore it after the block is processed.
; Ignore any error.
PROT_READ = 1
PROT_WRITE = 2
PROT_EXEC = 4
        push    esi ecx
        mov     eax, 68
        mov     ebx, 30
        mov     ecx, PROT_READ+PROT_WRITE
        add     edx, esi
        mov     esi, 0x1000
        call    FS_SYSCALL_PTR
        pop     ecx
        push    eax
; Stack from here to the end of the block:
; [esp] = old protection, [esp+4] = saved esi, [esp+8] = .sizeleft.
; edx = address of the page being patched.
; 2d. Block data is an array of word values. Repeat 2e for every of those.
.relocloop:
        sub     ecx, 2
        jb      .relocdone
; 2e. Every value consists of a 4-bit type and 12-bit offset in the page.
; x86 uses two types: 0 = no data (used for padding),
; 3 = 32-bit absolute address that must be adjusted by the load delta.
        movzx   eax, word [edi]
        add     edi, 2
        mov     ebx, eax
        and     ebx, 0xFFF
        shr     eax, 12
        jz      .relocloop
        cmp     al, IMAGE_REL_BASED_HIGHLOW
        jnz     .badreloc
        add     [edx+ebx], ebp
        jmp     .relocloop
.relocdone:
; 2f. Restore memory protection changed in 2c.
        pop     ecx
        cmp     ecx, -1
        jz      @f
        mov     eax, 68
        mov     ebx, 30
        mov     esi, 0x1000
        call    FS_SYSCALL_PTR
@@:
        pop     esi
.pagedone:
; 2g. Continue with the next block while the table has data left.
        cmp     [.sizeleft], 0
        jnz     .pageloop
        pop     eax     ; pop .sizeleft
; 3. For performance reasons, relocation should be avoided
; by choosing an appropriate preferred address.
; If we have actually relocated something, yell to the debug board,
; so the programmer can notice that.
        mov     ecx, msg_relocated1
        call    sys_msg_board_str
; Only the two registers saved by `uses` are on the stack now.
        mov     ecx, [esp+4+2*4]
        call    sys_msg_board_str
        mov     ecx, msg_relocated2
        call    sys_msg_board_str
        clc
        ret
.badreloc:
; Unknown relocation type inside a block.
; Unwind everything pushed since .common before returning:
; restore the page protection exactly as in 2f,
; then drop the saved esi and .sizeleft.
        pop     ecx
        cmp     ecx, -1
        jz      @f
        mov     eax, 68
        mov     ebx, 30
        mov     esi, 0x1000
        call    FS_SYSCALL_PTR
@@:
        pop     esi
        pop     eax     ; pop .sizeleft
        mov     ecx, msg_bad_relocation1
        call    sys_msg_board_str
; Only the two registers saved by `uses` are on the stack now.
        mov     ecx, [esp+4+2*4]
        call    sys_msg_board_str
        mov     ecx, msg_newline
        call    sys_msg_board_str
        stc
        ret
endp