lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c3584f30-2316-1545-e83d-3be20d38d9b1@amazon.com>
Date:   Fri, 21 Jun 2019 15:40:16 +0200
From:   Alexander Graf <graf@...zon.com>
To:     Sam Caccavale <samcacc@...zon.de>
CC:     <samcaccavale@...il.com>, <nmanthey@...zon.de>,
        <wipawel@...zon.de>, <dwmw@...zon.co.uk>, <mpohlack@...zon.de>,
        <karahmed@...zon.de>, <andrew.cooper3@...rix.com>,
        <JBeulich@...e.com>, <pbonzini@...hat.com>, <rkrcmar@...hat.com>,
        <tglx@...utronix.de>, <mingo@...hat.com>, <bp@...en8.de>,
        <hpa@...or.com>, <paullangton4@...il.com>,
        <anirudhkaushik@...gle.com>, <x86@...nel.org>,
        <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>
Subject: Re: [v2, 2/4] Emulate simple x86 instructions in userspace


On 12.06.19 17:35, Sam Caccavale wrote:
> Added the minimal subset of code to run afl-harness with a binary file
> as input.  These bytes are used to populate the vcpu structure and then
> as an instruction stream for the emulator.  It does not attempt to handle
> exceptions an only supports very simple ops.
>
> CR: https://code.amazon.com/reviews/CR-8552453
> ---
>   tools/fuzz/x86ie/afl-harness.c    |   8 +-
>   tools/fuzz/x86ie/emulator_ops.c   | 342 +++++++++++++++++++++++++++++-
>   tools/fuzz/x86ie/emulator_ops.h   |   7 +-
>   tools/fuzz/x86ie/scripts/afl-many |  28 +++
>   4 files changed, 379 insertions(+), 6 deletions(-)
>   create mode 100755 tools/fuzz/x86ie/scripts/afl-many
>
> diff --git a/tools/fuzz/x86ie/afl-harness.c b/tools/fuzz/x86ie/afl-harness.c
> index b3b09d7f15f2..a3eeab0cfc90 100644
> --- a/tools/fuzz/x86ie/afl-harness.c
> +++ b/tools/fuzz/x86ie/afl-harness.c
> @@ -50,7 +50,7 @@ int main(int argc, char **argv)
>
>   		switch (c) {
>   		case OPT_INPUT_SIZE:
> -			printf("Min: %u\n", MIN_INPUT_SIZE);
> +			printf("Min: %lu\n", MIN_INPUT_SIZE);


Why this change here?


>   			printf("Max: %u\n", MAX_INPUT_SIZE);
>   			exit(0);
>   			break;
> @@ -77,6 +77,10 @@ int main(int argc, char **argv)
>
>   	state = create_emulator();
>   	state->data = malloc(INSTRUCTION_BYTES);
> +	if (!state->data) {
> +		printf("Malloc failed.\n");
> +		return -1;
> +	}


Why here and not in 1/4?


>
>   #ifdef __AFL_HAVE_MANUAL_CONTROL
>   	__AFL_INIT();
> @@ -109,8 +113,6 @@ int main(int argc, char **argv)
>   			fseek(fp, 0, SEEK_SET);
>   		}
>   #endif
> -		reset_emulator(state);
> -


Same question.


>   		size = fread(state, 1, MIN_INPUT_SIZE, fp);
>   		if (size != MIN_INPUT_SIZE) {
>   			printf("Input does not populate state\n");
> diff --git a/tools/fuzz/x86ie/emulator_ops.c b/tools/fuzz/x86ie/emulator_ops.c
> index 55ae4e8fbd96..370ac970ab9d 100644
> --- a/tools/fuzz/x86ie/emulator_ops.c
> +++ b/tools/fuzz/x86ie/emulator_ops.c
> @@ -29,17 +29,349 @@
>   #include <asm/user_64.h>
>   #include <asm/kvm.h>
>
> +ulong emul_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned int reg)
> +{
> +	assert(reg < number_of_gprs(ctxt));
> +	return get_state(ctxt)->vcpu.regs[reg];
> +}
> +
> +void emul_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned int reg, ulong val)
> +{
> +	assert(reg < number_of_gprs(ctxt));
> +	get_state(ctxt)->vcpu.regs[reg] = val;
> +}
> +
> +/* All read ops: */
> +
> +static int _get_bytes(void *dst, struct state *state, unsigned int bytes,
> +		      char *callee)
> +{
> +	if (state->bytes_consumed + bytes > state->data_available) {
> +		fprintf(stderr, "Tried retrieving %d bytes\n", bytes);
> +		fprintf(stderr, "%s failed to retrieve bytes for %s.\n",
> +			__func__, callee);
> +		return X86EMUL_UNHANDLEABLE;
> +	}
> +
> +	memcpy(dst, &state->data[state->bytes_consumed], bytes);
> +	return X86EMUL_CONTINUE;
> +}
> +
> +/*
> + * The only function that any x86_emulate_ops should call to retrieve bytes.
> + * See comments in struct state definition for more information.
> + */
> +static int get_bytes_and_increment(void *dst, struct state *state,
> +				   unsigned int bytes, char *callee)
> +{
> +	int rc = _get_bytes(dst, state, bytes, callee);
> +
> +	if (rc == X86EMUL_CONTINUE)
> +		state->bytes_consumed += bytes;
> +
> +	return rc;
> +}
> +
> +/*
> + * This is called by x86_decode_insn to fetch bytes.
> + */
> +int emul_fetch(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val,
> +	       unsigned int bytes, struct x86_exception *fault)
> +{
> +	if (get_bytes_and_increment(val, get_state(ctxt), bytes,
> +		"emul_fetch") != X86EMUL_CONTINUE) {
> +		return X86EMUL_UNHANDLEABLE;
> +	}
> +
> +	return X86EMUL_CONTINUE;
> +}
> +
> +int emul_read_emulated(struct x86_emulate_ctxt *ctxt,
> +		       unsigned long addr, void *val, unsigned int bytes,
> +		       struct x86_exception *fault)
> +{
> +	if (get_bytes_and_increment(val, get_state(ctxt), bytes,
> +		"emul_read_emulated") != X86EMUL_CONTINUE) {
> +		return X86EMUL_UNHANDLEABLE;
> +	}
> +
> +	return X86EMUL_CONTINUE;
> +}
> +
> +int emul_write_emulated(struct x86_emulate_ctxt *ctxt,
> +		   unsigned long addr, const void *val,
> +		   unsigned int bytes,
> +		   struct x86_exception *fault)
> +{
> +	return X86EMUL_CONTINUE;
> +}
> +
> +ulong emul_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
> +{
> +	return get_state(ctxt)->vcpu.cr[cr];
> +}
> +
> +int emul_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
> +{
> +	get_state(ctxt)->vcpu.cr[cr] = val;
> +	return 0;
> +}
> +
> +unsigned int emul_get_hflags(struct x86_emulate_ctxt *ctxt)
> +{
> +	return get_state(ctxt)->vcpu.rflags;
> +}
> +
> +void emul_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned int hflags)
> +{
> +	get_state(ctxt)->vcpu.rflags = hflags;
> +}
> +
> +/* End of emulator ops */
> +
> +#define SET(h) .h = emul_##h


This is a pretty dangerous macro name, as it has great potential for 
conflict with a random header. I would suggest EMUL_OP() instead.


> +const struct x86_emulate_ops all_emulator_ops = {
> +	SET(read_gpr),
> +	SET(write_gpr),
> +	SET(fetch),
> +	SET(read_emulated),
> +	SET(write_emulated),
> +	SET(get_cr),
> +	SET(set_cr),
> +	SET(get_hflags),
> +	SET(set_hflags),
> +};
> +#undef SET
> +
> +enum {
> +	HOOK_read_gpr,
> +	HOOK_write_gpr,
> +	HOOK_fetch,
> +	HOOK_read_emulated,
> +	HOOK_write_emulated,
> +	HOOK_get_cr,
> +	HOOK_set_cr,
> +	HOOK_get_hflags,
> +	HOOK_set_hflags
> +};
> +
> +/*
> + * Disable an x86_emulate_op if options << HOOK_op is set.
> + *
> + * Expects options to be defined.
> + */
> +#define MAYBE_DISABLE_HOOK(h)						\
> +	do {								\
> +		if (options & (1 << HOOK_##h)) {			\
> +			vcpu->ctxt.ops.h = NULL;			\
> +			debug("Disabling hook " #h "\n");		\
> +		}							\
> +	} while (0)
> +
> +/*
> + * FROM XEN:
> + *
> + * Constrain input to architecturally-possible states where
> + * the emulator relies on these
> + *
> + * In general we want the emulator to be as absolutely robust as
> + * possible; which means that we want to minimize the number of things
> + * it assumes about the input state.  Tesing this means minimizing and
> + * removing as much of the input constraints as possible.
> + *
> + * So we only add constraints that (in general) have been proven to
> + * cause crashes in the emulator.
> + *
> + * For future reference: other constraints which might be necessary at
> + * some point:
> + *
> + * - EFER.LMA => !EFLAGS.NT
> + * - In VM86 mode, force segment...
> + *  - ...access rights to 0xf3
> + *  - ...limits to 0xffff
> + *  - ...bases to below 1Mb, 16-byte aligned
> + *  - ...selectors to (base >> 4)
> + */
> +static void sanitize_input(struct state *s)
> +{
> +	/* Some hooks can't be disabled. */
> +	// options &= ~((1<<HOOK_read)|(1<<HOOK_insn_fetch));
> +
> +	/* Zero 'private' entries */
> +	// regs->error_code = 0;
> +	// regs->entry_vector = 0;
> +
> +	// CANONICALIZE_MAYBE(rip);
> +	// CANONICALIZE_MAYBE(rsp);
> +	// CANONICALIZE_MAYBE(rbp);


Now that you removed the macro, please also remove all commented out lines.


> +
> +	/*
> +	 * CR0.PG can't be set if CR0.PE isn't set.  Set is more interesting, so
> +	 * set PE if PG is set.
> +	 */
> +	if (s->vcpu.cr[0] & X86_CR0_PG)
> +		s->vcpu.cr[0] |= X86_CR0_PE;
> +
> +	/* EFLAGS.VM not available in long mode */
> +	if (s->ctxt.mode == X86EMUL_MODE_PROT64)
> +		s->vcpu.rflags &= ~X86_EFLAGS_VM;
> +
> +	/* EFLAGS.VM implies 16-bit mode */
> +	if (s->vcpu.rflags & X86_EFLAGS_VM) {
> +		s->vcpu.segments[x86_seg_cs].db = 0;
> +		s->vcpu.segments[x86_seg_ss].db = 0;
> +	}
> +}
> +
>   void initialize_emulator(struct state *state)
>   {
> +	reset_emulator(state);
> +	state->ctxt.ops = &all_emulator_ops;
> +
> +	/* See also sanitize_input, some hooks can't be disabled. */
> +	// MAYBE_DISABLE_HOOK(read_gpr);
> +
> +	sanitize_input(state);
> +}
> +
> +static const char *const x86emul_mode_string[] = {
> +	[X86EMUL_MODE_REAL] = "X86EMUL_MODE_REAL",
> +	[X86EMUL_MODE_VM86] = "X86EMUL_MODE_VM86",
> +	[X86EMUL_MODE_PROT16] = "X86EMUL_MODE_PROT16",
> +	[X86EMUL_MODE_PROT32] = "X86EMUL_MODE_PROT32",
> +	[X86EMUL_MODE_PROT64] = "X86EMUL_MODE_PROT64",
> +};
> +
> +static void dump_state_after(const char *desc, struct state *state)
> +{
> +	debug(" -- State after %s --\n", desc);
> +	debug("mode: %s\n", x86emul_mode_string[state->ctxt.mode]);
> +	debug(" cr0: %lx\n", state->vcpu.cr[0]);
> +	debug(" cr3: %lx\n", state->vcpu.cr[3]);
> +	debug(" cr4: %lx\n", state->vcpu.cr[4]);
> +
> +	debug("Decode _eip: %lu\n", state->ctxt._eip);
> +	debug("Emulate eip: %lu\n", state->ctxt.eip);
> +
> +	debug("\n");
>   }
>
> +static void init_emulate_ctxt(struct state *state)
> +{
> +	struct x86_emulate_ctxt *ctxt = &state->ctxt;
> +
> +	ctxt->eflags = ctxt->ops->get_hflags(ctxt);
> +	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
> +
> +	ctxt->mode = X86EMUL_MODE_PROT64; // TODO: eventually vary this
> +
> +	init_decode_cache(ctxt);
> +}
> +
> +
>   int step_emulator(struct state *state)
>   {
> -	return 0;
> +	int rc;
> +	unsigned long prev_eip = state->ctxt._eip;
> +	unsigned long emul_offset;
> +	int decode_size = state->data_available - state->bytes_consumed;
> +
> +	/*
> +	 * This is annoing to have to explain the reasoning behind:
> +	 * ._eip is incremented by x86_decode_insn.  It will be > .eip between
> +	 * decoding and emulating.
> +	 * .eip is incremented by x86_emulate_insn.  It may be incremented
> +	 * beyond the length of instruction emulated E.G. if a jump is taken.
> +	 *
> +	 * If these are out of sync before emulating, then something is
> +	 * horribly wrong with the harness.
> +	 */
> +	assert(state->ctxt.eip == state->ctxt._eip);
> +
> +	if (decode_size <= 0) {
> +		debug("Out of instructions\n");
> +		return X86EMUL_UNHANDLEABLE;
> +	}
> +
> +	init_emulate_ctxt(state);
> +	state->ctxt.interruptibility = 0;
> +	state->ctxt.have_exception = false;
> +	state->ctxt.exception.vector = -1;
> +	state->ctxt.perm_ok = false;
> +	state->ctxt.ud = 0; // (emulation_type(0) & EMULTYPE_TRAP_UD);
> +
> +	/*
> +	 * When decoding with NULL, 0, the emulator will use the emul_fetch
> +	 * op which handles incrementing the state->data variables.  However
> +	 * x86_decode_insn will always try to grab 15 bytes which may be more
> +	 * than are left in the stream.
> +	 *
> +	 * Calling x86_decode_insn from a buffer with a length causes it to
> +	 * directly memcpy those bytes into the ctxt structure and does not
> +	 * increment state->bytes_consumed.  In that case, we manually
> +	 * update state->bytes_consumed by the difference in the decoding
> +	 * _eip.  This is gross but I cannot figure out a better way to do
> +	 * this.
> +	 *
> +	 * We must limit the size to avoid going over the buffer and since
> +	 * calling x86_decode_insn with a buffer does not go through any of
> +	 * our ops, we need to update bytes_consumed.  The only improvement
> +	 * I can currently think of would be a nicer way to get the size of
> +	 * the decoded instruction.
> +	 */
> +	if (decode_size > 15)
> +		decode_size = 15;
> +
> +	rc = x86_decode_insn(&state->ctxt,
> +		&state->data[state->bytes_consumed], decode_size);
> +	assert(state->ctxt._eip - prev_eip > 0); // Only move forward.
> +	state->bytes_consumed += state->ctxt._eip - prev_eip;
> +
> +	debug("Decode result: %d\n", rc);
> +	if (rc != X86EMUL_CONTINUE)
> +		return rc;
> +
> +	emul_offset = state->ctxt._eip - state->ctxt.eip;
> +	debug("Instruction: ");
> +	print_n_bytes(&state->data[state->bytes_consumed - emul_offset],
> +		      emul_offset);
> +
> +	state->ctxt.exception.address = state->vcpu.cr[2];
> +
> +	// This is extraneous but explicit due to the above assert
> +	prev_eip = state->ctxt.eip;
> +	rc = x86_emulate_insn(&state->ctxt);
> +	debug("Emulation result: %d\n", rc);
> +	dump_state_after("emulating", state);
> +
> +	if (rc == -1) {
> +		return rc;
> +	} else if (state->ctxt.have_exception) {
> +		fprintf(stderr, "Emulator propagated exception: { ");
> +		fprintf(stderr, "vector: %d, ", state->ctxt.exception.vector);
> +		fprintf(stderr, "error code: %d }\n",
> +			state->ctxt.exception.error_code);
> +		rc = X86EMUL_UNHANDLEABLE;
> +	} else if (prev_eip == state->ctxt.eip) {
> +		fprintf(stderr, "ctxt.eip not advanced.\n");
> +		rc = X86EMUL_UNHANDLEABLE;
> +	}
> +
> +	if (state->bytes_consumed == state->data_available)
> +		debug("emulator is done\n");
> +
> +	return rc;
>   }
>
>   int emulate_until_complete(struct state *state)
>   {
> +	int count = 0;
> +
> +	do {
> +		count++;
> +	} while (step_emulator(state) == X86EMUL_CONTINUE);
> +
> +	debug("Emulated %d instructions\n", count);
>   	return 0;
>   }
>
> @@ -51,8 +383,16 @@ struct state *create_emulator(void)
>
>   void reset_emulator(struct state *state)
>   {
> +	unsigned char *data = state->data;
> +	size_t data_available = state->data_available;
> +
> +	memset(state, 0, sizeof(struct state));
> +
> +	state->data = data;
> +	state->data_available = data_available;
>   }
>
>   void free_emulator(struct state *state)
>   {
> +	free(state);
>   }
> diff --git a/tools/fuzz/x86ie/emulator_ops.h b/tools/fuzz/x86ie/emulator_ops.h
> index 5ae072d5f205..19f3bd0ec6a3 100644
> --- a/tools/fuzz/x86ie/emulator_ops.h
> +++ b/tools/fuzz/x86ie/emulator_ops.h
> @@ -59,7 +59,7 @@ struct state {
>   	 * Amount of bytes consumed for purposes other than instructions.
>   	 * E.G. whether a memory access should fault.
>   	 */
> -	size_t other_bytes_consumed;
> +	size_t bytes_consumed;


Why in this patch?


>
>   	/* Emulation context */
>   	struct x86_emulate_ctxt ctxt;
> @@ -75,7 +75,10 @@ struct state {
>
>   #define get_state(h) container_of(h, struct state, ctxt)
>
> -void buffer_stderr(void) __attribute__((constructor));


Same question.


> +static inline int number_of_gprs(struct x86_emulate_ctxt *c)
> +{
> +	return (c->mode == X86EMUL_MODE_PROT64 ? 16 : 8);
> +}
>
>   /*
>    * Allocates space for, and creates a `struct state`.  The user should set
> diff --git a/tools/fuzz/x86ie/scripts/afl-many b/tools/fuzz/x86ie/scripts/afl-many
> new file mode 100755
> index 000000000000..ab15258573a2
> --- /dev/null
> +++ b/tools/fuzz/x86ie/scripts/afl-many
> @@ -0,0 +1,28 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0+
> +# This is for running AFL over NPROC or `nproc` cores with normal AFL options.
> +
> +export AFL_NO_AFFINITY=1
> +
> +while [ -z "$sync_dir" ]; do
> +  while getopts ":o:" opt; do
> +    case "${opt}" in
> +      o)
> +        sync_dir="${OPTARG}"
> +        ;;
> +      *)
> +        ;;
> +    esac
> +  done
> +  ((OPTIND++))
> +  [ $OPTIND -gt $# ] && break
> +done
> +
> +for i in $(seq 1 $(( ${NPROC:-$(nproc)} - 1)) ); do
> +    taskset -c "$i" ./afl-fuzz -S "slave$i" $@ >/dev/null 2>&1 &
> +done
> +taskset -c 0 ./afl-fuzz -M master $@ >/dev/null 2>&1 &


You want to add a comment above the execution via taskset explaining 
that this is necessary for performance because the Linux scheduler 
otherwise works against you for local caching effects.


> +
> +sleep 5


I'm still not a big fan of the sleep. Is there really no way to 
determine a sane monitoring state for real? Can you maybe find it out 
from looking at the open FDs on the afl-fuzz processes via /proc/$pid/fds?

Alex


> +watch -n1 "echo \"Executing './afl-fuzz $@' on ${NPROC:-$(nproc)} cores.\" && ./afl-whatsup -s ${sync_dir}"
> +pkill afl-fuzz
> --
> 2.17.1
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ