Message-ID: <mhng-0770bfe6-73bd-4b8a-9fa7-142ed95a6974@palmer-si-x1c4>
Date:   Fri, 25 Jan 2019 12:21:07 -0800 (PST)
From:   Palmer Dabbelt <palmer@...ive.com>
To:     bjorn.topel@...il.com, Jim Wilson <jimw@...ive.com>
CC:     Christoph Hellwig <hch@...radead.org>,
        linux-riscv@...ts.infradead.org, davidlee@...ive.com,
        daniel@...earbox.net, netdev@...r.kernel.org
Subject:     Re: [RFC PATCH 1/3] riscv: set HAVE_EFFICIENT_UNALIGNED_ACCESS

On Tue, 15 Jan 2019 08:06:47 PST (-0800), bjorn.topel@...il.com wrote:
> Den tis 15 jan. 2019 kl 16:39 skrev Christoph Hellwig <hch@...radead.org>:
>>
>> Hmm, while the RISC-V spec requires misaligned load/store support,
>> who says they are efficient?  Maybe add a little comment that says
>> on which cpus they are efficient.
>
> Good point! :-) I need to check how other architectures do this.
> Enabling it for *all* RV64 is probably not correct.

RISC-V mandates that misaligned memory accesses execute correctly in S-mode, 
but allows them to be trapped and emulated in M-mode.  As a result they can be 
quite slow.  Every microarchitecture I know of traps misaligned accesses into 
M-mode, so for now we're probably safe just unconditionally saying they're 
slow.
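
To put that concretely, here's a quick illustrative sketch (mine, not from any 
patch): the direct load is a single lw that architecturally produces the right 
answer everywhere, but on cores that don't handle misalignment in hardware it 
goes through the M-mode emulation path and ends up orders of magnitude slower 
than an aligned load.

    /* Illustrative only.  Two ways to read a 32-bit value from a pointer
       that may not be 4-byte aligned.  */
    #include <stdint.h>
    #include <string.h>

    static inline uint32_t load_u32_direct (const void *p)
    {
      /* A single lw (and technically undefined behavior in ISO C if p
         really is misaligned).  Correct result, but trapped and emulated
         in M-mode on the cores discussed above.  */
      return *(const uint32_t *) p;
    }

    static inline uint32_t load_u32_memcpy (const void *p)
    {
      /* Lets the compiler pick the sequence: a single lw when it believes
         misaligned accesses are cheap, byte loads otherwise.  */
      uint32_t v;
      memcpy (&v, p, sizeof v);
      return v;
    }

HAVE_EFFICIENT_UNALIGNED_ACCESS is essentially the kernel asserting that code 
shaped like the first version is fine to use freely.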

GCC does have a tuning parameter that says "are misaligned accesses fast?", 
which we set depending on -mtune, but it doesn't appear to be exposed as a 
preprocessor macro.  I think it's probably best to just expose the tuning 
parameter as a macro so software that needs to know this has one standard way 
of finding out.

Jim, would you be opposed to something like this?

    diff --git a/riscv-c-api.md b/riscv-c-api.md
    index 0b0236c38826..a790f5cc23ee 100644
    --- a/riscv-c-api.md
    +++ b/riscv-c-api.md
    @@ -52,6 +52,10 @@ https://creativecommons.org/licenses/by/4.0/.
     * `__riscv_cmodel_medlow`
     * `__riscv_cmodel_medany`
     * `__riscv_cmodel_pic`
    +* `__riscv_tune_misaligned_load_cost`: The number of cycles a word-sized
    +  misaligned load will take.
    +* `__riscv_tune_misaligned_store_cost`: The number of cycles a word-sized
    +  misaligned store will take.
    
     ## Function Attributes

Which I think shouldn't be too much of a headache to implement in GCC -- I 
haven't compiled this yet, though...

    diff --git a/gcc/config/riscv/riscv-c.c b/gcc/config/riscv/riscv-c.c
    index ca72de74a7b4..fa71a4a22104 100644
    --- a/gcc/config/riscv/riscv-c.c
    +++ b/gcc/config/riscv/riscv-c.c
    @@ -98,4 +98,9 @@ riscv_cpu_cpp_builtins (cpp_reader *pfile)
           builtin_define ("__riscv_cmodel_pic");
           break;
         }
    +
    +    builtin_define_with_int_value ("__riscv_tune_misaligned_load_cost",
    +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);
    +    builtin_define_with_int_value ("__riscv_tune_misaligned_store_cost",
    +                                   riscv_tune_info->slow_unaligned_access ? 1024 : 1);
     }
    diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
    index a3ab6cec33b4..d58a307d27b4 100644
    --- a/gcc/config/riscv/riscv-opts.h
    +++ b/gcc/config/riscv/riscv-opts.h
    @@ -39,4 +39,6 @@ enum riscv_code_model {
     };
     extern enum riscv_code_model riscv_cmodel;
     
    +extern const struct riscv_tune_info *riscv_tune_info;
    +
     #endif /* ! GCC_RISCV_OPTS_H */
    diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
    index bf4571d91b8c..671c2ddaaa0f 100644
    --- a/gcc/config/riscv/riscv.c
    +++ b/gcc/config/riscv/riscv.c
    @@ -226,7 +226,7 @@ struct riscv_cpu_info {
       const char *name;
     
       /* Tuning parameters for this CPU.  */
    -  const struct riscv_tune_info *tune_info;
    +  const struct riscv_tune_info *riscv_tune_info;
     };
     
     /* Global variables for machine-dependent things.  */
    @@ -243,7 +243,7 @@ unsigned riscv_stack_boundary;
     static int epilogue_cfa_sp_offset;
     
     /* Which tuning parameters to use.  */
    -static const struct riscv_tune_info *tune_info;
    +const struct riscv_tune_info *riscv_tune_info;
     
     /* Index R is the smallest register class that contains register R.  */
     const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = {
    @@ -1528,7 +1528,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	 instructions it needs.  */
           if ((cost = riscv_address_insns (XEXP (x, 0), mode, true)) > 0)
     	{
    -	  *total = COSTS_N_INSNS (cost + tune_info->memory_cost);
    +	  *total = COSTS_N_INSNS (cost + riscv_tune_info->memory_cost);
     	  return true;
     	}
           /* Otherwise use the default handling.  */
    @@ -1592,7 +1592,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	 mode instead.  */
           mode = GET_MODE (XEXP (x, 0));
           if (float_mode_p)
    -	*total = tune_info->fp_add[mode == DFmode];
    +	*total = riscv_tune_info->fp_add[mode == DFmode];
           else
     	*total = riscv_binary_cost (x, 1, 3);
           return false;
    @@ -1601,14 +1601,14 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case ORDERED:
           /* (FEQ(A, A) & FEQ(B, B)) compared against 0.  */
           mode = GET_MODE (XEXP (x, 0));
    -      *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (2);
    +      *total = riscv_tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (2);
           return false;
     
         case UNEQ:
         case LTGT:
           /* (FEQ(A, A) & FEQ(B, B)) compared against FEQ(A, B).  */
           mode = GET_MODE (XEXP (x, 0));
    -      *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (3);
    +      *total = riscv_tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (3);
           return false;
     
         case UNGE:
    @@ -1617,13 +1617,13 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case UNLT:
           /* FLT or FLE, but guarded by an FFLAGS read and write.  */
           mode = GET_MODE (XEXP (x, 0));
    -      *total = tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (4);
    +      *total = riscv_tune_info->fp_add[mode == DFmode] + COSTS_N_INSNS (4);
           return false;
     
         case MINUS:
         case PLUS:
           if (float_mode_p)
    -	*total = tune_info->fp_add[mode == DFmode];
    +	*total = riscv_tune_info->fp_add[mode == DFmode];
           else
     	*total = riscv_binary_cost (x, 1, 4);
           return false;
    @@ -1633,7 +1633,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	rtx op = XEXP (x, 0);
     	if (GET_CODE (op) == FMA && !HONOR_SIGNED_ZEROS (mode))
     	  {
    -	    *total = (tune_info->fp_mul[mode == DFmode]
    +	    *total = (riscv_tune_info->fp_mul[mode == DFmode]
     		      + set_src_cost (XEXP (op, 0), mode, speed)
     		      + set_src_cost (XEXP (op, 1), mode, speed)
     		      + set_src_cost (XEXP (op, 2), mode, speed));
    @@ -1642,23 +1642,23 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
           }
     
           if (float_mode_p)
    -	*total = tune_info->fp_add[mode == DFmode];
    +	*total = riscv_tune_info->fp_add[mode == DFmode];
           else
     	*total = COSTS_N_INSNS (GET_MODE_SIZE (mode) > UNITS_PER_WORD ? 4 : 1);
           return false;
     
         case MULT:
           if (float_mode_p)
    -	*total = tune_info->fp_mul[mode == DFmode];
    +	*total = riscv_tune_info->fp_mul[mode == DFmode];
           else if (!TARGET_MUL)
     	/* Estimate the cost of a library call.  */
     	*total = COSTS_N_INSNS (speed ? 32 : 6);
           else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    -	*total = 3 * tune_info->int_mul[0] + COSTS_N_INSNS (2);
    +	*total = 3 * riscv_tune_info->int_mul[0] + COSTS_N_INSNS (2);
           else if (!speed)
     	*total = COSTS_N_INSNS (1);
           else
    -	*total = tune_info->int_mul[mode == DImode];
    +	*total = riscv_tune_info->int_mul[mode == DImode];
           return false;
     
         case DIV:
    @@ -1666,7 +1666,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case MOD:
           if (float_mode_p)
     	{
    -	  *total = tune_info->fp_div[mode == DFmode];
    +	  *total = riscv_tune_info->fp_div[mode == DFmode];
     	  return false;
     	}
           /* Fall through.  */
    @@ -1677,7 +1677,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
     	/* Estimate the cost of a library call.  */
     	*total = COSTS_N_INSNS (speed ? 32 : 6);
           else if (speed)
    -	*total = tune_info->int_div[mode == DImode];
    +	*total = riscv_tune_info->int_div[mode == DImode];
           else
     	*total = COSTS_N_INSNS (1);
           return false;
    @@ -1699,11 +1699,11 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
         case FIX:
         case FLOAT_EXTEND:
         case FLOAT_TRUNCATE:
    -      *total = tune_info->fp_add[mode == DFmode];
    +      *total = riscv_tune_info->fp_add[mode == DFmode];
           return false;
     
         case FMA:
    -      *total = (tune_info->fp_mul[mode == DFmode]
    +      *total = (riscv_tune_info->fp_mul[mode == DFmode]
     		+ set_src_cost (XEXP (x, 0), mode, speed)
     		+ set_src_cost (XEXP (x, 1), mode, speed)
     		+ set_src_cost (XEXP (x, 2), mode, speed));
    @@ -4165,7 +4165,7 @@ riscv_class_max_nregs (reg_class_t rclass, machine_mode mode)
     static int
     riscv_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in)
     {
    -  return (tune_info->memory_cost
    +  return (riscv_tune_info->memory_cost
     	  + memory_move_secondary_cost (mode, rclass, in));
     }
     
    @@ -4174,7 +4174,7 @@ riscv_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in)
     static int
     riscv_issue_rate (void)
     {
    -  return tune_info->issue_rate;
    +  return riscv_tune_info->issue_rate;
     }
     
     /* Implement TARGET_ASM_FILE_START.  */
    @@ -4307,22 +4307,22 @@ riscv_option_override (void)
       /* Handle -mtune.  */
       cpu = riscv_parse_cpu (riscv_tune_string ? riscv_tune_string :
     			 RISCV_TUNE_STRING_DEFAULT);
    -  tune_info = optimize_size ? &optimize_size_tune_info : cpu->tune_info;
    +  riscv_tune_info = optimize_size ? &optimize_size_tune_info : cpu->riscv_tune_info;
     
       /* Use -mtune's setting for slow_unaligned_access, even when optimizing
          for size.  For architectures that trap and emulate unaligned accesses,
          the performance cost is too great, even for -Os.  Similarly, if
          -m[no-]strict-align is left unspecified, heed -mtune's advice.  */
    -  riscv_slow_unaligned_access_p = (cpu->tune_info->slow_unaligned_access
    +  riscv_slow_unaligned_access_p = (cpu->riscv_tune_info->slow_unaligned_access
     				   || TARGET_STRICT_ALIGN);
       if ((target_flags_explicit & MASK_STRICT_ALIGN) == 0
    -      && cpu->tune_info->slow_unaligned_access)
    +      && cpu->riscv_tune_info->slow_unaligned_access)
         target_flags |= MASK_STRICT_ALIGN;
     
       /* If the user hasn't specified a branch cost, use the processor's
          default.  */
       if (riscv_branch_cost == 0)
    -    riscv_branch_cost = tune_info->branch_cost;
    +    riscv_branch_cost = riscv_tune_info->branch_cost;
     
       /* Function to allocate machine-dependent function status.  */
       init_machine_status = &riscv_init_machine_status;
    

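For reference, this is roughly how I'd expect software to consume the macros.  
An untested sketch: the function names and the threshold of 4 cycles are made 
up; only __riscv_tune_misaligned_load_cost comes from the proposal above.

    /* Untested sketch: choose between a byte-at-a-time and a word-at-a-time
       loop over a possibly misaligned buffer at compile time.  */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static uint32_t sum_bytes (const unsigned char *p, size_t n)
    {
      uint32_t s = 0;
      while (n--)
        s += *p++;
      return s;
    }

    static uint32_t sum_words (const unsigned char *p, size_t n)
    {
      uint32_t s = 0;
      for (; n >= sizeof (uint32_t); n -= sizeof (uint32_t), p += sizeof (uint32_t))
        {
          uint32_t w;
          memcpy (&w, p, sizeof w);   /* ideally a single, possibly misaligned, lw */
          s += (w & 0xff) + ((w >> 8) & 0xff) + ((w >> 16) & 0xff) + (w >> 24);
        }
      return s + sum_bytes (p, n);
    }

    uint32_t sum (const void *buf, size_t n)
    {
    #if defined (__riscv_tune_misaligned_load_cost) \
        && __riscv_tune_misaligned_load_cost <= 4
      return sum_words (buf, n);   /* misaligned loads are cheap enough */
    #else
      return sum_bytes (buf, n);   /* avoid the M-mode emulation path */
    #endif
    }

The exact threshold obviously isn't the point; the point is that the comparison 
has somewhere standard to get its numbers from.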