lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 14 Jul 2023 13:58:13 +0800
From:   Zhangjin Wu <falcon@...ylab.org>
To:     w@....eu
Cc:     arnd@...db.de, falcon@...ylab.org, linux-kernel@...r.kernel.org,
        linux-kselftest@...r.kernel.org, thomas@...ch.de
Subject: Re: [PATCH v3 02/11] tools/nolibc: add new crt.h with _start_c

Hi, Willy, Thomas

> Hi Zhangjin,
>
> I haven't reviewed the rest yet but regarding this point:
>
> On Thu, Jul 13, 2023 at 02:12:27PM +0800, Zhangjin Wu wrote:
> > > > +	/* find auxv */
> > > > +	i = 0;
> > > > +	while (envp[i])
> > > > +		i++;
> > > > +	_auxv = (void *)(envp + i + 1);
> > >
> > > Could be simplified a bit:
> > >
> > > _auxv = (void *) envp;
> > > while (_auxv)
> > > 	_auxv++;
> > >
> >
> > Yeah, it is better, but needs a little change.
> >
> >     _auxv = (void *) envp;
> >     while (*_auxv)
> > 	_auxv++;
> >     _auxv++;
>
> Or just:
>
>     _auxv = (void*)environ;
>     while (*_auxv++)
>          ;
>
> or:
>
>     for (_auxv = (void*)environ; *_auxv++; )
>          ;
>
> Please also have a look at the output code, because at low optimization
> levels, compilers sometimes produce a better code with a local variable
> than with a global variable in a loop. Thus I wouldn't be that much
> surprised if at -O0 or -O1 you'd see slightly more compact code using:
>

Very good suggestion, I did find some interesting results, after
removing some local variables, although the text size shrinks, but the
total size is the same with -O0/1/2/3/s.

Here is the diff with -O0/1/2/3/s (a REPORT_SIZE macro may be required for
this):

    $ diff -Nubr startup.old.txt startup.new.txt
    --- startup.old.txt	2023-07-14 10:22:45.990413661 +0800
    +++ startup.new.txt	2023-07-14 10:22:52.278869146 +0800
    @@ -2,34 +2,34 @@
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  46872      88      80   47040    b7c0 nolibc-test
    +  46836      88      80   47004    b79c nolibc-test
     wc -c nolibc-test
     58016 nolibc-test
      O1:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  27298      88      72   27458    6b42 nolibc-test
    +  27287      88      72   27447    6b37 nolibc-test
     wc -c nolibc-test
     37536 nolibc-test
      O2:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  25688      88      72   25848    64f8 nolibc-test
    +  25672      88      72   25832    64e8 nolibc-test
     wc -c nolibc-test
     37536 nolibc-test
      O3:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  44020      88      72   44180    ac94 nolibc-test
    +  44004      88      72   44164    ac84 nolibc-test
     wc -c nolibc-test
     53920 nolibc-test
      Os:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  20887      88      72   21047    5237 nolibc-test
    +  20884      88      72   21044    5234 nolibc-test
     wc -c nolibc-test
     33440 nolibc-test

The code with local variables:

    void _start_c(long *sp)
    {
    	int argc, i;
    	char **argv;
    	char **envp;
    	/* silence potential warning: conflicting types for 'main' */
    	int _nolibc_main(int, char **, char **) __asm__ ("main");
    
    	/*
    	 * sp  :    argc          <-- argument count, required by main()
    	 * argv:    argv[0]       <-- argument vector, required by main()
    	 *          argv[1]
    	 *          ...
    	 *          argv[argc-1]
    	 *          null
    	 * environ: environ[0]    <-- environment variables, required by main() and getenv()
    	 *          environ[1]
    	 *          ...
    	 *          null
    	 * _auxv:   _auxv[0]      <-- auxiliary vector, required by getauxval()
    	 *          _auxv[1]
    	 *          ...
    	 *          null
    	 */
    
    	/* assign argc and argv */
    	argc = *sp;
    	argv = (void *)(sp + 1);
    
    	/* find environ */
    	environ = envp = argv + argc + 1;
    
    	/* find _auxv */
    	for (i = 0; *(envp + i); i++)
    		;
    	_auxv = (void *)(envp + i + 1);
    
    	/* go to application */
    	exit(_nolibc_main(argc, argv, envp));
    }

The code with global variables:

    void _start_c(long *sp)
    {
    	int argc;
    	char **argv;
    	/* silence potential warning: conflicting types for 'main' */
    	int _nolibc_main(int, char **, char **) __asm__ ("main");

    	/*
    	 * sp  :    argc          <-- argument count, required by main()
    	 * argv:    argv[0]       <-- argument vector, required by main()
    	 *          argv[1]
    	 *          ...
    	 *          argv[argc-1]
    	 *          null
    	 * environ: environ[0]    <-- environment variables, required by main() and getenv()
    	 *          environ[1]
    	 *          ...
    	 *          null
    	 * _auxv:   _auxv[0]      <-- auxiliary vector, required by getauxval()
    	 *          _auxv[1]
    	 *          ...
    	 *          null
    	 */

    	/* assign argc and argv */
    	argc = *sp;
    	argv = (void *)(sp + 1);

    	/* find environ */
    	environ = argv + argc + 1;

    	/* find _auxv */
    	for (_auxv = (void *)environ; *_auxv++;)
    		;

    	/* go to application */
    	exit(_nolibc_main(argc, argv, environ));
    }

Which one do you prefer? the one with local variables may be more readable (not
that much), the one with global variables has smaller text size (and therefore
smaller memory footprint).

BTW, just found an arch-<ARCH>.h bug with -O0, seems the
'optimize("omit-frame-pointer")' attribute not really work as expected with
-O0. It uses frame pointer for _start eventually and breaks the stack pointer
variable (a push 'rbp' inserted at the begging of _start, so, the real rsp has
an offset), so, therefore, it is not able to get the right argc, argv, environ
and _auxv with -O0 currently. A solution is reverting _start to pure assembly.

For the above tests, I manually reverted the arch-x86_64.h to:

    __asm__(
            ".text\n"
            ".weak _start\n"
            "_start:\n"
    #ifdef _NOLIBC_STACKPROTECTOR
            "call __stack_chk_init\n" /* initialize stack protector                      */
    #endif
            "xor  %ebp, %ebp\n"       /* zero the stack frame                            */
            "mov  %rsp, %rdi\n"       /* save stack pointer to %rdi, as arg1 of _start_c */
            "and  $-16, %rsp\n"       /* %rsp must be 16-byte aligned before call        */
            "call _start_c\n"         /* transfer to c runtime                           */
            "hlt\n"                   /* ensure it does not return                       */
    );


'man gcc' shows:

    Most optimizations are completely disabled at -O0 or if an -O level is not set on the command line, even if individual optimization flags are specified.

To want -O0 work again, since now we have C _start_c, is it ok for us to revert
the commit 7f8548589661 ("tools/nolibc: make compiler and assembler agree on
the section around _start") and the later __no_stack_protector changes?

At the same time, to verify such issues, as Thomas suggested, in this series,
we may need to add more startup tests to verify argc, argv, environ, _auxv, do
we need to add a standalone run_startup (or run_crt) test entry just like
run_syscall? or, let's simply add more in the run_stdlib, like the environ test
added by Thomas.  seems, the argc test is necessary one currently missing (argc
>= 1):

    CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;

As we summarized before, the related test cases are:

argv0:

    CASE_TEST(chmod_argv0);       EXPECT_SYSZR(1, chmod(test_argv0, 0555)); break;
    CASE_TEST(chroot_exe);        EXPECT_SYSER(1, chroot(test_argv0), -1, ENOTDIR); break;

environ:

    CASE_TEST(chdir_root);        EXPECT_SYSZR(1, chdir("/")); chdir(getenv("PWD")); break;
    CASE_TEST(environ);            EXPECT_PTREQ(1, environ, test_envp); break;
    CASE_TEST(getenv_TERM);        EXPECT_STRNZ(1, getenv("TERM")); break;
    CASE_TEST(getenv_blah);        EXPECT_STRZR(1, getenv("blah")); break;

auxv:

    CASE_TEST(getpagesize);       EXPECT_SYSZR(1, test_getpagesize()); break;

The above tests are in different test group and are not aimed to startup test,
we'd better add a run_startup (or run_crt) test group before any other tests,
it is a requiremnt of the others, we at least have these ones:

    +int run_startup(int min, int max)
    +{
    +       int test;
    +       int tmp;
    +       int ret = 0;
    +
    +       for (test = min; test >= 0 && test <= max; test++) {
    +               int llen = 0; /* line length */
    +
    +               /* avoid leaving empty lines below, this will insert holes into
    +                * test numbers.
    +                */
    +               switch (test + __LINE__ + 1) {
    +               CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
    +               CASE_TEST(argv_addr);          EXPECT_PTRNZ(1, test_argv); break;
    +               CASE_TEST(argv_total);         EXPECT_EQ(1, environ - test_argv - 1, test_argc); break;
    +               CASE_TEST(argv0_addr);         EXPECT_PTRNZ(1, argv0); break;
    +               CASE_TEST(argv0_str);          EXPECT_STRNZ(1, argv0); break;
    +               CASE_TEST(argv0_len);          EXPECT_GE(1, strlen(argv0), 1); break;
    +               CASE_TEST(environ_addr);       EXPECT_PTRNZ(1, environ); break;
    +               CASE_TEST(environ_envp);       EXPECT_PTREQ(1, environ, test_envp); break;
    +               CASE_TEST(environ_total);      EXPECT_GE(1, (void *)_auxv - (void *)environ - 1, 1); break;
    +               CASE_TEST(_auxv_addr);         EXPECT_PTRNZ(1, _auxv); break;
    +               case __LINE__:
    +                       return ret; /* must be last */
    +               /* note: do not set any defaults so as to permit holes above */
    +               }
    +       }
    +       return ret;
    +}

Any more?

>        /* find envp */
>        environ = argv + argc + 1;
>
>        /* find auxv */
>        for (auxv = environ; *auxv++)
>                 ;
>        _auxv = auxv;
>
> than:
>        /* find envp */
>        envp = argv + argc + 1;
>        environ = envp;
>
>        /* find auxv */
>        for (_auxv = environ; *_auxv++)
>                 ;
>

Great, `*_auxv++` is a very good trick to avoid duplicated _auxv++, although it
is a little hard to read (not that hard in reality) ;-)

> Since it's going to become generic code, it's worth running a few tests
> to see how to best polish it.
>

Yes, I focused on the big shrinking itself but forgot to polish the _start_c
itself ;-)

Best regards,
Zhangjin

> Thanks,
> Willy

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ