lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20260119064731.23879-4-luis.augenstein@tngtech.com>
Date: Mon, 19 Jan 2026 07:47:20 +0100
From: Luis Augenstein <luis.augenstein@...tech.com>
To: nathan@...nel.org,
	nsc@...nel.org
Cc: linux-kbuild@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	akpm@...ux-foundation.org,
	gregkh@...uxfoundation.org,
	maximilian.huber@...tech.com,
	Luis Augenstein <luis.augenstein@...tech.com>
Subject: [PATCH 03/14] tools/sbom: add command parsers

Implement savedcmd_parser module for extracting input files
from kernel build commands.

Co-developed-by: Maximilian Huber <maximilian.huber@...tech.com>
Signed-off-by: Maximilian Huber <maximilian.huber@...tech.com>
Signed-off-by: Luis Augenstein <luis.augenstein@...tech.com>
---
 tools/sbom/sbom/cmd_graph/savedcmd_parser.py | 664 +++++++++++++++++++
 1 file changed, 664 insertions(+)
 create mode 100644 tools/sbom/sbom/cmd_graph/savedcmd_parser.py

diff --git a/tools/sbom/sbom/cmd_graph/savedcmd_parser.py b/tools/sbom/sbom/cmd_graph/savedcmd_parser.py
new file mode 100644
index 000000000..d72f781b4
--- /dev/null
+++ b/tools/sbom/sbom/cmd_graph/savedcmd_parser.py
@@ -0,0 +1,664 @@
+# SPDX-License-Identifier: GPL-2.0-only OR MIT
+# Copyright (C) 2025 TNG Technology Consulting GmbH
+
+import re
+import shlex
+from dataclasses import dataclass
+from typing import Any, Callable, Union
+import sbom.sbom_logging as sbom_logging
+from sbom.path_utils import PathStr
+
+
+class CmdParsingError(Exception):
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
+
+
+@...aclass
+class Option:
+    name: str
+    value: str | None = None
+
+
+@...aclass
+class Positional:
+    value: str
+
+
+_SUBCOMMAND_PATTERN = re.compile(r"\$\$\(([^()]*)\)")
+"""Pattern to match $$(...) blocks"""
+
+
+def _tokenize_single_command(command: str, flag_options: list[str] | None = None) -> list[Union[Option, Positional]]:
+    """
+    Parse a shell command into a list of Options and Positionals.
+    - Positional: the command and any positional arguments.
+    - Options: handles flags and options with values provided as space-separated, or equals-sign
+        (e.g., '--opt val', '--opt=val', '--flag').
+
+    Args:
+        command: Command line string.
+        flag_options: Options that are flags without values (e.g., '--verbose').
+
+    Returns:
+        List of `Option` and `Positional` objects in command order.
+    """
+
+    #  Wrap all $$(...) blocks in double quotes to prevent shlex from splitting them.
+    command_with_protected_subcommands = _SUBCOMMAND_PATTERN.sub(lambda m: f'"$$({m.group(1)})"', command)
+    tokens = shlex.split(command_with_protected_subcommands)
+
+    parsed: list[Option | Positional] = []
+    i = 0
+    while i < len(tokens):
+        token = tokens[i]
+
+        # Positional
+        if not token.startswith("-"):
+            parsed.append(Positional(token))
+            i += 1
+            continue
+
+        # Option without value (--flag)
+        if (token.startswith("-") and i + 1 < len(tokens) and tokens[i + 1].startswith("-")) or (
+            flag_options and token in flag_options
+        ):
+            parsed.append(Option(name=token))
+            i += 1
+            continue
+
+        # Option with equals sign (--opt=val)
+        if "=" in token:
+            name, value = token.split("=", 1)
+            parsed.append(Option(name=name, value=value))
+            i += 1
+            continue
+
+        # Option with space-separated value (--opt val)
+        if i + 1 < len(tokens) and not tokens[i + 1].startswith("-"):
+            parsed.append(Option(name=token, value=tokens[i + 1]))
+            i += 2
+            continue
+
+        raise CmdParsingError(f"Unrecognized token: {token} in command {command}")
+
+    return parsed
+
+
+def _tokenize_single_command_positionals_only(command: str) -> list[str]:
+    command_parts = _tokenize_single_command(command)
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    if len(positionals) != len(command_parts):
+        raise CmdParsingError(
+            f"Invalid command format: expected positional arguments only but got options in command {command}."
+        )
+    return positionals
+
+
+def _parse_dd_command(command: str) -> list[PathStr]:
+    match = re.match(r"dd.*?if=(\S+)", command)
+    if match:
+        return [match.group(1)]
+    return []
+
+
+def _parse_cat_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ["cat", input1, input2, ...]
+    return [p for p in positionals[1:]]
+
+
+def _parse_compound_command(command: str) -> list[PathStr]:
+    compound_command_parsers: list[tuple[re.Pattern[str], Callable[[str], list[PathStr]]]] = [
+        (re.compile(r"dd\b"), _parse_dd_command),
+        (re.compile(r"cat.*?\|"), lambda c: _parse_cat_command(c.split("|")[0])),
+        (re.compile(r"cat\b[^|>]*$"), _parse_cat_command),
+        (re.compile(r"echo\b"), _parse_noop),
+        (re.compile(r"\S+="), _parse_noop),
+        (re.compile(r"printf\b"), _parse_noop),
+        (re.compile(r"sed\b"), _parse_sed_command),
+        (
+            re.compile(r"(.*/)scripts/bin2c\s*<"),
+            lambda c: [input] if (input := c.split("<")[1].strip()) != "/dev/null" else [],
+        ),
+        (re.compile(r"^:$"), _parse_noop),
+    ]
+
+    match = re.match(r"\s*[\(\{](.*)[\)\}]\s*>", command, re.DOTALL)
+    if match is None:
+        raise CmdParsingError("No inner commands found for compound command")
+    input_files: list[PathStr] = []
+    inner_commands = _split_commands(match.group(1))
+    for inner_command in inner_commands:
+        if isinstance(inner_command, IfBlock):
+            sbom_logging.error(
+                "Skip parsing inner command {inner_command} of compound command because IfBlock is not supported",
+                inner_command=inner_command,
+            )
+            continue
+
+        parser = next((parser for pattern, parser in compound_command_parsers if pattern.match(inner_command)), None)
+        if parser is None:
+            sbom_logging.error(
+                "Skip parsing inner command {inner_command} of compound command because no matching parser was found",
+                inner_command=inner_command,
+            )
+            continue
+        try:
+            input_files += parser(inner_command)
+        except CmdParsingError as e:
+            sbom_logging.error(
+                "Skip parsing inner command {inner_command} of compound command because of command parsing error: {error_message}",
+                inner_command=inner_command,
+                error_message=e.message,
+            )
+    return input_files
+
+
+def _parse_objcopy_command(command: str) -> list[PathStr]:
+    command_parts = _tokenize_single_command(command, flag_options=["-S", "-w"])
+    positionals = [part.value for part in command_parts if isinstance(part, Positional)]
+    # expect positionals to be ['objcopy', input_file] or ['objcopy', input_file, output_file]
+    if not (len(positionals) == 2 or len(positionals) == 3):
+        raise CmdParsingError(
+            f"Invalid objcopy command format: expected 2 or 3 positional arguments, got {len(positionals)} ({positionals})"
+        )
+    return [positionals[1]]
+
+
+def _parse_link_vmlinux_command(command: str) -> list[PathStr]:
+    """
+    For simplicity we do not parse the `scripts/link-vmlinux.sh` script.
+    Instead the `vmlinux.a` dependency is just hardcoded for now.
+    """
+    return ["vmlinux.a"]
+
+
+def _parse_noop(command: str) -> list[PathStr]:
+    """
+    No-op parser for commands with no input files (e.g., 'rm', 'true').
+    Returns an empty list.
+    """
+    return []
+
+
+def _parse_ar_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ['ar', flags, output, input1, input2, ...]
+    flags = positionals[1]
+    if "r" not in flags:
+        # 'r' option indicates that new files are added to the archive.
+        # If this option is missing we won't find any relevant input files.
+        return []
+    return positionals[3:]
+
+
+def _parse_ar_piped_xargs_command(command: str) -> list[PathStr]:
+    printf_command, _ = command.split("|", 1)
+    positionals = _tokenize_single_command_positionals_only(printf_command.strip())
+    # expect positionals to be ['printf', '{prefix_path}%s ', input1, input2, ...]
+    prefix_path = positionals[1].rstrip("%s ")
+    return [f"{prefix_path}{filename}" for filename in positionals[2:]]
+
+
+def _parse_gcc_or_clang_command(command: str) -> list[PathStr]:
+    parts = shlex.split(command)
+    # compile mode: expect last positional argument ending in `.c` or `.S` to be the input file
+    for part in reversed(parts):
+        if not part.startswith("-") and any(part.endswith(suffix) for suffix in [".c", ".S"]):
+            return [part]
+
+    # linking mode: expect all .o files to be the inputs
+    return [p for p in parts if p.endswith(".o")]
+
+
+def _parse_rustc_command(command: str) -> list[PathStr]:
+    parts = shlex.split(command)
+    # expect last positional argument ending in `.rs` to be the input file
+    for part in reversed(parts):
+        if not part.startswith("-") and part.endswith(".rs"):
+            return [part]
+    raise CmdParsingError("Could not find .rs input source file")
+
+
+def _parse_rustdoc_command(command: str) -> list[PathStr]:
+    parts = shlex.split(command)
+    # expect last positional argument ending in `.rs` to be the input file
+    for part in reversed(parts):
+        if not part.startswith("-") and part.endswith(".rs"):
+            return [part]
+    raise CmdParsingError("Could not find .rs input source file")
+
+
+def _parse_syscallhdr_command(command: str) -> list[PathStr]:
+    command_parts = _tokenize_single_command(command.strip(), flag_options=["--emit-nr"])
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be ["sh", path/to/syscallhdr.sh, input, output]
+    return [positionals[2]]
+
+
+def _parse_syscalltbl_command(command: str) -> list[PathStr]:
+    command_parts = _tokenize_single_command(command.strip())
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be ["sh", path/to/syscalltbl.sh, input, output]
+    return [positionals[2]]
+
+
+def _parse_mkcapflags_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ["sh", path/to/mkcapflags.sh, output, input1, input2]
+    return [positionals[3], positionals[4]]
+
+
+def _parse_orc_hash_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ["sh", path/to/orc_hash.sh, '<', input, '>', output]
+    return [positionals[3]]
+
+
+def _parse_xen_hypercalls_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ["sh", path/to/xen-hypercalls.sh, output, input1, input2, ...]
+    return positionals[3:]
+
+
+def _parse_gen_initramfs_command(command: str) -> list[PathStr]:
+    command_parts = _tokenize_single_command(command)
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be ["sh", path/to/gen_initramfs.sh, input1, input2, ...]
+    return positionals[2:]
+
+
+def _parse_vdso2c_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ['vdso2c', raw_input, stripped_input, output]
+    return [positionals[1], positionals[2]]
+
+
+def _parse_ld_command(command: str) -> list[PathStr]:
+    command_parts = _tokenize_single_command(
+        command=command.strip(),
+        flag_options=[
+            "-shared",
+            "--no-undefined",
+            "--eh-frame-hdr",
+            "-Bsymbolic",
+            "-r",
+            "--no-ld-generated-unwind-info",
+            "--no-dynamic-linker",
+            "-pie",
+            "--no-dynamic-linker--whole-archive",
+            "--whole-archive",
+            "--no-whole-archive",
+            "--start-group",
+            "--end-group",
+        ],
+    )
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be ["ld", input1, input2, ...]
+    return positionals[1:]
+
+
+def _parse_sed_command(command: str) -> list[PathStr]:
+    command_parts = shlex.split(command)
+    # expect command parts to be ["sed", *, input]
+    input = command_parts[-1]
+    if input == "/dev/null":
+        return []
+    return [input]
+
+
+def _parse_awk(command: str) -> list[PathStr]:
+    command_parts = _tokenize_single_command(command)
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be ["awk", input1, input2, ...]
+    return positionals[1:]
+
+
+def _parse_nm_piped_command(command: str) -> list[PathStr]:
+    nm_command, _ = command.split("|", 1)
+    command_parts = _tokenize_single_command(
+        command=nm_command.strip(),
+        flag_options=["p", "--defined-only"],
+    )
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be ["nm", input1, input2, ...]
+    return [p for p in positionals[1:]]
+
+
+def _parse_pnm_to_logo_command(command: str) -> list[PathStr]:
+    command_parts = shlex.split(command)
+    # expect command parts to be ["pnmtologo", <options>, input]
+    return [command_parts[-1]]
+
+
+def _parse_relacheck(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ["relachek", input, log_reference]
+    return [positionals[1]]
+
+
+def _parse_perl_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command.strip())
+    # expect positionals to be ["perl", input]
+    return [positionals[1]]
+
+
+def _parse_strip_command(command: str) -> list[PathStr]:
+    command_parts = _tokenize_single_command(command, flag_options=["--strip-debug"])
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be ["strip", input1, input2, ...]
+    return positionals[1:]
+
+
+def _parse_mkpiggy_command(command: str) -> list[PathStr]:
+    mkpiggy_command, _ = command.split(">", 1)
+    positionals = _tokenize_single_command_positionals_only(mkpiggy_command)
+    # expect positionals to be ["mkpiggy", input]
+    return [positionals[1]]
+
+
+def _parse_relocs_command(command: str) -> list[PathStr]:
+    if ">" not in command:
+        # Only consider relocs commands that redirect output to a file.
+        # If there's no redirection, we assume it produces no output file and therefore has no input we care about.
+        return []
+    relocs_command, _ = command.split(">", 1)
+    command_parts = shlex.split(relocs_command)
+    # expect command_parts to be ["relocs", options, input]
+    return [command_parts[-1]]
+
+
+def _parse_mk_elfconfig_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ["mk_elfconfig", "<", input, ">", output]
+    return [positionals[2]]
+
+
+def _parse_flex_command(command: str) -> list[PathStr]:
+    parts = shlex.split(command)
+    # expect last positional argument ending in `.l` to be the input file
+    for part in reversed(parts):
+        if not part.startswith("-") and part.endswith(".l"):
+            return [part]
+    raise CmdParsingError("Could not find .l input source file in command")
+
+
+def _parse_bison_command(command: str) -> list[PathStr]:
+    parts = shlex.split(command)
+    # expect last positional argument ending in `.y` to be the input file
+    for part in reversed(parts):
+        if not part.startswith("-") and part.endswith(".y"):
+            return [part]
+    raise CmdParsingError("Could not find input .y input source file in command")
+
+
+def _parse_tools_build_command(command: str) -> list[PathStr]:
+    positionals = _tokenize_single_command_positionals_only(command)
+    # expect positionals to be ["tools/build", "input1", "input2", "input3", "output"]
+    return positionals[1:-1]
+
+
+def _parse_extract_cert_command(command: str) -> list[PathStr]:
+    command_parts = shlex.split(command)
+    # expect command parts to be [path/to/extract-cert, input, output]
+    input = command_parts[1]
+    if not input:
+        return []
+    return [input]
+
+
+def _parse_dtc_command(command: str) -> list[PathStr]:
+    wno_flags = [command_part for command_part in shlex.split(command) if command_part.startswith("-Wno-")]
+    command_parts = _tokenize_single_command(command, flag_options=wno_flags)
+    positionals = [p.value for p in command_parts if isinstance(p, Positional)]
+    # expect positionals to be [path/to/dtc, input]
+    return [positionals[1]]
+
+
+def _parse_bindgen_command(command: str) -> list[PathStr]:
+    command_parts = shlex.split(command)
+    header_file_input_paths = [part for part in command_parts if part.endswith(".h")]
+    return header_file_input_paths
+
+
+def _parse_gen_header(command: str) -> list[PathStr]:
+    command_parts = shlex.split(command)
+    # expect command parts to be ["python3", path/to/gen_headers.py, ..., "--xml", input]
+    i = next(i for i, token in enumerate(command_parts) if token == "--xml")
+    return [command_parts[i + 1]]
+
+
+# Command parser registry
+SINGLE_COMMAND_PARSERS: list[tuple[re.Pattern[str], Callable[[str], list[PathStr]]]] = [
+    # Compound commands
+    (re.compile(r"\(.*?\)\s*>", re.DOTALL), _parse_compound_command),
+    (re.compile(r"\{.*?\}\s*>", re.DOTALL), _parse_compound_command),
+    # Standard Unix utilities and system tools
+    (re.compile(r"^rm\b"), _parse_noop),
+    (re.compile(r"^mkdir\b"), _parse_noop),
+    (re.compile(r"^touch\b"), _parse_noop),
+    (re.compile(r"^cat\b.*?[\|>]"), lambda c: _parse_cat_command(c.split("|")[0].split(">")[0])),
+    (re.compile(r"^echo[^|]*$"), _parse_noop),
+    (re.compile(r"^sed.*?>"), lambda c: _parse_sed_command(c.split(">")[0])),
+    (re.compile(r"^sed\b"), _parse_noop),
+    (re.compile(r"^awk.*?<.*?>"), lambda c: [c.split("<")[1].split(">")[0]]),
+    (re.compile(r"^awk.*?>"), lambda c: _parse_awk(c.split(">")[0])),
+    (re.compile(r"^(/bin/)?true\b"), _parse_noop),
+    (re.compile(r"^(/bin/)?false\b"), _parse_noop),
+    (re.compile(r"^openssl\s+req.*?-new.*?-keyout"), _parse_noop),
+    # Compilers and code generators
+    # (C/LLVM toolchain, Rust, Flex/Bison, Bindgen, Perl, etc.)
+    (re.compile(r"^([^\s]+-)?(gcc|clang)\b"), _parse_gcc_or_clang_command),
+    (re.compile(r"^([^\s]+-)?ld(\.bfd)?\b"), _parse_ld_command),
+    (re.compile(r"^printf\b.*\| xargs ([^\s]+-)?ar\b"), _parse_ar_piped_xargs_command),
+    (re.compile(r"^([^\s]+-)?ar\b"), _parse_ar_command),
+    (re.compile(r"^([^\s]+-)?nm\b.*?\|"), _parse_nm_piped_command),
+    (re.compile(r"^([^\s]+-)?objcopy\b"), _parse_objcopy_command),
+    (re.compile(r"^([^\s]+-)?strip\b"), _parse_strip_command),
+    (re.compile(r".*?rustc\b"), _parse_rustc_command),
+    (re.compile(r".*?rustdoc\b"), _parse_rustdoc_command),
+    (re.compile(r"^flex\b"), _parse_flex_command),
+    (re.compile(r"^bison\b"), _parse_bison_command),
+    (re.compile(r"^bindgen\b"), _parse_bindgen_command),
+    (re.compile(r"^perl\b"), _parse_perl_command),
+    # Kernel-specific build scripts and tools
+    (re.compile(r"^(.*/)?link-vmlinux\.sh\b"), _parse_link_vmlinux_command),
+    (re.compile(r"sh (.*/)?syscallhdr\.sh\b"), _parse_syscallhdr_command),
+    (re.compile(r"sh (.*/)?syscalltbl\.sh\b"), _parse_syscalltbl_command),
+    (re.compile(r"sh (.*/)?mkcapflags\.sh\b"), _parse_mkcapflags_command),
+    (re.compile(r"sh (.*/)?orc_hash\.sh\b"), _parse_orc_hash_command),
+    (re.compile(r"sh (.*/)?xen-hypercalls\.sh\b"), _parse_xen_hypercalls_command),
+    (re.compile(r"sh (.*/)?gen_initramfs\.sh\b"), _parse_gen_initramfs_command),
+    (re.compile(r"sh (.*/)?checkundef\.sh\b"), _parse_noop),
+    (re.compile(r"(.*/)?vdso2c\b"), _parse_vdso2c_command),
+    (re.compile(r"^(.*/)?mkpiggy.*?>"), _parse_mkpiggy_command),
+    (re.compile(r"^(.*/)?relocs\b"), _parse_relocs_command),
+    (re.compile(r"^(.*/)?mk_elfconfig.*?<.*?>"), _parse_mk_elfconfig_command),
+    (re.compile(r"^(.*/)?tools/build\b"), _parse_tools_build_command),
+    (re.compile(r"^(.*/)?certs/extract-cert"), _parse_extract_cert_command),
+    (re.compile(r"^(.*/)?scripts/dtc/dtc\b"), _parse_dtc_command),
+    (re.compile(r"^(.*/)?pnmtologo\b"), _parse_pnm_to_logo_command),
+    (re.compile(r"^(.*/)?kernel/pi/relacheck"), _parse_relacheck),
+    (re.compile(r"^drivers/gpu/drm/radeon/mkregtable"), lambda c: [c.split(" ")[1]]),
+    (re.compile(r"(.*/)?genheaders\b"), _parse_noop),
+    (re.compile(r"^(.*/)?mkcpustr\s+>"), _parse_noop),
+    (re.compile(r"^(.*/)polgen\b"), _parse_noop),
+    (re.compile(r"make -f .*/arch/x86/Makefile\.postlink"), _parse_noop),
+    (re.compile(r"^(.*/)?raid6/mktables\s+>"), _parse_noop),
+    (re.compile(r"^(.*/)?objtool\b"), _parse_noop),
+    (re.compile(r"^(.*/)?module/gen_test_kallsyms.sh"), _parse_noop),
+    (re.compile(r"^(.*/)?gen_header.py"), _parse_gen_header),
+    (re.compile(r"^(.*/)?scripts/rustdoc_test_gen"), _parse_noop),
+]
+
+
+# If Block pattern to match a simple, single-level if-then-fi block. Nested If blocks are not supported.
+IF_BLOCK_PATTERN = re.compile(
+    r"""
+    ^if(.*?);\s*         # Match 'if <condition>;' (non-greedy)
+    then(.*?);\s*        # Match 'then <body>;' (non-greedy)
+    fi\b                 # Match 'fi'
+    """,
+    re.VERBOSE,
+)
+
+
+@...aclass
+class IfBlock:
+    condition: str
+    then_statement: str
+
+
+def _unwrap_outer_parentheses(s: str) -> str:
+    s = s.strip()
+    if not (s.startswith("(") and s.endswith(")")):
+        return s
+
+    count = 0
+    for i, char in enumerate(s):
+        if char == "(":
+            count += 1
+        elif char == ")":
+            count -= 1
+            # If count is 0 before the end, outer parentheses don't match
+            if count == 0 and i != len(s) - 1:
+                return s
+
+    # outer parentheses do match, unwrap once
+    return _unwrap_outer_parentheses(s[1:-1])
+
+
+def _find_first_top_level_command_separator(
+    commands: str, separators: list[str] = [";", "&&"]
+) -> tuple[int | None, int | None]:
+    in_single_quote = False
+    in_double_quote = False
+    in_curly_braces = 0
+    in_braces = 0
+    for i, char in enumerate(commands):
+        if char == "'" and not in_double_quote:
+            # Toggle single quote state (unless inside double quotes)
+            in_single_quote = not in_single_quote
+        elif char == '"' and not in_single_quote:
+            # Toggle double quote state (unless inside single quotes)
+            in_double_quote = not in_double_quote
+
+        if in_single_quote or in_double_quote:
+            continue
+
+        # Toggle braces state
+        if char == "{":
+            in_curly_braces += 1
+        if char == "}":
+            in_curly_braces -= 1
+
+        if char == "(":
+            in_braces += 1
+        if char == ")":
+            in_braces -= 1
+
+        if in_curly_braces > 0 or in_braces > 0:
+            continue
+
+        # return found separator position and separator length
+        for separator in separators:
+            if commands[i : i + len(separator)] == separator:
+                return i, len(separator)
+
+    return None, None
+
+
+def _split_commands(commands: str) -> list[str | IfBlock]:
+    """
+    Splits a string of command-line commands into individual parts.
+
+    This function handles:
+    - Top-level command separators (e.g., `;` and `&&`) to split multiple commands.
+    - Conditional if-blocks, returning them as `IfBlock` instances.
+    - Preserves the order of commands and trims whitespace.
+
+    Args:
+        commands (str): The raw command string.
+
+    Returns:
+        list[str | IfBlock]: A list of single commands or `IfBlock` objects.
+    """
+    single_commands: list[str | IfBlock] = []
+    remaining_commands = _unwrap_outer_parentheses(commands)
+    while len(remaining_commands) > 0:
+        remaining_commands = remaining_commands.strip()
+
+        # if block
+        matched_if = IF_BLOCK_PATTERN.match(remaining_commands)
+        if matched_if:
+            condition, then_statement = matched_if.groups()
+            single_commands.append(IfBlock(condition.strip(), then_statement.strip()))
+            full_matched = matched_if.group(0)
+            remaining_commands = remaining_commands.removeprefix(full_matched).lstrip("; \n")
+            continue
+
+        # command until next separator
+        separator_position, separator_length = _find_first_top_level_command_separator(remaining_commands)
+        if separator_position is not None and separator_length is not None:
+            single_commands.append(remaining_commands[:separator_position].strip())
+            remaining_commands = remaining_commands[separator_position + separator_length :].strip()
+            continue
+
+        # single last command
+        single_commands.append(remaining_commands)
+        break
+
+    return single_commands
+
+
+def parse_inputs_from_commands(commands: str, fail_on_unknown_build_command: bool) -> list[PathStr]:
+    """
+    Extract input files referenced in a set of command-line commands.
+
+    Args:
+        commands (str): Command line expression to parse.
+        fail_on_unknown_build_command (bool): Whether to fail if an unknown build command is encountered. If False, errors are logged as warnings.
+
+    Returns:
+        list[PathStr]: List of input file paths required by the commands.
+    """
+
+    def log_error_or_warning(message: str, /, **kwargs: Any) -> None:
+        if fail_on_unknown_build_command:
+            sbom_logging.error(message, **kwargs)
+        else:
+            sbom_logging.warning(message, **kwargs)
+
+    input_files: list[PathStr] = []
+    for single_command in _split_commands(commands):
+        if isinstance(single_command, IfBlock):
+            inputs = parse_inputs_from_commands(single_command.then_statement, fail_on_unknown_build_command)
+            if inputs:
+                log_error_or_warning(
+                    "Skipped parsing command {then_statement} because input files in IfBlock 'then' statement are not supported",
+                    then_statement=single_command.then_statement,
+                )
+            continue
+
+        matched_parser = next(
+            (parser for pattern, parser in SINGLE_COMMAND_PARSERS if pattern.match(single_command)), None
+        )
+        if matched_parser is None:
+            log_error_or_warning(
+                "Skipped parsing command {single_command} because no matching parser was found",
+                single_command=single_command,
+            )
+            continue
+        try:
+            inputs = matched_parser(single_command)
+            input_files.extend(inputs)
+        except CmdParsingError as e:
+            log_error_or_warning(
+                "Skipped parsing command {single_command} because of command parsing error: {error_message}",
+                single_command=single_command,
+                error_message=e.message,
+            )
+
+    return [input.strip().rstrip("/") for input in input_files]
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ