lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20260119064731.23879-10-luis.augenstein@tngtech.com>
Date: Mon, 19 Jan 2026 07:47:26 +0100
From: Luis Augenstein <luis.augenstein@...tech.com>
To: nathan@...nel.org,
	nsc@...nel.org
Cc: linux-kbuild@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	akpm@...ux-foundation.org,
	gregkh@...uxfoundation.org,
	maximilian.huber@...tech.com,
	Luis Augenstein <luis.augenstein@...tech.com>
Subject: [PATCH 09/14] tools/sbom: collect file metadata

Implement the kernel_file module that collects file metadata,
including license identifier for source files, SHA-256 hash,
Git blob object ID, an estimation of the file type, and
whether files belong to the source, build, or output SBOM.

Co-developed-by: Maximilian Huber <maximilian.huber@...tech.com>
Signed-off-by: Maximilian Huber <maximilian.huber@...tech.com>
Signed-off-by: Luis Augenstein <luis.augenstein@...tech.com>
---
 .../sbom/sbom/spdx_graph/build_spdx_graphs.py |   2 +
 tools/sbom/sbom/spdx_graph/kernel_file.py     | 310 ++++++++++++++++++
 2 files changed, 312 insertions(+)
 create mode 100644 tools/sbom/sbom/spdx_graph/kernel_file.py

diff --git a/tools/sbom/sbom/spdx_graph/build_spdx_graphs.py b/tools/sbom/sbom/spdx_graph/build_spdx_graphs.py
index 9c47258a3..0f95f99d5 100644
--- a/tools/sbom/sbom/spdx_graph/build_spdx_graphs.py
+++ b/tools/sbom/sbom/spdx_graph/build_spdx_graphs.py
@@ -7,6 +7,7 @@ from typing import Protocol
 from sbom.config import KernelSpdxDocumentKind
 from sbom.cmd_graph import CmdGraph
 from sbom.path_utils import PathStr
+from sbom.spdx_graph.kernel_file import KernelFileCollection
 from sbom.spdx_graph.spdx_graph_model import SpdxGraph, SpdxIdGeneratorCollection
 from sbom.spdx_graph.shared_spdx_elements import SharedSpdxElements
 
@@ -36,4 +37,5 @@ def build_spdx_graphs(
         Dictionary of SPDX graphs
     """
     shared_elements = SharedSpdxElements.create(spdx_id_generators.base, config.created)
+    kernel_files = KernelFileCollection.create(cmd_graph, config.obj_tree, config.src_tree, spdx_id_generators)
     return {}
diff --git a/tools/sbom/sbom/spdx_graph/kernel_file.py b/tools/sbom/sbom/spdx_graph/kernel_file.py
new file mode 100644
index 000000000..84582567b
--- /dev/null
+++ b/tools/sbom/sbom/spdx_graph/kernel_file.py
@@ -0,0 +1,310 @@
+# SPDX-License-Identifier: GPL-2.0-only OR MIT
+# Copyright (C) 2025 TNG Technology Consulting GmbH
+
+from dataclasses import dataclass
+from enum import Enum
+import hashlib
+import os
+import re
+from sbom.cmd_graph import CmdGraph
+from sbom.path_utils import PathStr, is_relative_to
+from sbom.spdx import SpdxId, SpdxIdGenerator
+from sbom.spdx.core import Hash
+from sbom.spdx.software import ContentIdentifier, File, SoftwarePurpose
+import sbom.sbom_logging as sbom_logging
+from sbom.spdx_graph.spdx_graph_model import SpdxIdGeneratorCollection
+
+
+class KernelFileLocation(Enum):
+    """Placement of a file with respect to the kernel source and object trees."""
+
+    # File is located in the source tree.
+    SOURCE_TREE = "source_tree"
+    # File is located in the object tree.
+    OBJ_TREE = "obj_tree"
+    # File is located outside both source and object trees.
+    EXTERNAL = "external"
+    # File is located in a folder that is both source and object tree.
+    BOTH = "both"
+
+
+@dataclass
+class KernelFile:
+    """Kernel-specific metadata used to generate an SPDX File element."""
+
+    absolute_path: PathStr
+    """Absolute path of the file."""
+    file_location: KernelFileLocation
+    """Location of the file relative to the source/object trees."""
+    name: str
+    """Name of the file element. Relative to the source tree if
+    file_location equals SOURCE_TREE and relative to the object tree if
+    file_location equals OBJ_TREE or BOTH. If file_location equals EXTERNAL,
+    the absolute path is used."""
+    license_identifier: str | None
+    """SPDX license ID parsed from the file if file_location equals SOURCE_TREE
+    or BOTH and an identifier was found; otherwise None."""
+    spdx_id_generator: SpdxIdGenerator
+    """Generator for the SPDX ID of the file element."""
+
+    _spdx_file_element: File | None = None
+    """Cached SPDX File element; built on first access of spdx_file_element."""
+
+    @classmethod
+    def create(
+        cls,
+        absolute_path: PathStr,
+        obj_tree: PathStr,
+        src_tree: PathStr,
+        spdx_id_generators: SpdxIdGeneratorCollection,
+        is_output: bool,
+    ) -> "KernelFile":
+        """
+        Classify a file against the source/object trees and collect its metadata.
+
+        Args:
+            absolute_path: Absolute path of the file.
+            obj_tree: Path of the object tree.
+            src_tree: Path of the source tree.
+            spdx_id_generators: Collection providing the source, build and output SPDX ID generators.
+            is_output: Whether the file is a build output. For files in the object tree
+                (or in a combined source/object tree) this selects the output generator
+                instead of the build generator.
+
+        Returns:
+            KernelFile with location, element name, license identifier and SPDX ID generator set.
+        """
+        is_in_obj_tree = is_relative_to(absolute_path, obj_tree)
+        is_in_src_tree = is_relative_to(absolute_path, src_tree)
+
+        # file element name should be relative to output or src tree if possible
+        if not is_in_src_tree and not is_in_obj_tree:
+            file_element_name = str(absolute_path)
+            file_location = KernelFileLocation.EXTERNAL
+            spdx_id_generator = spdx_id_generators.build
+        elif is_in_src_tree and src_tree == obj_tree:
+            file_element_name = os.path.relpath(absolute_path, obj_tree)
+            file_location = KernelFileLocation.BOTH
+            spdx_id_generator = spdx_id_generators.output if is_output else spdx_id_generators.build
+        elif is_in_obj_tree:
+            # Checked before the plain source-tree case, so a file that is inside both
+            # trees (with distinct tree paths) is classified as OBJ_TREE.
+            file_element_name = os.path.relpath(absolute_path, obj_tree)
+            file_location = KernelFileLocation.OBJ_TREE
+            spdx_id_generator = spdx_id_generators.output if is_output else spdx_id_generators.build
+        else:
+            file_element_name = os.path.relpath(absolute_path, src_tree)
+            file_location = KernelFileLocation.SOURCE_TREE
+            spdx_id_generator = spdx_id_generators.source
+
+        # Only files that can be source files are scanned for a license identifier.
+        license_identifier = (
+            _parse_spdx_license_identifier(absolute_path)
+            if file_location in (KernelFileLocation.SOURCE_TREE, KernelFileLocation.BOTH)
+            else None
+        )
+
+        # Use cls rather than the hard-coded class name so subclasses get their own type.
+        return cls(
+            absolute_path,
+            file_location,
+            file_element_name,
+            license_identifier,
+            spdx_id_generator,
+        )
+
+    @property
+    def spdx_file_element(self) -> File:
+        """SPDX File element for this file, built on first access and cached afterwards."""
+        if self._spdx_file_element is None:
+            # The SPDX ID is generated here, i.e. only for files whose element is requested.
+            self._spdx_file_element = _build_file_element(
+                self.absolute_path,
+                self.name,
+                self.spdx_id_generator.generate(),
+                self.file_location,
+            )
+        return self._spdx_file_element
+
+
+@dataclass
+class KernelFileCollection:
+    """Collection of kernel files, keyed by absolute path and split into three buckets."""
+
+    source: dict[PathStr, KernelFile]
+    """Non-root files whose file_location is SOURCE_TREE."""
+    build: dict[PathStr, KernelFile]
+    """All remaining non-root files (object tree, combined tree, or external)."""
+    output: dict[PathStr, KernelFile]
+    """Files that are roots of the command graph."""
+
+    @classmethod
+    def create(
+        cls,
+        cmd_graph: CmdGraph,
+        obj_tree: PathStr,
+        src_tree: PathStr,
+        spdx_id_generators: SpdxIdGeneratorCollection,
+    ) -> "KernelFileCollection":
+        """
+        Create a KernelFile for every node of the command graph and sort it into a bucket.
+
+        Args:
+            cmd_graph: Command graph whose nodes provide the absolute file paths.
+            obj_tree: Path of the object tree.
+            src_tree: Path of the source tree.
+            spdx_id_generators: Collection providing the source, build and output SPDX ID generators.
+
+        Returns:
+            KernelFileCollection with each file in exactly one of source, build or output.
+        """
+        source: dict[PathStr, KernelFile] = {}
+        build: dict[PathStr, KernelFile] = {}
+        output: dict[PathStr, KernelFile] = {}
+        # Set of root paths, so the per-node root check is a constant-time lookup.
+        root_node_paths = {node.absolute_path for node in cmd_graph.roots}
+        for node in cmd_graph:
+            is_root = node.absolute_path in root_node_paths
+            kernel_file = KernelFile.create(
+                node.absolute_path,
+                obj_tree,
+                src_tree,
+                spdx_id_generators,
+                is_root,
+            )
+            # Root nodes always go to the output bucket, regardless of their location.
+            if is_root:
+                output[kernel_file.absolute_path] = kernel_file
+            elif kernel_file.file_location == KernelFileLocation.SOURCE_TREE:
+                source[kernel_file.absolute_path] = kernel_file
+            else:
+                build[kernel_file.absolute_path] = kernel_file
+
+        return cls(source, build, output)
+
+    def to_dict(self) -> dict[PathStr, KernelFile]:
+        """Return one dictionary with all files; on duplicate keys output wins over build, build over source."""
+        return {**self.source, **self.build, **self.output}
+
+
+def _build_file_element(absolute_path: PathStr, name: str, spdx_id: SpdxId, file_location: KernelFileLocation) -> File:
+    """
+    Build the SPDX File element for a single file.
+
+    Args:
+        absolute_path: Absolute path of the file.
+        name: Name to store in the file element.
+        spdx_id: SPDX ID of the file element.
+        file_location: Location of the file; decides the log level if the file is missing.
+
+    Returns:
+        File element with the SHA-256 hash and gitoid content identifier set if the
+        file exists, and with empty hash/identifier lists otherwise.
+    """
+    hashes: list[Hash] = []
+    gitoids: list[ContentIdentifier] = []
+    if os.path.exists(absolute_path):
+        hashes.append(Hash(algorithm="sha256", hashValue=_sha256(absolute_path)))
+        gitoids.append(
+            ContentIdentifier(
+                software_contentIdentifierType="gitoid",
+                software_contentIdentifierValue=_git_blob_oid(absolute_path),
+            )
+        )
+    else:
+        # A missing external file is only a warning; a missing file inside the trees is an error.
+        report = sbom_logging.warning if file_location == KernelFileLocation.EXTERNAL else sbom_logging.error
+        report(
+            "Cannot compute hash for {absolute_path} because file does not exist.",
+            absolute_path=absolute_path,
+        )
+
+    return File(
+        spdxId=spdx_id,
+        name=name,
+        verifiedUsing=hashes,
+        software_primaryPurpose=_get_primary_purpose(absolute_path),
+        software_contentIdentifier=gitoids,
+    )
+
+
+def _sha256(path: PathStr) -> str:
+    """
+    Compute the SHA-256 hash of a file.
+
+    The file is hashed in fixed-size chunks so that large build artifacts
+    do not have to be held in memory as a whole.
+
+    Args:
+        path: Path to the file.
+
+    Returns:
+        SHA-256 digest of the file content as a hex string.
+    """
+    digest = hashlib.sha256()
+    with open(path, "rb") as f:
+        while chunk := f.read(1 << 20):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _git_blob_oid(file_path: str) -> str:
+    """
+    Compute the Git blob object ID (SHA-1) for a file, like `git hash-object`.
+
+    The blob header needs the content length up front, so the size is taken
+    from the open file descriptor and the content is then hashed in chunks
+    instead of being read into memory as a whole.
+
+    Args:
+        file_path: Path to the file.
+
+    Returns:
+        SHA-1 hash (hex) of the Git blob object.
+    """
+    with open(file_path, "rb") as f:
+        size = os.fstat(f.fileno()).st_size
+        # SHA-1 is used as a content identifier here, not for security purposes.
+        digest = hashlib.sha1(f"blob {size}\0".encode(), usedforsecurity=False)
+        while chunk := f.read(1 << 20):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+# REUSE-IgnoreStart
+SPDX_LICENSE_IDENTIFIER_PATTERN = re.compile(r"SPDX-License-Identifier:\s*(?P<id>.*?)(?:\s*(\*/|$))")
+# REUSE-IgnoreEnd
+
+
+def _parse_spdx_license_identifier(absolute_path: str, max_lines: int = 5) -> str | None:
+    """
+    Extracts the SPDX-License-Identifier from the first few lines of a source file.
+
+    Args:
+        absolute_path: Path to the source file.
+        max_lines: Number of lines to scan from the top (default: 5).
+
+    Returns:
+        The license identifier string (e.g., 'GPL-2.0-only') if found, otherwise None.
+        None is also returned if the file cannot be opened or is not valid UTF-8.
+    """
+    try:
+        # Decode explicitly as UTF-8 so the result does not depend on the locale's
+        # preferred encoding of the machine running the tool.
+        with open(absolute_path, "r", encoding="utf-8") as f:
+            # zip stops at max_lines or at end of file, whichever comes first.
+            for _, line in zip(range(max_lines), f):
+                match = SPDX_LICENSE_IDENTIFIER_PATTERN.search(line)
+                if match:
+                    return match.group("id")
+    except (UnicodeDecodeError, OSError):
+        return None
+    return None
+
+
+def _get_primary_purpose(absolute_path: PathStr) -> SoftwarePurpose | None:
+    """
+    Estimate the primary purpose of a file from its path.
+
+    The rules are checked in order and the first match wins. Most rules match
+    on the end of the path; the data rule additionally matches on a path segment.
+
+    Args:
+        absolute_path: Absolute path of the file.
+
+    Returns:
+        The inferred purpose, or None (after logging a warning) if no rule matches.
+    """
+    # Ordered (path endings, purpose) rules; the order is significant.
+    suffix_rules = (
+        # Source code
+        ((".c", ".h", ".S", ".s", ".rs", ".pl"), "source"),
+        # Libraries
+        ((".a", ".so", ".rlib"), "library"),
+        # Archives
+        ((".xz", ".cpio", ".gz", ".tar", ".zip"), "archive"),
+        # Applications
+        (("bzImage", "Image"), "application"),
+        # Executables / machine code
+        ((".bin", ".elf", "vmlinux", "vmlinux.unstripped", "bpfilter_umh"), "executable"),
+        # Kernel modules
+        ((".ko",), "module"),
+        # Data files
+        (
+            (
+                ".tbl",
+                ".relocs",
+                ".rmeta",
+                ".in",
+                ".dbg",
+                ".x509",
+                ".pbm",
+                ".ppm",
+                ".dtb",
+                ".uc",
+                ".inc",
+                ".dts",
+                ".dtsi",
+                ".dtbo",
+                ".xml",
+                ".ro",
+                "initramfs_inc_data",
+                "default_cpio_list",
+                "x509_certificate_list",
+                "utf8data.c_shipped",
+                "blacklist_hash_list",
+                "x509_revocation_list",
+                "cpucaps",
+                "sysreg",
+            ),
+            "data",
+        ),
+        # Configuration files
+        ((".pem", ".key", ".conf", ".config", ".cfg", ".bconf"), "configuration"),
+        # Documentation
+        ((".md",), "documentation"),
+        # Other / miscellaneous
+        ((".o", ".tmp"), "other"),
+    )
+    # Path segments that mark a file as data wherever they occur in the path.
+    data_path_segments = ("drivers/gpu/drm/radeon/reg_srcs/",)
+
+    for suffixes, purpose in suffix_rules:
+        if absolute_path.endswith(suffixes):
+            return purpose
+        if purpose == "data" and any(segment in absolute_path for segment in data_path_segments):
+            return purpose
+
+    sbom_logging.warning("Could not infer primary purpose for {absolute_path}", absolute_path=absolute_path)
+    return None
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ