lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20251219181629.1123823-3-sashal@kernel.org>
Date: Fri, 19 Dec 2025 13:16:26 -0500
From: Sasha Levin <sashal@...nel.org>
To: tools@...nel.org
Cc: linux-kernel@...r.kernel.org,
	torvalds@...ux-foundation.org,
	broonie@...nel.org,
	Sasha Levin <sashal@...nel.org>
Subject: [RFC 2/5] LLMinus: Add vectorize command with fastembed

Add the 'vectorize' command that generates embeddings for stored
conflict resolutions using the BGE-small-en-v1.5 model via fastembed.

Key features:
- Uses fastembed v5 for local embedding generation
- BGE-small model produces 384-dimensional vectors
- Batch processing with configurable batch size (-b flag)
- Incremental saves after each batch for crash recovery
- Skips resolutions that already have embeddings
- Progress reporting during vectorization

This enables RAG-based similarity search for finding historical
conflict resolutions that are similar to current merge conflicts.

Also adds:
- cosine_similarity() function for vector comparison
- init_embedding_model() helper for model initialization
- Tests for vectorize command parsing and cosine_similarity

Signed-off-by: Sasha Levin <sashal@...nel.org>
---
 tools/llminus/Cargo.toml  |   1 +
 tools/llminus/src/main.rs | 157 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 158 insertions(+)

diff --git a/tools/llminus/Cargo.toml b/tools/llminus/Cargo.toml
index bdb42561a0565..86740174de598 100644
--- a/tools/llminus/Cargo.toml
+++ b/tools/llminus/Cargo.toml
@@ -10,6 +10,7 @@ repository = "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
 [dependencies]
 anyhow = "1"
 clap = { version = "4", features = ["derive"] }
+fastembed = "5"
 rayon = "1"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
diff --git a/tools/llminus/src/main.rs b/tools/llminus/src/main.rs
index 1c61836cc93f7..32a578030b0e3 100644
--- a/tools/llminus/src/main.rs
+++ b/tools/llminus/src/main.rs
@@ -2,6 +2,7 @@
 
 use anyhow::{bail, Context, Result};
 use clap::{Parser, Subcommand};
+use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
@@ -26,6 +27,12 @@ enum Commands {
         /// Git revision range (e.g., "v6.0..v6.1"). If not specified, learns from entire history.
         range: Option<String>,
     },
+    /// Generate embeddings for stored resolutions (for RAG similarity search)
+    Vectorize {
+        /// Batch size for embedding generation (default: 64)
+        #[arg(short, long, default_value = "64")]
+        batch_size: usize,
+    },
 }
 
 /// A single diff hunk representing a change region
@@ -483,11 +490,118 @@ fn learn(range: Option<&str>) -> Result<()> {
     Ok(())
 }
 
/// Compute cosine similarity between two vectors.
///
/// Returns a value in [-1.0, 1.0]; mismatched lengths, empty input, or a
/// zero-magnitude vector all yield 0.0.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // No meaningful similarity for empty or differently-sized vectors.
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared magnitudes.
    let mut dot = 0.0f32;
    let mut sq_a = 0.0f32;
    let mut sq_b = 0.0f32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }

    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    // A zero vector has no direction, so similarity is defined as 0.
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}
+
+/// Initialize the BGE-small embedding model
+fn init_embedding_model() -> Result<TextEmbedding> {
+    TextEmbedding::try_new(
+        InitOptions::new(EmbeddingModel::BGESmallENV15)
+            .with_show_download_progress(true),
+    ).context("Failed to initialize embedding model")
+}
+
+fn vectorize(batch_size: usize) -> Result<()> {
+    let store_path = Path::new(STORE_PATH);
+
+    if !store_path.exists() {
+        bail!("No resolutions found. Run 'llminus learn' first.");
+    }
+
+    let mut store = ResolutionStore::load(store_path)?;
+
+    // Count how many need embeddings
+    let need_embedding: Vec<usize> = store
+        .resolutions
+        .iter()
+        .enumerate()
+        .filter(|(_, r)| r.embedding.is_none())
+        .map(|(i, _)| i)
+        .collect();
+
+    if need_embedding.is_empty() {
+        println!("All {} resolutions already have embeddings.", store.resolutions.len());
+        return Ok(());
+    }
+
+    println!("Found {} resolutions needing embeddings", need_embedding.len());
+    println!("Initializing embedding model (BGE-small-en, ~33MB download on first run)...");
+
+    // Initialize the embedding model
+    let mut model = init_embedding_model()?;
+
+    println!("Model loaded. Generating embeddings...\n");
+
+    // Process in batches
+    let total_batches = (need_embedding.len() + batch_size - 1) / batch_size;
+
+    for (batch_num, chunk) in need_embedding.chunks(batch_size).enumerate() {
+        // Collect texts for this batch
+        let texts: Vec<String> = chunk
+            .iter()
+            .map(|&i| store.resolutions[i].to_embedding_text())
+            .collect();
+
+        // Generate embeddings
+        let embeddings = model
+            .embed(texts, None)
+            .context("Failed to generate embeddings")?;
+
+        // Assign embeddings back to resolutions
+        for (j, &idx) in chunk.iter().enumerate() {
+            store.resolutions[idx].embedding = Some(embeddings[j].clone());
+        }
+
+        // Progress report
+        let done = (batch_num + 1) * batch_size.min(chunk.len());
+        let pct = (done as f64 / need_embedding.len() as f64 * 100.0).min(100.0);
+        println!(
+            "  Batch {}/{}: {:.1}% ({}/{})",
+            batch_num + 1,
+            total_batches,
+            pct,
+            done.min(need_embedding.len()),
+            need_embedding.len()
+        );
+
+        // Save after each batch (incremental progress)
+        store.save(store_path)?;
+    }
+
+    // Final stats
+    let json_size = std::fs::metadata(store_path).map(|m| m.len()).unwrap_or(0);
+    let with_embeddings = store.resolutions.iter().filter(|r| r.embedding.is_some()).count();
+
+    println!("\nResults:");
+    println!("  Total resolutions: {}", store.resolutions.len());
+    println!("  With embeddings: {}", with_embeddings);
+    println!("  Embedding dimensions: 384");
+    println!("  Output size: {:.2} MB", json_size as f64 / 1024.0 / 1024.0);
+    println!("\nEmbeddings saved to: {}", store_path.display());
+
+    Ok(())
+}
+
 fn main() -> Result<()> {
     let cli = Cli::parse();
 
     match cli.command {
         Commands::Learn { range } => learn(range.as_deref()),
+        Commands::Vectorize { batch_size } => vectorize(batch_size),
     }
 }
 
@@ -508,6 +622,7 @@ fn test_learn_command_parses() {
         let cli = Cli::try_parse_from(["llminus", "learn"]).unwrap();
         match cli.command {
             Commands::Learn { range } => assert!(range.is_none()),
+            _ => panic!("Expected Learn command"),
         }
     }
 
@@ -516,9 +631,51 @@ fn test_learn_command_with_range() {
         let cli = Cli::try_parse_from(["llminus", "learn", "v6.0..v6.1"]).unwrap();
         match cli.command {
             Commands::Learn { range } => assert_eq!(range, Some("v6.0..v6.1".to_string())),
+            _ => panic!("Expected Learn command"),
         }
     }
 
+    #[test]
+    fn test_vectorize_command_parses() {
+        let cli = Cli::try_parse_from(["llminus", "vectorize"]).unwrap();
+        match cli.command {
+            Commands::Vectorize { batch_size } => assert_eq!(batch_size, 64),
+            _ => panic!("Expected Vectorize command"),
+        }
+    }
+
+    #[test]
+    fn test_vectorize_command_with_batch_size() {
+        let cli = Cli::try_parse_from(["llminus", "vectorize", "-b", "128"]).unwrap();
+        match cli.command {
+            Commands::Vectorize { batch_size } => assert_eq!(batch_size, 128),
+            _ => panic!("Expected Vectorize command"),
+        }
+    }
+
+    #[test]
+    fn test_cosine_similarity() {
+        // Identical vectors should have similarity 1.0
+        let a = vec![1.0, 0.0, 0.0];
+        let b = vec![1.0, 0.0, 0.0];
+        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.0001);
+
+        // Orthogonal vectors should have similarity 0.0
+        let a = vec![1.0, 0.0, 0.0];
+        let b = vec![0.0, 1.0, 0.0];
+        assert!((cosine_similarity(&a, &b) - 0.0).abs() < 0.0001);
+
+        // Opposite vectors should have similarity -1.0
+        let a = vec![1.0, 0.0, 0.0];
+        let b = vec![-1.0, 0.0, 0.0];
+        assert!((cosine_similarity(&a, &b) - (-1.0)).abs() < 0.0001);
+
+        // Different length vectors return 0
+        let a = vec![1.0, 0.0];
+        let b = vec![1.0, 0.0, 0.0];
+        assert_eq!(cosine_similarity(&a, &b), 0.0);
+    }
+
     #[test]
     fn test_get_file_type() {
         assert_eq!(get_file_type("foo/bar.c"), "c");
-- 
2.51.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ