Message-ID: <20251219181629.1123823-3-sashal@kernel.org>
Date: Fri, 19 Dec 2025 13:16:26 -0500
From: Sasha Levin <sashal@...nel.org>
To: tools@...nel.org
Cc: linux-kernel@...r.kernel.org,
torvalds@...ux-foundation.org,
broonie@...nel.org,
Sasha Levin <sashal@...nel.org>
Subject: [RFC 2/5] LLMinus: Add vectorize command with fastembed

Add the 'vectorize' command that generates embeddings for stored
conflict resolutions using the BGE-small-en-v1.5 model via fastembed.

Key features:
- Uses fastembed v5 for local embedding generation
- BGE-small model produces 384-dimensional vectors
- Batch processing with configurable batch size (-b flag; see the
  example invocation below)
- Incremental saves after each batch for crash recovery
- Skips resolutions that already have embeddings
- Progress reporting during vectorization
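
Example invocation (the -b value here is arbitrary; it defaults to 64):

  $ llminus learn v6.0..v6.1
  $ llminus vectorize -b 128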

This enables RAG-based similarity search for finding historical
conflict resolutions that are similar to current merge conflicts.
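
A rough sketch of how a later change might consume these embeddings;
the top_k() helper and the query flow are illustrative only (not added
by this patch), building on cosine_similarity() and ResolutionStore
from the diff below:

  /// Indices of the k stored resolutions most similar to `query`,
  /// assuming embeddings have already been generated.
  fn top_k(store: &ResolutionStore, query: &[f32], k: usize) -> Vec<usize> {
      let mut scored: Vec<(usize, f32)> = store
          .resolutions
          .iter()
          .enumerate()
          .filter_map(|(i, r)| {
              // Skip resolutions that have not been vectorized yet.
              r.embedding.as_ref().map(|e| (i, cosine_similarity(e, query)))
          })
          .collect();
      // Highest similarity first; embeddings are finite, so
      // partial_cmp() is safe to unwrap here.
      scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
      scored.into_iter().take(k).map(|(i, _)| i).collect()
  }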

Also adds:
- cosine_similarity() function for vector comparison
- init_embedding_model() helper for model initialization
- Tests for vectorize command parsing and cosine_similarity

Signed-off-by: Sasha Levin <sashal@...nel.org>
---
tools/llminus/Cargo.toml | 1 +
tools/llminus/src/main.rs | 157 ++++++++++++++++++++++++++++++++++++++
2 files changed, 158 insertions(+)

diff --git a/tools/llminus/Cargo.toml b/tools/llminus/Cargo.toml
index bdb42561a0565..86740174de598 100644
--- a/tools/llminus/Cargo.toml
+++ b/tools/llminus/Cargo.toml
@@ -10,6 +10,7 @@ repository = "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
[dependencies]
anyhow = "1"
clap = { version = "4", features = ["derive"] }
+fastembed = "5"
rayon = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
diff --git a/tools/llminus/src/main.rs b/tools/llminus/src/main.rs
index 1c61836cc93f7..32a578030b0e3 100644
--- a/tools/llminus/src/main.rs
+++ b/tools/llminus/src/main.rs
@@ -2,6 +2,7 @@
use anyhow::{bail, Context, Result};
use clap::{Parser, Subcommand};
+use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
@@ -26,6 +27,12 @@ enum Commands {
/// Git revision range (e.g., "v6.0..v6.1"). If not specified, learns from entire history.
range: Option<String>,
},
+ /// Generate embeddings for stored resolutions (for RAG similarity search)
+ Vectorize {
+ /// Batch size for embedding generation (default: 64)
+ #[arg(short, long, default_value = "64")]
+ batch_size: usize,
+ },
}
/// A single diff hunk representing a change region
@@ -483,11 +490,118 @@ fn learn(range: Option<&str>) -> Result<()> {
Ok(())
}
+/// Compute cosine similarity between two vectors
+fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
+ if a.len() != b.len() || a.is_empty() {
+ return 0.0;
+ }
+
+ let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
+ let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
+ let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
+
+ if norm_a == 0.0 || norm_b == 0.0 {
+ return 0.0;
+ }
+
+ dot / (norm_a * norm_b)
+}
+
+/// Initialize the BGE-small embedding model
+fn init_embedding_model() -> Result<TextEmbedding> {
+ TextEmbedding::try_new(
+ InitOptions::new(EmbeddingModel::BGESmallENV15)
+ .with_show_download_progress(true),
+ ).context("Failed to initialize embedding model")
+}
+
+fn vectorize(batch_size: usize) -> Result<()> {
+ let store_path = Path::new(STORE_PATH);
+
+ if !store_path.exists() {
+ bail!("No resolutions found. Run 'llminus learn' first.");
+ }
+
+ let mut store = ResolutionStore::load(store_path)?;
+
+ // Count how many need embeddings
+ let need_embedding: Vec<usize> = store
+ .resolutions
+ .iter()
+ .enumerate()
+ .filter(|(_, r)| r.embedding.is_none())
+ .map(|(i, _)| i)
+ .collect();
+
+ if need_embedding.is_empty() {
+ println!("All {} resolutions already have embeddings.", store.resolutions.len());
+ return Ok(());
+ }
+
+ println!("Found {} resolutions needing embeddings", need_embedding.len());
+ println!("Initializing embedding model (BGE-small-en, ~33MB download on first run)...");
+
+ // Initialize the embedding model
+ let mut model = init_embedding_model()?;
+
+ println!("Model loaded. Generating embeddings...\n");
+
+ // Process in batches
+ let total_batches = need_embedding.len().div_ceil(batch_size);
+
+ for (batch_num, chunk) in need_embedding.chunks(batch_size).enumerate() {
+ // Collect texts for this batch
+ let texts: Vec<String> = chunk
+ .iter()
+ .map(|&i| store.resolutions[i].to_embedding_text())
+ .collect();
+
+ // Generate embeddings
+ let embeddings = model
+ .embed(texts, None)
+ .context("Failed to generate embeddings")?;
+
+ // Assign embeddings back to resolutions
+ for (j, &idx) in chunk.iter().enumerate() {
+ store.resolutions[idx].embedding = Some(embeddings[j].clone());
+ }
+
+ // Progress report
+ let done = batch_num * batch_size + chunk.len();
+ let pct = done as f64 / need_embedding.len() as f64 * 100.0;
+ println!(
+ " Batch {}/{}: {:.1}% ({}/{})",
+ batch_num + 1,
+ total_batches,
+ pct,
+ done,
+ need_embedding.len()
+ );
+
+ // Save after each batch (incremental progress)
+ store.save(store_path)?;
+ }
+
+ // Final stats
+ let json_size = std::fs::metadata(store_path).map(|m| m.len()).unwrap_or(0);
+ let with_embeddings = store.resolutions.iter().filter(|r| r.embedding.is_some()).count();
+
+ println!("\nResults:");
+ println!(" Total resolutions: {}", store.resolutions.len());
+ println!(" With embeddings: {}", with_embeddings);
+ println!(" Embedding dimensions: 384");
+ println!(" Output size: {:.2} MB", json_size as f64 / 1024.0 / 1024.0);
+ println!("\nEmbeddings saved to: {}", store_path.display());
+
+ Ok(())
+}
+
fn main() -> Result<()> {
let cli = Cli::parse();
match cli.command {
Commands::Learn { range } => learn(range.as_deref()),
+ Commands::Vectorize { batch_size } => vectorize(batch_size),
}
}
@@ -508,6 +622,7 @@ fn test_learn_command_parses() {
let cli = Cli::try_parse_from(["llminus", "learn"]).unwrap();
match cli.command {
Commands::Learn { range } => assert!(range.is_none()),
+ _ => panic!("Expected Learn command"),
}
}
@@ -516,9 +631,51 @@ fn test_learn_command_with_range() {
let cli = Cli::try_parse_from(["llminus", "learn", "v6.0..v6.1"]).unwrap();
match cli.command {
Commands::Learn { range } => assert_eq!(range, Some("v6.0..v6.1".to_string())),
+ _ => panic!("Expected Learn command"),
}
}
+ #[test]
+ fn test_vectorize_command_parses() {
+ let cli = Cli::try_parse_from(["llminus", "vectorize"]).unwrap();
+ match cli.command {
+ Commands::Vectorize { batch_size } => assert_eq!(batch_size, 64),
+ _ => panic!("Expected Vectorize command"),
+ }
+ }
+
+ #[test]
+ fn test_vectorize_command_with_batch_size() {
+ let cli = Cli::try_parse_from(["llminus", "vectorize", "-b", "128"]).unwrap();
+ match cli.command {
+ Commands::Vectorize { batch_size } => assert_eq!(batch_size, 128),
+ _ => panic!("Expected Vectorize command"),
+ }
+ }
+
+ #[test]
+ fn test_cosine_similarity() {
+ // Identical vectors should have similarity 1.0
+ let a = vec![1.0, 0.0, 0.0];
+ let b = vec![1.0, 0.0, 0.0];
+ assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.0001);
+
+ // Orthogonal vectors should have similarity 0.0
+ let a = vec![1.0, 0.0, 0.0];
+ let b = vec![0.0, 1.0, 0.0];
+ assert!((cosine_similarity(&a, &b) - 0.0).abs() < 0.0001);
+
+ // Opposite vectors should have similarity -1.0
+ let a = vec![1.0, 0.0, 0.0];
+ let b = vec![-1.0, 0.0, 0.0];
+ assert!((cosine_similarity(&a, &b) - (-1.0)).abs() < 0.0001);
+
+ // Different length vectors return 0
+ let a = vec![1.0, 0.0];
+ let b = vec![1.0, 0.0, 0.0];
+ assert_eq!(cosine_similarity(&a, &b), 0.0);
+ }
+
#[test]
fn test_get_file_type() {
assert_eq!(get_file_type("foo/bar.c"), "c");
--
2.51.0