feat: add Ollama integration for semantic file indexing
Implements the AI-powered indexing component of RFC 0010:

- Add indexer module with LlmProvider abstraction
- Integrate qwen2.5:3b via Ollama for local file analysis
- Extract summaries, relationships, and symbols from source files
- Support partial indexing for files >1000 lines
- Wire indexer to all CLI index commands (--all, --diff, --file, --refresh)

The indexer generates structured YAML output with:

- One-sentence file summaries
- Relationship descriptions for semantic search
- Symbol-level indexing with line numbers

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
cf0baa0ea0
commit
d77ea4ba3f
5 changed files with 698 additions and 72 deletions
|
|
@ -314,14 +314,14 @@ blue impact src/domain.rs
|
|||
|
||||
- [x] Add schema to blue.db (file_index, symbol_index, FTS5 tables)
|
||||
- [x] Create versioned indexing prompt for structured YAML extraction
|
||||
- [ ] Implement Ollama integration with qwen2.5:3b default
|
||||
- [x] Implement Ollama integration with qwen2.5:3b default
|
||||
- [x] Implement `blue index --all` for bootstrap
|
||||
- [x] Implement `blue index --diff` for staged files
|
||||
- [x] Implement `blue index --file` for single-file updates
|
||||
- [x] Implement `blue index --install-hook` for git hook setup
|
||||
- [x] Implement `blue index --refresh` for stale entry updates
|
||||
- [x] Implement `blue index status` for freshness reporting
|
||||
- [ ] Add large file handling (>1000 lines warning)
|
||||
- [x] Add large file handling (>1000 lines warning)
|
||||
- [x] Implement `blue search` with FTS5 backend
|
||||
- [x] Implement `blue impact` for dependency queries
|
||||
- [x] Add MCP tools (5 tools)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ path = "src/main.rs"
|
|||
[dependencies]
|
||||
blue-core.workspace = true
|
||||
blue-mcp.workspace = true
|
||||
blue-ollama.workspace = true
|
||||
clap.workspace = true
|
||||
anyhow.workspace = true
|
||||
tokio.workspace = true
|
||||
|
|
|
|||
|
|
@ -1530,6 +1530,8 @@ async fn detect_ollama_model() -> Option<String> {
|
|||
|
||||
async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
||||
use blue_core::store::DocumentStore;
|
||||
use blue_core::{Indexer, IndexerConfig, is_indexable_file, LocalLlmConfig};
|
||||
use blue_ollama::OllamaLlm;
|
||||
use std::path::Path;
|
||||
|
||||
// Get the .blue database path
|
||||
|
|
@ -1545,21 +1547,64 @@ async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
|||
|
||||
match command {
|
||||
IndexCommands::All { path, model } => {
|
||||
let target = path.as_deref().unwrap_or(".");
|
||||
let target_path = path.as_deref().unwrap_or(".");
|
||||
let model_name = model.as_deref().unwrap_or("qwen2.5:3b");
|
||||
|
||||
println!("Indexing all files in '{}' with model '{}'...", target, model_name);
|
||||
println!("(Full indexing requires Ollama running with the model pulled)");
|
||||
// Collect all indexable files
|
||||
let files = collect_indexable_files(Path::new(target_path))?;
|
||||
println!("Found {} indexable files in '{}'", files.len(), target_path);
|
||||
|
||||
// For now, show what would be indexed
|
||||
let count = count_indexable_files(Path::new(target))?;
|
||||
println!("Found {} indexable files.", count);
|
||||
println!("\nTo complete indexing:");
|
||||
println!(" 1. Ensure Ollama is running: ollama serve");
|
||||
println!(" 2. Pull the model: ollama pull {}", model_name);
|
||||
println!(" 3. Run this command again");
|
||||
if files.is_empty() {
|
||||
println!("No files to index.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// TODO: Implement actual indexing with Ollama integration
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true, // Use existing Ollama instance
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(e) = llm.start() {
|
||||
println!("Ollama not available: {}", e);
|
||||
println!("\nTo index files:");
|
||||
println!(" 1. Start Ollama: ollama serve");
|
||||
println!(" 2. Pull the model: ollama pull {}", model_name);
|
||||
println!(" 3. Run this command again");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Indexing with model '{}'...\n", model_name);
|
||||
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
let mut indexed = 0;
|
||||
let mut errors = 0;
|
||||
|
||||
for file_path in &files {
|
||||
let path = Path::new(file_path);
|
||||
print!(" {} ... ", file_path);
|
||||
|
||||
match indexer.index_and_store(path, &store) {
|
||||
Ok(result) => {
|
||||
let partial = if result.is_partial { " (partial)" } else { "" };
|
||||
println!("{} symbols{}", result.symbols.len(), partial);
|
||||
indexed += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error: {}", e);
|
||||
errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!("\nIndexed {} files ({} errors)", indexed, errors);
|
||||
}
|
||||
|
||||
IndexCommands::Diff { model } => {
|
||||
|
|
@ -1570,41 +1615,109 @@ async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
|||
.args(["diff", "--cached", "--name-only"])
|
||||
.output()?;
|
||||
|
||||
let staged_files: Vec<&str> = std::str::from_utf8(&output.stdout)?
|
||||
let staged_files: Vec<String> = std::str::from_utf8(&output.stdout)?
|
||||
.lines()
|
||||
.filter(|l| !l.is_empty())
|
||||
.filter(|l| is_indexable_file(Path::new(l)))
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
|
||||
if staged_files.is_empty() {
|
||||
println!("No staged files to index.");
|
||||
println!("No indexable staged files.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Indexing {} staged file(s) with '{}'...", staged_files.len(), model_name);
|
||||
for file in &staged_files {
|
||||
println!(" {}", file);
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(_) = llm.start() {
|
||||
// Silently skip if Ollama not available (pre-commit hook shouldn't block)
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// TODO: Implement actual indexing
|
||||
println!("Indexing {} staged file(s)...", staged_files.len());
|
||||
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
for file_path in &staged_files {
|
||||
let path = Path::new(file_path);
|
||||
if path.exists() {
|
||||
match indexer.index_and_store(path, &store) {
|
||||
Ok(result) => {
|
||||
println!(" {} - {} symbols", file_path, result.symbols.len());
|
||||
}
|
||||
Err(e) => {
|
||||
println!(" {} - error: {}", file_path, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IndexCommands::File { path, model } => {
|
||||
let model_name = model.as_deref().unwrap_or("qwen2.5:3b");
|
||||
let file_path = Path::new(&path);
|
||||
|
||||
if !Path::new(&path).exists() {
|
||||
if !file_path.exists() {
|
||||
println!("File not found: {}", path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(e) = llm.start() {
|
||||
println!("Ollama not available: {}", e);
|
||||
println!("\nStart Ollama first: ollama serve");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Indexing '{}' with '{}'...", path, model_name);
|
||||
|
||||
// TODO: Implement single file indexing
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
match indexer.index_and_store(file_path, &store) {
|
||||
Ok(result) => {
|
||||
println!("\nSummary: {}", result.summary.unwrap_or_default());
|
||||
if let Some(rel) = &result.relationships {
|
||||
println!("\nRelationships:\n{}", rel);
|
||||
}
|
||||
println!("\nSymbols ({}):", result.symbols.len());
|
||||
for sym in &result.symbols {
|
||||
let lines = match (sym.start_line, sym.end_line) {
|
||||
(Some(s), Some(e)) => format!(" (lines {}-{})", s, e),
|
||||
(Some(s), None) => format!(" (line {})", s),
|
||||
_ => String::new(),
|
||||
};
|
||||
println!(" {} ({}){}", sym.name, sym.kind, lines);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IndexCommands::Refresh { model } => {
|
||||
let model_name = model.as_deref().unwrap_or("qwen2.5:3b");
|
||||
|
||||
// Get current realm (default to "default" for single-repo)
|
||||
let realm = "default";
|
||||
|
||||
let (file_count, symbol_count) = store.get_index_stats(realm)?;
|
||||
|
|
@ -1615,10 +1728,67 @@ async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
|||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Checking for stale entries...");
|
||||
println!("(Refresh with model '{}')", model_name);
|
||||
// Get all indexed files and check which are stale
|
||||
let indexed_files = store.list_file_index(realm, None)?;
|
||||
let mut stale_files = Vec::new();
|
||||
|
||||
// TODO: Implement refresh logic - compare hashes
|
||||
for entry in &indexed_files {
|
||||
let path = Path::new(&entry.file_path);
|
||||
if path.exists() {
|
||||
if let Ok(content) = std::fs::read_to_string(path) {
|
||||
let current_hash = hash_file_content(&content);
|
||||
if current_hash != entry.file_hash {
|
||||
stale_files.push(entry.file_path.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if stale_files.is_empty() {
|
||||
println!("All indexed files are up to date.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Found {} stale file(s)", stale_files.len());
|
||||
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(e) = llm.start() {
|
||||
println!("Ollama not available: {}", e);
|
||||
println!("\nStale files:");
|
||||
for f in &stale_files {
|
||||
println!(" {}", f);
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Re-indexing stale files with '{}'...\n", model_name);
|
||||
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
for file_path in &stale_files {
|
||||
let path = Path::new(file_path);
|
||||
print!(" {} ... ", file_path);
|
||||
|
||||
match indexer.index_and_store(path, &store) {
|
||||
Ok(result) => {
|
||||
println!("{} symbols", result.symbols.len());
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IndexCommands::InstallHook => {
|
||||
|
|
@ -1669,6 +1839,51 @@ blue index diff 2>/dev/null || true
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect all indexable files in a directory
|
||||
fn collect_indexable_files(dir: &std::path::Path) -> Result<Vec<String>> {
|
||||
use blue_core::{is_indexable_file, should_skip_dir};
|
||||
use std::fs;
|
||||
|
||||
let mut files = Vec::new();
|
||||
|
||||
fn walk_dir(dir: &std::path::Path, files: &mut Vec<String>) -> Result<()> {
|
||||
if !dir.is_dir() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for entry in fs::read_dir(dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||
|
||||
if path.is_dir() {
|
||||
if !should_skip_dir(name) {
|
||||
walk_dir(&path, files)?;
|
||||
}
|
||||
} else if is_indexable_file(&path) {
|
||||
if let Some(s) = path.to_str() {
|
||||
files.push(s.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
walk_dir(dir, &mut files)?;
|
||||
files.sort();
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
/// Hash file content for staleness detection
///
/// Produces a lowercase-hex digest via the std `DefaultHasher`;
/// `DefaultHasher::new()` uses fixed keys, so equal content always
/// yields an equal digest.
fn hash_file_content(content: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let digest = {
        let mut state = DefaultHasher::new();
        content.hash(&mut state);
        state.finish()
    };
    format!("{:x}", digest)
}
|
||||
|
||||
async fn handle_search_command(query: &str, symbols_only: bool, limit: usize) -> Result<()> {
|
||||
use blue_core::store::DocumentStore;
|
||||
|
||||
|
|
@ -1798,49 +2013,3 @@ async fn handle_impact_command(file: &str) -> Result<()> {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Count source files under `dir` whose extension is in the indexable
/// set, skipping well-known dependency/build directories and hidden
/// (dot-prefixed) directories.
fn count_indexable_files(dir: &std::path::Path) -> Result<usize> {
    use std::fs;
    use std::path::Path;

    // File extensions we care about
    const EXTENSIONS: &[&str] = &[
        "rs", "py", "js", "ts", "tsx", "jsx", "go", "java", "c", "cpp", "h", "hpp",
        "rb", "php", "swift", "kt", "scala", "clj", "ex", "exs", "erl", "hs",
        "ml", "mli", "sql", "sh", "bash", "zsh", "yaml", "yml", "toml", "json",
    ];

    // Directories to skip
    const SKIP_DIRS: &[&str] = &[
        "node_modules", "target", ".git", "__pycache__", "venv", ".venv",
        "dist", "build", ".next", ".nuxt", "vendor", ".cargo",
    ];

    // Recursive helper accumulating into `total`.
    fn visit(dir: &Path, total: &mut usize) -> Result<()> {
        if !dir.is_dir() {
            return Ok(());
        }

        for entry in fs::read_dir(dir)? {
            let path = entry?.path();
            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

            if path.is_dir() {
                // Hidden directories are always skipped, plus the deny-list.
                if !name.starts_with('.') && !SKIP_DIRS.contains(&name) {
                    visit(&path, total)?;
                }
            } else {
                let indexable = path
                    .extension()
                    .and_then(|e| e.to_str())
                    .map(|ext| EXTENSIONS.contains(&ext))
                    .unwrap_or(false);
                if indexable {
                    *total += 1;
                }
            }
        }
        Ok(())
    }

    let mut total = 0;
    visit(dir, &mut total)?;
    Ok(total)
}
|
||||
|
|
|
|||
454
crates/blue-core/src/indexer.rs
Normal file
454
crates/blue-core/src/indexer.rs
Normal file
|
|
@ -0,0 +1,454 @@
|
|||
//! Semantic file indexer (RFC 0010)
|
||||
//!
|
||||
//! Uses Ollama with qwen2.5:3b to analyze source files and extract:
|
||||
//! - Summary: one-sentence description
|
||||
//! - Relationships: dependencies and connections to other files
|
||||
//! - Symbols: functions, structs, classes with line numbers
|
||||
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::path::Path;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::store::{DocumentStore, FileIndexEntry, SymbolIndexEntry};
|
||||
use crate::{CompletionOptions, LlmError, LlmProvider};
|
||||
|
||||
/// Default model for indexing
|
||||
pub const DEFAULT_INDEX_MODEL: &str = "qwen2.5:3b";
|
||||
|
||||
/// Maximum file size in lines before partial indexing
|
||||
pub const MAX_FILE_LINES: usize = 1000;
|
||||
|
||||
/// Indexer configuration
#[derive(Debug, Clone)]
pub struct IndexerConfig {
    /// Ollama model name used for analysis (see `DEFAULT_INDEX_MODEL`).
    pub model: String,
    /// Realm that stored index entries are scoped under.
    pub realm: String,
    /// Repo that stored index entries are scoped under.
    pub repo: String,
    /// Upper bound on completion tokens requested per file.
    pub max_tokens: usize,
    /// Sampling temperature passed to the LLM.
    pub temperature: f32,
}

impl Default for IndexerConfig {
    fn default() -> Self {
        Self {
            model: DEFAULT_INDEX_MODEL.to_string(),
            realm: "default".to_string(),
            repo: "default".to_string(),
            max_tokens: 2048,
            temperature: 0.1, // Low temperature for consistent structured output
        }
    }
}
|
||||
|
||||
/// Result of indexing a file
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexResult {
    /// Path of the indexed file, as passed to the indexer.
    pub file_path: String,
    /// Content hash recorded for staleness detection.
    pub file_hash: String,
    /// One-sentence description extracted from the LLM reply, if present.
    pub summary: Option<String>,
    /// Free-text dependency/usage description, if present.
    pub relationships: Option<String>,
    /// Symbols the LLM reported for this file.
    pub symbols: Vec<ParsedSymbol>,
    /// True when only the first `MAX_FILE_LINES` lines were analyzed.
    pub is_partial: bool,
    /// Set when the LLM reply could not be parsed as YAML.
    pub error: Option<String>,
}
|
||||
|
||||
/// A parsed symbol from AI output
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedSymbol {
    /// Symbol identifier as reported by the model.
    pub name: String,
    /// Symbol category (function, struct, class, …) as free text from the model.
    pub kind: String,
    /// Start line reported by the model, if any.
    pub start_line: Option<i32>,
    /// End line reported by the model, if any.
    pub end_line: Option<i32>,
    /// Short description of what the symbol does, if provided.
    pub description: Option<String>,
}
|
||||
|
||||
/// The indexer that uses LLM to analyze files
pub struct Indexer<P: LlmProvider> {
    /// LLM backend that runs the indexing prompt.
    provider: P,
    /// Model, realm/repo scoping, and sampling settings.
    config: IndexerConfig,
}
|
||||
|
||||
impl<P: LlmProvider> Indexer<P> {
|
||||
/// Create a new indexer with the given LLM provider
|
||||
pub fn new(provider: P, config: IndexerConfig) -> Self {
|
||||
Self { provider, config }
|
||||
}
|
||||
|
||||
/// Index a single file and return the result
|
||||
pub fn index_file(&self, file_path: &Path) -> Result<IndexResult, IndexerError> {
|
||||
let path_str = file_path.to_string_lossy().to_string();
|
||||
|
||||
// Read file contents
|
||||
let content = std::fs::read_to_string(file_path)
|
||||
.map_err(|e| IndexerError::FileRead(path_str.clone(), e.to_string()))?;
|
||||
|
||||
// Calculate hash
|
||||
let file_hash = hash_content(&content);
|
||||
|
||||
// Check file size
|
||||
let line_count = content.lines().count();
|
||||
let is_partial = line_count > MAX_FILE_LINES;
|
||||
|
||||
let content_to_index = if is_partial {
|
||||
// Take first MAX_FILE_LINES lines
|
||||
content.lines().take(MAX_FILE_LINES).collect::<Vec<_>>().join("\n")
|
||||
} else {
|
||||
content.clone()
|
||||
};
|
||||
|
||||
// Generate prompt
|
||||
let prompt = generate_index_prompt(&path_str, &content_to_index, is_partial);
|
||||
|
||||
// Call LLM
|
||||
let options = CompletionOptions {
|
||||
max_tokens: self.config.max_tokens,
|
||||
temperature: self.config.temperature,
|
||||
stop_sequences: vec!["```".to_string()], // Stop at end of YAML block
|
||||
};
|
||||
|
||||
let completion = self.provider.complete(&prompt, &options)
|
||||
.map_err(|e| IndexerError::LlmError(e))?;
|
||||
|
||||
// Parse YAML response
|
||||
let parsed = parse_index_response(&completion.text);
|
||||
|
||||
Ok(IndexResult {
|
||||
file_path: path_str,
|
||||
file_hash,
|
||||
summary: parsed.summary,
|
||||
relationships: parsed.relationships,
|
||||
symbols: parsed.symbols,
|
||||
is_partial,
|
||||
error: parsed.error,
|
||||
})
|
||||
}
|
||||
|
||||
/// Index a file and store in the database
|
||||
pub fn index_and_store(
|
||||
&self,
|
||||
file_path: &Path,
|
||||
store: &DocumentStore,
|
||||
) -> Result<IndexResult, IndexerError> {
|
||||
let result = self.index_file(file_path)?;
|
||||
|
||||
// Create file index entry
|
||||
let mut entry = FileIndexEntry::new(
|
||||
&self.config.realm,
|
||||
&self.config.repo,
|
||||
&result.file_path,
|
||||
&result.file_hash,
|
||||
);
|
||||
entry.summary = result.summary.clone();
|
||||
entry.relationships = result.relationships.clone();
|
||||
|
||||
// Store in database
|
||||
let file_id = store.upsert_file_index(&entry)
|
||||
.map_err(|e| IndexerError::StoreError(e.to_string()))?;
|
||||
|
||||
// Convert and store symbols
|
||||
let symbols: Vec<SymbolIndexEntry> = result.symbols.iter().map(|s| {
|
||||
SymbolIndexEntry {
|
||||
id: None,
|
||||
file_id,
|
||||
name: s.name.clone(),
|
||||
kind: s.kind.clone(),
|
||||
start_line: s.start_line,
|
||||
end_line: s.end_line,
|
||||
description: s.description.clone(),
|
||||
}
|
||||
}).collect();
|
||||
|
||||
store.set_file_symbols(file_id, &symbols)
|
||||
.map_err(|e| IndexerError::StoreError(e.to_string()))?;
|
||||
|
||||
info!("Indexed {} with {} symbols", result.file_path, symbols.len());
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Check if a file needs re-indexing
|
||||
pub fn needs_indexing(&self, file_path: &Path, store: &DocumentStore) -> Result<bool, IndexerError> {
|
||||
let path_str = file_path.to_string_lossy().to_string();
|
||||
|
||||
// Read file and calculate hash
|
||||
let content = std::fs::read_to_string(file_path)
|
||||
.map_err(|e| IndexerError::FileRead(path_str.clone(), e.to_string()))?;
|
||||
let current_hash = hash_content(&content);
|
||||
|
||||
// Check against stored hash
|
||||
store.is_file_stale(&self.config.realm, &self.config.repo, &path_str, ¤t_hash)
|
||||
.map_err(|e| IndexerError::StoreError(e.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate the indexing prompt
|
||||
fn generate_index_prompt(file_path: &str, content: &str, is_partial: bool) -> String {
|
||||
let partial_note = if is_partial {
|
||||
"\n\nNote: This is a large file. Only the first 1000 lines are shown. Include a note about this in the summary."
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
format!(
|
||||
r#"Analyze this source file and provide structured information about it.
|
||||
|
||||
File: {file_path}{partial_note}
|
||||
|
||||
```
|
||||
{content}
|
||||
```
|
||||
|
||||
Provide your analysis as YAML with this exact structure:
|
||||
|
||||
```yaml
|
||||
summary: "One sentence describing what this file does"
|
||||
|
||||
relationships: |
|
||||
Describe how this file relates to other files.
|
||||
List imports, dependencies, and what uses this file.
|
||||
Be specific about file names when visible.
|
||||
|
||||
symbols:
|
||||
- name: "SymbolName"
|
||||
kind: "function|struct|class|enum|const|trait|interface|type|method"
|
||||
start_line: 10
|
||||
end_line: 25
|
||||
description: "What this symbol does"
|
||||
```
|
||||
|
||||
Rules:
|
||||
- Summary must be ONE sentence
|
||||
- Relationships should mention specific file names when imports are visible
|
||||
- Only include significant symbols (skip trivial helpers, private internals)
|
||||
- Line numbers must be accurate
|
||||
- Kind must be one of: function, struct, class, enum, const, trait, interface, type, method
|
||||
- Output valid YAML only"#
|
||||
)
|
||||
}
|
||||
|
||||
/// Parsed response from the LLM
///
/// Intermediate shape produced by `parse_index_response` before it is
/// folded into an `IndexResult`.
#[derive(Debug, Default)]
struct ParsedResponse {
    summary: Option<String>,
    relationships: Option<String>,
    symbols: Vec<ParsedSymbol>,
    /// Set when the reply could not be parsed as YAML.
    error: Option<String>,
}
|
||||
|
||||
/// Parse the YAML response from the LLM
///
/// Accepts a fenced ```yaml block, a bare document starting at
/// `summary:`, or the raw text as a last resort. Parse failures are
/// reported via `ParsedResponse::error` rather than panicking, so a
/// malformed model reply never aborts the indexing run.
fn parse_index_response(response: &str) -> ParsedResponse {
    // Try to find YAML block
    let yaml_content = if let Some(start) = response.find("```yaml") {
        // 7 == "```yaml".len(): skip past the opening fence marker.
        let after_marker = &response[start + 7..];
        if let Some(end) = after_marker.find("```") {
            after_marker[..end].trim()
        } else {
            // No closing fence (e.g. reply truncated by a stop sequence):
            // take everything after the opening marker.
            after_marker.trim()
        }
    } else if let Some(start) = response.find("summary:") {
        // No code fence, but starts with summary
        response[start..].trim()
    } else {
        response.trim()
    };

    // Parse YAML
    match serde_yaml::from_str::<serde_yaml::Value>(yaml_content) {
        Ok(value) => {
            let summary = value.get("summary")
                .and_then(|v| v.as_str())
                .map(|s| s.to_string());

            let relationships = value.get("relationships")
                .and_then(|v| v.as_str())
                .map(|s| s.trim().to_string());

            // Symbols missing `name` or `kind` are silently dropped
            // (filter_map + `?`), so partially valid output still parses.
            let symbols = value.get("symbols")
                .and_then(|v| v.as_sequence())
                .map(|seq| {
                    seq.iter().filter_map(|item| {
                        let name = item.get("name")?.as_str()?.to_string();
                        let kind = item.get("kind")?.as_str()?.to_string();

                        Some(ParsedSymbol {
                            name,
                            kind,
                            start_line: item.get("start_line")
                                .and_then(|v| v.as_i64())
                                .map(|n| n as i32),
                            end_line: item.get("end_line")
                                .and_then(|v| v.as_i64())
                                .map(|n| n as i32),
                            description: item.get("description")
                                .and_then(|v| v.as_str())
                                .map(|s| s.to_string()),
                        })
                    }).collect()
                })
                .unwrap_or_default();

            ParsedResponse {
                summary,
                relationships,
                symbols,
                error: None,
            }
        }
        Err(e) => {
            // Record the failure for the caller; log details for debugging.
            warn!("Failed to parse YAML response: {}", e);
            debug!("Response was: {}", yaml_content);

            ParsedResponse {
                summary: None,
                relationships: None,
                symbols: vec![],
                error: Some(format!("YAML parse error: {}", e)),
            }
        }
    }
}
|
||||
|
||||
/// Calculate hash of file content
///
/// Renders a std `DefaultHasher` digest as lowercase hex. The hasher is
/// created with fixed keys, so equal content always hashes equally —
/// suitable for staleness comparison, not for security.
fn hash_content(content: &str) -> String {
    let digest = {
        let mut state = DefaultHasher::new();
        content.hash(&mut state);
        state.finish()
    };
    format!("{:x}", digest)
}
|
||||
|
||||
/// Indexer errors
#[derive(Debug, thiserror::Error)]
pub enum IndexerError {
    /// The source file could not be read (path, underlying error text).
    #[error("Failed to read file '{0}': {1}")]
    FileRead(String, String),

    /// Failure reported by the LLM provider; converts via `?` thanks to `#[from]`.
    #[error("LLM error: {0}")]
    LlmError(#[from] LlmError),

    /// Failure persisting to the document store.
    #[error("Store error: {0}")]
    StoreError(String),

    /// Catch-all for other indexing failures.
    #[error("Index error: {0}")]
    Other(String),
}
|
||||
|
||||
/// File extensions we should index
///
/// Returns true when the path's extension (exact, case-sensitive match)
/// is one of the known source/config extensions.
pub fn is_indexable_file(path: &Path) -> bool {
    const EXTENSIONS: &[&str] = &[
        "rs", "py", "js", "ts", "tsx", "jsx", "go", "java", "c", "cpp", "h", "hpp",
        "rb", "php", "swift", "kt", "scala", "clj", "ex", "exs", "erl", "hs",
        "ml", "mli", "sql", "sh", "bash", "zsh", "yaml", "yml", "toml", "json",
    ];

    // No extension (or non-UTF-8 extension) means not indexable.
    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => EXTENSIONS.contains(&ext),
        None => false,
    }
}
|
||||
|
||||
/// Directories to skip when indexing
///
/// A directory is skipped when it is hidden (dot-prefixed) or is a
/// well-known dependency/build-output directory. Dot-prefixed names
/// (`.git`, `.venv`, `.next`, `.nuxt`, `.cargo`, `.blue`, …) are already
/// covered by the hidden-directory check, so the explicit list only
/// needs the visible directories — the original duplicated them.
pub fn should_skip_dir(name: &str) -> bool {
    let skip_dirs: &[&str] = &[
        "node_modules", "target", "__pycache__", "venv",
        "dist", "build", "vendor",
    ];

    name.starts_with('.') || skip_dirs.contains(&name)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_hash_content() {
|
||||
let hash1 = hash_content("hello");
|
||||
let hash2 = hash_content("hello");
|
||||
let hash3 = hash_content("world");
|
||||
|
||||
assert_eq!(hash1, hash2);
|
||||
assert_ne!(hash1, hash3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_indexable_file() {
|
||||
assert!(is_indexable_file(Path::new("foo.rs")));
|
||||
assert!(is_indexable_file(Path::new("bar.py")));
|
||||
assert!(is_indexable_file(Path::new("baz.ts")));
|
||||
assert!(!is_indexable_file(Path::new("readme.md")));
|
||||
assert!(!is_indexable_file(Path::new("image.png")));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_should_skip_dir() {
|
||||
assert!(should_skip_dir("node_modules"));
|
||||
assert!(should_skip_dir("target"));
|
||||
assert!(should_skip_dir(".git"));
|
||||
assert!(should_skip_dir(".hidden"));
|
||||
assert!(!should_skip_dir("src"));
|
||||
assert!(!should_skip_dir("lib"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_index_response_valid() {
|
||||
let response = r#"```yaml
|
||||
summary: "This file handles user authentication"
|
||||
|
||||
relationships: |
|
||||
Imports auth module from ./auth.rs
|
||||
Used by main.rs for login flow
|
||||
|
||||
symbols:
|
||||
- name: "authenticate"
|
||||
kind: "function"
|
||||
start_line: 10
|
||||
end_line: 25
|
||||
description: "Validates user credentials"
|
||||
```"#;
|
||||
|
||||
let parsed = parse_index_response(response);
|
||||
assert_eq!(parsed.summary, Some("This file handles user authentication".to_string()));
|
||||
assert!(parsed.relationships.is_some());
|
||||
assert_eq!(parsed.symbols.len(), 1);
|
||||
assert_eq!(parsed.symbols[0].name, "authenticate");
|
||||
assert_eq!(parsed.symbols[0].kind, "function");
|
||||
assert_eq!(parsed.symbols[0].start_line, Some(10));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_index_response_no_fence() {
|
||||
let response = r#"summary: "Test file"
|
||||
|
||||
relationships: |
|
||||
No dependencies
|
||||
|
||||
symbols: []"#;
|
||||
|
||||
let parsed = parse_index_response(response);
|
||||
assert_eq!(parsed.summary, Some("Test file".to_string()));
|
||||
assert!(parsed.symbols.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_index_response_invalid() {
|
||||
let response = "this is not valid yaml { broken }";
|
||||
let parsed = parse_index_response(response);
|
||||
assert!(parsed.error.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_index_prompt() {
|
||||
let prompt = generate_index_prompt("test.rs", "fn main() {}", false);
|
||||
assert!(prompt.contains("test.rs"));
|
||||
assert!(prompt.contains("fn main()"));
|
||||
assert!(!prompt.contains("large file"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_index_prompt_partial() {
|
||||
let prompt = generate_index_prompt("test.rs", "fn main() {}", true);
|
||||
assert!(prompt.contains("large file"));
|
||||
}
|
||||
}
|
||||
|
|
@ -15,6 +15,7 @@ const _BLUE_SECRET_NAME: &str = "Sheepey"; // pronounced "Shee-paay"
|
|||
|
||||
pub mod daemon;
|
||||
pub mod documents;
|
||||
pub mod indexer;
|
||||
pub mod llm;
|
||||
pub mod realm;
|
||||
pub mod repo;
|
||||
|
|
@ -24,6 +25,7 @@ pub mod voice;
|
|||
pub mod workflow;
|
||||
|
||||
pub use documents::{Adr, Audit, AuditFinding, AuditSeverity, AuditType, Decision, Rfc, Spike, SpikeOutcome, Status, Task, update_markdown_status};
|
||||
pub use indexer::{Indexer, IndexerConfig, IndexerError, IndexResult, ParsedSymbol, is_indexable_file, should_skip_dir, DEFAULT_INDEX_MODEL, MAX_FILE_LINES};
|
||||
pub use llm::{CompletionOptions, CompletionResult, LlmBackendChoice, LlmConfig, LlmError, LlmManager, LlmProvider, LlmProviderChoice, LocalLlmConfig, ApiLlmConfig, KeywordLlm, MockLlm, ProviderStatus};
|
||||
pub use repo::{detect_blue, BlueHome, RepoError, WorktreeInfo};
|
||||
pub use state::{ItemType, ProjectState, StateError, StatusSummary, WorkItem};
|
||||
|
|
|
|||
Loading…
Reference in a new issue