feat: add Ollama integration for semantic file indexing
Implements the AI-powered indexing component of RFC 0010:

- Add indexer module with LlmProvider abstraction
- Integrate qwen2.5:3b via Ollama for local file analysis
- Extract summaries, relationships, and symbols from source files
- Support partial indexing for files >1000 lines
- Wire indexer to all CLI index commands (--all, --diff, --file, --refresh)

The indexer generates structured YAML output with:

- One-sentence file summaries
- Relationship descriptions for semantic search
- Symbol-level indexing with line numbers

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
cf0baa0ea0
commit
d77ea4ba3f
5 changed files with 698 additions and 72 deletions
|
|
@ -314,14 +314,14 @@ blue impact src/domain.rs
|
|||
|
||||
- [x] Add schema to blue.db (file_index, symbol_index, FTS5 tables)
|
||||
- [x] Create versioned indexing prompt for structured YAML extraction
|
||||
- [ ] Implement Ollama integration with qwen2.5:3b default
|
||||
- [x] Implement Ollama integration with qwen2.5:3b default
|
||||
- [x] Implement `blue index --all` for bootstrap
|
||||
- [x] Implement `blue index --diff` for staged files
|
||||
- [x] Implement `blue index --file` for single-file updates
|
||||
- [x] Implement `blue index --install-hook` for git hook setup
|
||||
- [x] Implement `blue index --refresh` for stale entry updates
|
||||
- [x] Implement `blue index status` for freshness reporting
|
||||
- [ ] Add large file handling (>1000 lines warning)
|
||||
- [x] Add large file handling (>1000 lines warning)
|
||||
- [x] Implement `blue search` with FTS5 backend
|
||||
- [x] Implement `blue impact` for dependency queries
|
||||
- [x] Add MCP tools (5 tools)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ path = "src/main.rs"
|
|||
[dependencies]
|
||||
blue-core.workspace = true
|
||||
blue-mcp.workspace = true
|
||||
blue-ollama.workspace = true
|
||||
clap.workspace = true
|
||||
anyhow.workspace = true
|
||||
tokio.workspace = true
|
||||
|
|
|
|||
|
|
@ -1530,6 +1530,8 @@ async fn detect_ollama_model() -> Option<String> {
|
|||
|
||||
async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
||||
use blue_core::store::DocumentStore;
|
||||
use blue_core::{Indexer, IndexerConfig, is_indexable_file, LocalLlmConfig};
|
||||
use blue_ollama::OllamaLlm;
|
||||
use std::path::Path;
|
||||
|
||||
// Get the .blue database path
|
||||
|
|
@ -1545,21 +1547,64 @@ async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
|||
|
||||
match command {
|
||||
IndexCommands::All { path, model } => {
|
||||
let target = path.as_deref().unwrap_or(".");
|
||||
let target_path = path.as_deref().unwrap_or(".");
|
||||
let model_name = model.as_deref().unwrap_or("qwen2.5:3b");
|
||||
|
||||
println!("Indexing all files in '{}' with model '{}'...", target, model_name);
|
||||
println!("(Full indexing requires Ollama running with the model pulled)");
|
||||
// Collect all indexable files
|
||||
let files = collect_indexable_files(Path::new(target_path))?;
|
||||
println!("Found {} indexable files in '{}'", files.len(), target_path);
|
||||
|
||||
// For now, show what would be indexed
|
||||
let count = count_indexable_files(Path::new(target))?;
|
||||
println!("Found {} indexable files.", count);
|
||||
println!("\nTo complete indexing:");
|
||||
println!(" 1. Ensure Ollama is running: ollama serve");
|
||||
println!(" 2. Pull the model: ollama pull {}", model_name);
|
||||
println!(" 3. Run this command again");
|
||||
if files.is_empty() {
|
||||
println!("No files to index.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// TODO: Implement actual indexing with Ollama integration
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true, // Use existing Ollama instance
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(e) = llm.start() {
|
||||
println!("Ollama not available: {}", e);
|
||||
println!("\nTo index files:");
|
||||
println!(" 1. Start Ollama: ollama serve");
|
||||
println!(" 2. Pull the model: ollama pull {}", model_name);
|
||||
println!(" 3. Run this command again");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Indexing with model '{}'...\n", model_name);
|
||||
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
let mut indexed = 0;
|
||||
let mut errors = 0;
|
||||
|
||||
for file_path in &files {
|
||||
let path = Path::new(file_path);
|
||||
print!(" {} ... ", file_path);
|
||||
|
||||
match indexer.index_and_store(path, &store) {
|
||||
Ok(result) => {
|
||||
let partial = if result.is_partial { " (partial)" } else { "" };
|
||||
println!("{} symbols{}", result.symbols.len(), partial);
|
||||
indexed += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error: {}", e);
|
||||
errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!("\nIndexed {} files ({} errors)", indexed, errors);
|
||||
}
|
||||
|
||||
IndexCommands::Diff { model } => {
|
||||
|
|
@ -1570,41 +1615,109 @@ async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
|||
.args(["diff", "--cached", "--name-only"])
|
||||
.output()?;
|
||||
|
||||
let staged_files: Vec<&str> = std::str::from_utf8(&output.stdout)?
|
||||
let staged_files: Vec<String> = std::str::from_utf8(&output.stdout)?
|
||||
.lines()
|
||||
.filter(|l| !l.is_empty())
|
||||
.filter(|l| is_indexable_file(Path::new(l)))
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
|
||||
if staged_files.is_empty() {
|
||||
println!("No staged files to index.");
|
||||
println!("No indexable staged files.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Indexing {} staged file(s) with '{}'...", staged_files.len(), model_name);
|
||||
for file in &staged_files {
|
||||
println!(" {}", file);
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(_) = llm.start() {
|
||||
// Silently skip if Ollama not available (pre-commit hook shouldn't block)
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// TODO: Implement actual indexing
|
||||
println!("Indexing {} staged file(s)...", staged_files.len());
|
||||
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
for file_path in &staged_files {
|
||||
let path = Path::new(file_path);
|
||||
if path.exists() {
|
||||
match indexer.index_and_store(path, &store) {
|
||||
Ok(result) => {
|
||||
println!(" {} - {} symbols", file_path, result.symbols.len());
|
||||
}
|
||||
Err(e) => {
|
||||
println!(" {} - error: {}", file_path, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IndexCommands::File { path, model } => {
|
||||
let model_name = model.as_deref().unwrap_or("qwen2.5:3b");
|
||||
let file_path = Path::new(&path);
|
||||
|
||||
if !Path::new(&path).exists() {
|
||||
if !file_path.exists() {
|
||||
println!("File not found: {}", path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(e) = llm.start() {
|
||||
println!("Ollama not available: {}", e);
|
||||
println!("\nStart Ollama first: ollama serve");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Indexing '{}' with '{}'...", path, model_name);
|
||||
|
||||
// TODO: Implement single file indexing
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
match indexer.index_and_store(file_path, &store) {
|
||||
Ok(result) => {
|
||||
println!("\nSummary: {}", result.summary.unwrap_or_default());
|
||||
if let Some(rel) = &result.relationships {
|
||||
println!("\nRelationships:\n{}", rel);
|
||||
}
|
||||
println!("\nSymbols ({}):", result.symbols.len());
|
||||
for sym in &result.symbols {
|
||||
let lines = match (sym.start_line, sym.end_line) {
|
||||
(Some(s), Some(e)) => format!(" (lines {}-{})", s, e),
|
||||
(Some(s), None) => format!(" (line {})", s),
|
||||
_ => String::new(),
|
||||
};
|
||||
println!(" {} ({}){}", sym.name, sym.kind, lines);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IndexCommands::Refresh { model } => {
|
||||
let model_name = model.as_deref().unwrap_or("qwen2.5:3b");
|
||||
|
||||
// Get current realm (default to "default" for single-repo)
|
||||
let realm = "default";
|
||||
|
||||
let (file_count, symbol_count) = store.get_index_stats(realm)?;
|
||||
|
|
@ -1615,10 +1728,67 @@ async fn handle_index_command(command: IndexCommands) -> Result<()> {
|
|||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Checking for stale entries...");
|
||||
println!("(Refresh with model '{}')", model_name);
|
||||
// Get all indexed files and check which are stale
|
||||
let indexed_files = store.list_file_index(realm, None)?;
|
||||
let mut stale_files = Vec::new();
|
||||
|
||||
// TODO: Implement refresh logic - compare hashes
|
||||
for entry in &indexed_files {
|
||||
let path = Path::new(&entry.file_path);
|
||||
if path.exists() {
|
||||
if let Ok(content) = std::fs::read_to_string(path) {
|
||||
let current_hash = hash_file_content(&content);
|
||||
if current_hash != entry.file_hash {
|
||||
stale_files.push(entry.file_path.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if stale_files.is_empty() {
|
||||
println!("All indexed files are up to date.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Found {} stale file(s)", stale_files.len());
|
||||
|
||||
// Try to connect to Ollama
|
||||
let llm_config = LocalLlmConfig {
|
||||
model: model_name.to_string(),
|
||||
use_external: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let llm = OllamaLlm::new(&llm_config);
|
||||
if let Err(e) = llm.start() {
|
||||
println!("Ollama not available: {}", e);
|
||||
println!("\nStale files:");
|
||||
for f in &stale_files {
|
||||
println!(" {}", f);
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("Re-indexing stale files with '{}'...\n", model_name);
|
||||
|
||||
let indexer_config = IndexerConfig {
|
||||
model: model_name.to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let indexer = Indexer::new(llm, indexer_config);
|
||||
|
||||
for file_path in &stale_files {
|
||||
let path = Path::new(file_path);
|
||||
print!(" {} ... ", file_path);
|
||||
|
||||
match indexer.index_and_store(path, &store) {
|
||||
Ok(result) => {
|
||||
println!("{} symbols", result.symbols.len());
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IndexCommands::InstallHook => {
|
||||
|
|
@ -1669,6 +1839,51 @@ blue index diff 2>/dev/null || true
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect all indexable files in a directory
|
||||
fn collect_indexable_files(dir: &std::path::Path) -> Result<Vec<String>> {
|
||||
use blue_core::{is_indexable_file, should_skip_dir};
|
||||
use std::fs;
|
||||
|
||||
let mut files = Vec::new();
|
||||
|
||||
fn walk_dir(dir: &std::path::Path, files: &mut Vec<String>) -> Result<()> {
|
||||
if !dir.is_dir() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for entry in fs::read_dir(dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||
|
||||
if path.is_dir() {
|
||||
if !should_skip_dir(name) {
|
||||
walk_dir(&path, files)?;
|
||||
}
|
||||
} else if is_indexable_file(&path) {
|
||||
if let Some(s) = path.to_str() {
|
||||
files.push(s.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
walk_dir(dir, &mut files)?;
|
||||
files.sort();
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
/// Hash file content for staleness detection
///
/// Produces a lowercase-hex digest via the std `DefaultHasher`;
/// `DefaultHasher::new()` uses fixed keys, so equal content always
/// yields an equal digest.
fn hash_file_content(content: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let digest = {
        let mut state = DefaultHasher::new();
        content.hash(&mut state);
        state.finish()
    };
    format!("{:x}", digest)
}
|
||||
|
||||
async fn handle_search_command(query: &str, symbols_only: bool, limit: usize) -> Result<()> {
|
||||
use blue_core::store::DocumentStore;
|
||||
|
||||
|
|
@ -1798,49 +2013,3 @@ async fn handle_impact_command(file: &str) -> Result<()> {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Count source files under `dir` whose extension is in the indexable
/// set, skipping well-known dependency/build directories and hidden
/// (dot-prefixed) directories.
fn count_indexable_files(dir: &std::path::Path) -> Result<usize> {
    use std::fs;
    use std::path::Path;

    // File extensions we care about
    const EXTENSIONS: &[&str] = &[
        "rs", "py", "js", "ts", "tsx", "jsx", "go", "java", "c", "cpp", "h", "hpp",
        "rb", "php", "swift", "kt", "scala", "clj", "ex", "exs", "erl", "hs",
        "ml", "mli", "sql", "sh", "bash", "zsh", "yaml", "yml", "toml", "json",
    ];

    // Directories to skip
    const SKIP_DIRS: &[&str] = &[
        "node_modules", "target", ".git", "__pycache__", "venv", ".venv",
        "dist", "build", ".next", ".nuxt", "vendor", ".cargo",
    ];

    // Recursive helper accumulating into `total`.
    fn visit(dir: &Path, total: &mut usize) -> Result<()> {
        if !dir.is_dir() {
            return Ok(());
        }

        for entry in fs::read_dir(dir)? {
            let path = entry?.path();
            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

            if path.is_dir() {
                // Hidden directories are always skipped, plus the deny-list.
                if !name.starts_with('.') && !SKIP_DIRS.contains(&name) {
                    visit(&path, total)?;
                }
            } else {
                let indexable = path
                    .extension()
                    .and_then(|e| e.to_str())
                    .map(|ext| EXTENSIONS.contains(&ext))
                    .unwrap_or(false);
                if indexable {
                    *total += 1;
                }
            }
        }
        Ok(())
    }

    let mut total = 0;
    visit(dir, &mut total)?;
    Ok(total)
}
|
||||
|
|
|
|||
454
crates/blue-core/src/indexer.rs
Normal file
454
crates/blue-core/src/indexer.rs
Normal file
|
|
@ -0,0 +1,454 @@
|
|||
//! Semantic file indexer (RFC 0010)
|
||||
//!
|
||||
//! Uses Ollama with qwen2.5:3b to analyze source files and extract:
|
||||
//! - Summary: one-sentence description
|
||||
//! - Relationships: dependencies and connections to other files
|
||||
//! - Symbols: functions, structs, classes with line numbers
|
||||
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::path::Path;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::store::{DocumentStore, FileIndexEntry, SymbolIndexEntry};
|
||||
use crate::{CompletionOptions, LlmError, LlmProvider};
|
||||
|
||||
/// Default model for indexing
|
||||
pub const DEFAULT_INDEX_MODEL: &str = "qwen2.5:3b";
|
||||
|
||||
/// Maximum file size in lines before partial indexing
|
||||
pub const MAX_FILE_LINES: usize = 1000;
|
||||
|
||||
/// Indexer configuration
#[derive(Debug, Clone)]
pub struct IndexerConfig {
    /// Ollama model name used for analysis (see `DEFAULT_INDEX_MODEL`).
    pub model: String,
    /// Realm that stored index entries are scoped under.
    pub realm: String,
    /// Repo that stored index entries are scoped under.
    pub repo: String,
    /// Upper bound on completion tokens requested per file.
    pub max_tokens: usize,
    /// Sampling temperature passed to the LLM.
    pub temperature: f32,
}

impl Default for IndexerConfig {
    fn default() -> Self {
        Self {
            model: DEFAULT_INDEX_MODEL.to_string(),
            realm: "default".to_string(),
            repo: "default".to_string(),
            max_tokens: 2048,
            temperature: 0.1, // Low temperature for consistent structured output
        }
    }
}
|
||||
|
||||
/// Result of indexing a file
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexResult {
    /// Path of the indexed file, as passed to the indexer.
    pub file_path: String,
    /// Content hash recorded for staleness detection.
    pub file_hash: String,
    /// One-sentence description extracted from the LLM reply, if present.
    pub summary: Option<String>,
    /// Free-text dependency/usage description, if present.
    pub relationships: Option<String>,
    /// Symbols the LLM reported for this file.
    pub symbols: Vec<ParsedSymbol>,
    /// True when only the first `MAX_FILE_LINES` lines were analyzed.
    pub is_partial: bool,
    /// Set when the LLM reply could not be parsed as YAML.
    pub error: Option<String>,
}
|
||||
|
||||
/// A parsed symbol from AI output
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedSymbol {
    /// Symbol identifier as reported by the model.
    pub name: String,
    /// Symbol category (function, struct, class, …) as free text from the model.
    pub kind: String,
    /// Start line reported by the model, if any.
    pub start_line: Option<i32>,
    /// End line reported by the model, if any.
    pub end_line: Option<i32>,
    /// Short description of what the symbol does, if provided.
    pub description: Option<String>,
}
|
||||
|
||||
/// The indexer that uses LLM to analyze files
pub struct Indexer<P: LlmProvider> {
    /// LLM backend that runs the indexing prompt.
    provider: P,
    /// Model, realm/repo scoping, and sampling settings.
    config: IndexerConfig,
}
|
||||
|
||||
impl<P: LlmProvider> Indexer<P> {
|
||||
/// Create a new indexer with the given LLM provider
|
||||
pub fn new(provider: P, config: IndexerConfig) -> Self {
|
||||
Self { provider, config }
|
||||
}
|
||||
|
||||
/// Index a single file and return the result
|
||||
pub fn index_file(&self, file_path: &Path) -> Result<IndexResult, IndexerError> {
|
||||
let path_str = file_path.to_string_lossy().to_string();
|
||||
|
||||
// Read file contents
|
||||
let content = std::fs::read_to_string(file_path)
|
||||
.map_err(|e| IndexerError::FileRead(path_str.clone(), e.to_string()))?;
|
||||
|
||||
// Calculate hash
|
||||
let file_hash = hash_content(&content);
|
||||
|
||||
// Check file size
|
||||
let line_count = content.lines().count();
|
||||
let is_partial = line_count > MAX_FILE_LINES;
|
||||
|
||||
let content_to_index = if is_partial {
|
||||
// Take first MAX_FILE_LINES lines
|
||||
content.lines().take(MAX_FILE_LINES).collect::<Vec<_>>().join("\n")
|
||||
} else {
|
||||
content.clone()
|
||||
};
|
||||
|
||||
// Generate prompt
|
||||
let prompt = generate_index_prompt(&path_str, &content_to_index, is_partial);
|
||||
|
||||
// Call LLM
|
||||
let options = CompletionOptions {
|
||||
max_tokens: self.config.max_tokens,
|
||||
temperature: self.config.temperature,
|
||||
stop_sequences: vec!["```".to_string()], // Stop at end of YAML block
|
||||
};
|
||||
|
||||
let completion = self.provider.complete(&prompt, &options)
|
||||
.map_err(|e| IndexerError::LlmError(e))?;
|
||||
|
||||
// Parse YAML response
|
||||
let parsed = parse_index_response(&completion.text);
|
||||
|
||||
Ok(IndexResult {
|
||||
file_path: path_str,
|
||||
file_hash,
|
||||
summary: parsed.summary,
|
||||
relationships: parsed.relationships,
|
||||
symbols: parsed.symbols,
|
||||
is_partial,
|
||||
error: parsed.error,
|
||||
})
|
||||
}
|
||||
|
||||
/// Index a file and store in the database
|
||||
pub fn index_and_store(
|
||||
&self,
|
||||
file_path: &Path,
|
||||
store: &DocumentStore,
|
||||
) -> Result<IndexResult, IndexerError> {
|
||||
let result = self.index_file(file_path)?;
|
||||
|
||||
// Create file index entry
|
||||
let mut entry = FileIndexEntry::new(
|
||||
&self.config.realm,
|
||||
&self.config.repo,
|
||||
&result.file_path,
|
||||
&result.file_hash,
|
||||
);
|
||||
entry.summary = result.summary.clone();
|
||||
entry.relationships = result.relationships.clone();
|
||||
|
||||
// Store in database
|
||||
let file_id = store.upsert_file_index(&entry)
|
||||
.map_err(|e| IndexerError::StoreError(e.to_string()))?;
|
||||
|
||||
// Convert and store symbols
|
||||
let symbols: Vec<SymbolIndexEntry> = result.symbols.iter().map(|s| {
|
||||
SymbolIndexEntry {
|
||||
id: None,
|
||||
file_id,
|
||||
name: s.name.clone(),
|
||||
kind: s.kind.clone(),
|
||||
start_line: s.start_line,
|
||||
end_line: s.end_line,
|
||||
description: s.description.clone(),
|
||||
}
|
||||
}).collect();
|
||||
|
||||
store.set_file_symbols(file_id, &symbols)
|
||||
.map_err(|e| IndexerError::StoreError(e.to_string()))?;
|
||||
|
||||
info!("Indexed {} with {} symbols", result.file_path, symbols.len());
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Check if a file needs re-indexing
|
||||
pub fn needs_indexing(&self, file_path: &Path, store: &DocumentStore) -> Result<bool, IndexerError> {
|
||||
let path_str = file_path.to_string_lossy().to_string();
|
||||
|
||||
// Read file and calculate hash
|
||||
let content = std::fs::read_to_string(file_path)
|
||||
.map_err(|e| IndexerError::FileRead(path_str.clone(), e.to_string()))?;
|
||||
let current_hash = hash_content(&content);
|
||||
|
||||
// Check against stored hash
|
||||
store.is_file_stale(&self.config.realm, &self.config.repo, &path_str, ¤t_hash)
|
||||
.map_err(|e| IndexerError::StoreError(e.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate the indexing prompt
|
||||
fn generate_index_prompt(file_path: &str, content: &str, is_partial: bool) -> String {
|
||||
let partial_note = if is_partial {
|
||||
"\n\nNote: This is a large file. Only the first 1000 lines are shown. Include a note about this in the summary."
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
format!(
|
||||
r#"Analyze this source file and provide structured information about it.
|
||||
|
||||
File: {file_path}{partial_note}
|
||||
|
||||
```
|
||||
{content}
|
||||
```
|
||||
|
||||
Provide your analysis as YAML with this exact structure:
|
||||
|
||||
```yaml
|
||||
summary: "One sentence describing what this file does"
|
||||
|
||||
relationships: |
|
||||
Describe how this file relates to other files.
|
||||
List imports, dependencies, and what uses this file.
|
||||
Be specific about file names when visible.
|
||||
|
||||
symbols:
|
||||
- name: "SymbolName"
|
||||
kind: "function|struct|class|enum|const|trait|interface|type|method"
|
||||
start_line: 10
|
||||
end_line: 25
|
||||
description: "What this symbol does"
|
||||
```
|
||||
|
||||
Rules:
|
||||
- Summary must be ONE sentence
|
||||
- Relationships should mention specific file names when imports are visible
|
||||
- Only include significant symbols (skip trivial helpers, private internals)
|
||||
- Line numbers must be accurate
|
||||
- Kind must be one of: function, struct, class, enum, const, trait, interface, type, method
|
||||
- Output valid YAML only"#
|
||||
)
|
||||
}
|
||||
|
||||
/// Parsed response from the LLM
///
/// Intermediate shape produced by `parse_index_response` before it is
/// folded into an `IndexResult`.
#[derive(Debug, Default)]
struct ParsedResponse {
    summary: Option<String>,
    relationships: Option<String>,
    symbols: Vec<ParsedSymbol>,
    /// Set when the reply could not be parsed as YAML.
    error: Option<String>,
}
|
||||
|
||||
/// Parse the YAML response from the LLM
///
/// Accepts a fenced ```yaml block, a bare document starting at
/// `summary:`, or the raw text as a last resort. Parse failures are
/// reported via `ParsedResponse::error` rather than panicking, so a
/// malformed model reply never aborts the indexing run.
fn parse_index_response(response: &str) -> ParsedResponse {
    // Try to find YAML block
    let yaml_content = if let Some(start) = response.find("```yaml") {
        // 7 == "```yaml".len(): skip past the opening fence marker.
        let after_marker = &response[start + 7..];
        if let Some(end) = after_marker.find("```") {
            after_marker[..end].trim()
        } else {
            // No closing fence (e.g. reply truncated by a stop sequence):
            // take everything after the opening marker.
            after_marker.trim()
        }
    } else if let Some(start) = response.find("summary:") {
        // No code fence, but starts with summary
        response[start..].trim()
    } else {
        response.trim()
    };

    // Parse YAML
    match serde_yaml::from_str::<serde_yaml::Value>(yaml_content) {
        Ok(value) => {
            let summary = value.get("summary")
                .and_then(|v| v.as_str())
                .map(|s| s.to_string());

            let relationships = value.get("relationships")
                .and_then(|v| v.as_str())
                .map(|s| s.trim().to_string());

            // Symbols missing `name` or `kind` are silently dropped
            // (filter_map + `?`), so partially valid output still parses.
            let symbols = value.get("symbols")
                .and_then(|v| v.as_sequence())
                .map(|seq| {
                    seq.iter().filter_map(|item| {
                        let name = item.get("name")?.as_str()?.to_string();
                        let kind = item.get("kind")?.as_str()?.to_string();

                        Some(ParsedSymbol {
                            name,
                            kind,
                            start_line: item.get("start_line")
                                .and_then(|v| v.as_i64())
                                .map(|n| n as i32),
                            end_line: item.get("end_line")
                                .and_then(|v| v.as_i64())
                                .map(|n| n as i32),
                            description: item.get("description")
                                .and_then(|v| v.as_str())
                                .map(|s| s.to_string()),
                        })
                    }).collect()
                })
                .unwrap_or_default();

            ParsedResponse {
                summary,
                relationships,
                symbols,
                error: None,
            }
        }
        Err(e) => {
            // Record the failure for the caller; log details for debugging.
            warn!("Failed to parse YAML response: {}", e);
            debug!("Response was: {}", yaml_content);

            ParsedResponse {
                summary: None,
                relationships: None,
                symbols: vec![],
                error: Some(format!("YAML parse error: {}", e)),
            }
        }
    }
}
|
||||
|
||||
/// Calculate hash of file content
///
/// Renders a std `DefaultHasher` digest as lowercase hex. The hasher is
/// created with fixed keys, so equal content always hashes equally —
/// suitable for staleness comparison, not for security.
fn hash_content(content: &str) -> String {
    let digest = {
        let mut state = DefaultHasher::new();
        content.hash(&mut state);
        state.finish()
    };
    format!("{:x}", digest)
}
|
||||
|
||||
/// Indexer errors
#[derive(Debug, thiserror::Error)]
pub enum IndexerError {
    /// The source file could not be read (path, underlying error text).
    #[error("Failed to read file '{0}': {1}")]
    FileRead(String, String),

    /// Failure reported by the LLM provider; converts via `?` thanks to `#[from]`.
    #[error("LLM error: {0}")]
    LlmError(#[from] LlmError),

    /// Failure persisting to the document store.
    #[error("Store error: {0}")]
    StoreError(String),

    /// Catch-all for other indexing failures.
    #[error("Index error: {0}")]
    Other(String),
}
|
||||
|
||||
/// File extensions we should index
///
/// Returns true when the path's extension (exact, case-sensitive match)
/// is one of the known source/config extensions.
pub fn is_indexable_file(path: &Path) -> bool {
    const EXTENSIONS: &[&str] = &[
        "rs", "py", "js", "ts", "tsx", "jsx", "go", "java", "c", "cpp", "h", "hpp",
        "rb", "php", "swift", "kt", "scala", "clj", "ex", "exs", "erl", "hs",
        "ml", "mli", "sql", "sh", "bash", "zsh", "yaml", "yml", "toml", "json",
    ];

    // No extension (or non-UTF-8 extension) means not indexable.
    match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => EXTENSIONS.contains(&ext),
        None => false,
    }
}
|
||||
|
||||
/// Directories to skip when indexing
///
/// A directory is skipped when it is hidden (dot-prefixed) or is a
/// well-known dependency/build-output directory. Dot-prefixed names
/// (`.git`, `.venv`, `.next`, `.nuxt`, `.cargo`, `.blue`, …) are already
/// covered by the hidden-directory check, so the explicit list only
/// needs the visible directories — the original duplicated them.
pub fn should_skip_dir(name: &str) -> bool {
    let skip_dirs: &[&str] = &[
        "node_modules", "target", "__pycache__", "venv",
        "dist", "build", "vendor",
    ];

    name.starts_with('.') || skip_dirs.contains(&name)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_hash_content() {
|
||||
let hash1 = hash_content("hello");
|
||||
let hash2 = hash_content("hello");
|
||||
let hash3 = hash_content("world");
|
||||
|
||||
assert_eq!(hash1, hash2);
|
||||
assert_ne!(hash1, hash3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_indexable_file() {
|
||||
assert!(is_indexable_file(Path::new("foo.rs")));
|
||||
assert!(is_indexable_file(Path::new("bar.py")));
|
||||
assert!(is_indexable_file(Path::new("baz.ts")));
|
||||
assert!(!is_indexable_file(Path::new("readme.md")));
|
||||
assert!(!is_indexable_file(Path::new("image.png")));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_should_skip_dir() {
|
||||
assert!(should_skip_dir("node_modules"));
|
||||
assert!(should_skip_dir("target"));
|
||||
assert!(should_skip_dir(".git"));
|
||||
assert!(should_skip_dir(".hidden"));
|
||||
assert!(!should_skip_dir("src"));
|
||||
assert!(!should_skip_dir("lib"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_index_response_valid() {
|
||||
let response = r#"```yaml
|
||||
summary: "This file handles user authentication"
|
||||
|
||||
relationships: |
|
||||
Imports auth module from ./auth.rs
|
||||
Used by main.rs for login flow
|
||||
|
||||
symbols:
|
||||
- name: "authenticate"
|
||||
kind: "function"
|
||||
start_line: 10
|
||||
end_line: 25
|
||||
description: "Validates user credentials"
|
||||
```"#;
|
||||
|
||||
let parsed = parse_index_response(response);
|
||||
assert_eq!(parsed.summary, Some("This file handles user authentication".to_string()));
|
||||
assert!(parsed.relationships.is_some());
|
||||
assert_eq!(parsed.symbols.len(), 1);
|
||||
assert_eq!(parsed.symbols[0].name, "authenticate");
|
||||
assert_eq!(parsed.symbols[0].kind, "function");
|
||||
assert_eq!(parsed.symbols[0].start_line, Some(10));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_index_response_no_fence() {
|
||||
let response = r#"summary: "Test file"
|
||||
|
||||
relationships: |
|
||||
No dependencies
|
||||
|
||||
symbols: []"#;
|
||||
|
||||
let parsed = parse_index_response(response);
|
||||
assert_eq!(parsed.summary, Some("Test file".to_string()));
|
||||
assert!(parsed.symbols.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_index_response_invalid() {
|
||||
let response = "this is not valid yaml { broken }";
|
||||
let parsed = parse_index_response(response);
|
||||
assert!(parsed.error.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_index_prompt() {
|
||||
let prompt = generate_index_prompt("test.rs", "fn main() {}", false);
|
||||
assert!(prompt.contains("test.rs"));
|
||||
assert!(prompt.contains("fn main()"));
|
||||
assert!(!prompt.contains("large file"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_index_prompt_partial() {
|
||||
let prompt = generate_index_prompt("test.rs", "fn main() {}", true);
|
||||
assert!(prompt.contains("large file"));
|
||||
}
|
||||
}
|
||||
|
|
@ -15,6 +15,7 @@ const _BLUE_SECRET_NAME: &str = "Sheepey"; // pronounced "Shee-paay"
|
|||
|
||||
pub mod daemon;
|
||||
pub mod documents;
|
||||
pub mod indexer;
|
||||
pub mod llm;
|
||||
pub mod realm;
|
||||
pub mod repo;
|
||||
|
|
@ -24,6 +25,7 @@ pub mod voice;
|
|||
pub mod workflow;
|
||||
|
||||
pub use documents::{Adr, Audit, AuditFinding, AuditSeverity, AuditType, Decision, Rfc, Spike, SpikeOutcome, Status, Task, update_markdown_status};
|
||||
pub use indexer::{Indexer, IndexerConfig, IndexerError, IndexResult, ParsedSymbol, is_indexable_file, should_skip_dir, DEFAULT_INDEX_MODEL, MAX_FILE_LINES};
|
||||
pub use llm::{CompletionOptions, CompletionResult, LlmBackendChoice, LlmConfig, LlmError, LlmManager, LlmProvider, LlmProviderChoice, LocalLlmConfig, ApiLlmConfig, KeywordLlm, MockLlm, ProviderStatus};
|
||||
pub use repo::{detect_blue, BlueHome, RepoError, WorktreeInfo};
|
||||
pub use state::{ItemType, ProjectState, StateError, StatusSummary, WorkItem};
|
||||
|
|
|
|||
Loading…
Reference in a new issue