The stop_sequences contained "```" which caused the model to stop immediately after outputting "```yaml", truncating the entire response. Also wrap blocking indexer operations in spawn_blocking to avoid runtime conflicts with reqwest::blocking::Client. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
454 lines
14 KiB
Rust
454 lines
14 KiB
Rust
//! Semantic file indexer (RFC 0010)
|
|
//!
|
|
//! Uses Ollama with qwen2.5:3b to analyze source files and extract:
|
|
//! - Summary: one-sentence description
|
|
//! - Relationships: dependencies and connections to other files
|
|
//! - Symbols: functions, structs, classes with line numbers
|
|
|
|
use std::collections::hash_map::DefaultHasher;
|
|
use std::hash::{Hash, Hasher};
|
|
use std::path::Path;
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
use tracing::{debug, info, warn};
|
|
|
|
use crate::store::{DocumentStore, FileIndexEntry, SymbolIndexEntry};
|
|
use crate::{CompletionOptions, LlmError, LlmProvider};
|
|
|
|
/// Model name placed in [`IndexerConfig::model`] by the default configuration.
pub const DEFAULT_INDEX_MODEL: &str = "qwen2.5:3b";

/// Line-count threshold for partial indexing.
///
/// A file with more lines than this is marked partial and only its first
/// `MAX_FILE_LINES` lines are included in the prompt.
pub const MAX_FILE_LINES: usize = 1000;
|
|
|
|
/// Settings carried by an indexer: which model to name, how stored entries
/// are labelled, and the completion parameters sent with each request.
#[derive(Debug, Clone)]
pub struct IndexerConfig {
    /// Model name. The default configuration uses `DEFAULT_INDEX_MODEL`.
    pub model: String,
    /// Realm recorded on file index entries written to the store.
    pub realm: String,
    /// Repository recorded on file index entries written to the store.
    pub repo: String,
    /// Token limit forwarded in the completion options.
    pub max_tokens: usize,
    /// Sampling temperature forwarded in the completion options.
    pub temperature: f32,
}
|
|
|
|
impl Default for IndexerConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
model: DEFAULT_INDEX_MODEL.to_string(),
|
|
realm: "default".to_string(),
|
|
repo: "default".to_string(),
|
|
max_tokens: 2048,
|
|
temperature: 0.1, // Low temperature for consistent structured output
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Result of indexing a file
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct IndexResult {
|
|
pub file_path: String,
|
|
pub file_hash: String,
|
|
pub summary: Option<String>,
|
|
pub relationships: Option<String>,
|
|
pub symbols: Vec<ParsedSymbol>,
|
|
pub is_partial: bool,
|
|
pub error: Option<String>,
|
|
}
|
|
|
|
/// A parsed symbol from AI output
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct ParsedSymbol {
|
|
pub name: String,
|
|
pub kind: String,
|
|
pub start_line: Option<i32>,
|
|
pub end_line: Option<i32>,
|
|
pub description: Option<String>,
|
|
}
|
|
|
|
/// The indexer that uses LLM to analyze files
|
|
pub struct Indexer<P: LlmProvider> {
|
|
provider: P,
|
|
config: IndexerConfig,
|
|
}
|
|
|
|
impl<P: LlmProvider> Indexer<P> {
|
|
/// Create a new indexer with the given LLM provider
|
|
pub fn new(provider: P, config: IndexerConfig) -> Self {
|
|
Self { provider, config }
|
|
}
|
|
|
|
/// Index a single file and return the result
|
|
pub fn index_file(&self, file_path: &Path) -> Result<IndexResult, IndexerError> {
|
|
let path_str = file_path.to_string_lossy().to_string();
|
|
|
|
// Read file contents
|
|
let content = std::fs::read_to_string(file_path)
|
|
.map_err(|e| IndexerError::FileRead(path_str.clone(), e.to_string()))?;
|
|
|
|
// Calculate hash
|
|
let file_hash = hash_content(&content);
|
|
|
|
// Check file size
|
|
let line_count = content.lines().count();
|
|
let is_partial = line_count > MAX_FILE_LINES;
|
|
|
|
let content_to_index = if is_partial {
|
|
// Take first MAX_FILE_LINES lines
|
|
content.lines().take(MAX_FILE_LINES).collect::<Vec<_>>().join("\n")
|
|
} else {
|
|
content.clone()
|
|
};
|
|
|
|
// Generate prompt
|
|
let prompt = generate_index_prompt(&path_str, &content_to_index, is_partial);
|
|
|
|
// Call LLM
|
|
let options = CompletionOptions {
|
|
max_tokens: self.config.max_tokens,
|
|
temperature: self.config.temperature,
|
|
stop_sequences: vec![], // Let model complete naturally
|
|
};
|
|
|
|
let completion = self.provider.complete(&prompt, &options)
|
|
.map_err(|e| IndexerError::LlmError(e))?;
|
|
|
|
// Parse YAML response
|
|
let parsed = parse_index_response(&completion.text);
|
|
|
|
Ok(IndexResult {
|
|
file_path: path_str,
|
|
file_hash,
|
|
summary: parsed.summary,
|
|
relationships: parsed.relationships,
|
|
symbols: parsed.symbols,
|
|
is_partial,
|
|
error: parsed.error,
|
|
})
|
|
}
|
|
|
|
/// Index a file and store in the database
|
|
pub fn index_and_store(
|
|
&self,
|
|
file_path: &Path,
|
|
store: &DocumentStore,
|
|
) -> Result<IndexResult, IndexerError> {
|
|
let result = self.index_file(file_path)?;
|
|
|
|
// Create file index entry
|
|
let mut entry = FileIndexEntry::new(
|
|
&self.config.realm,
|
|
&self.config.repo,
|
|
&result.file_path,
|
|
&result.file_hash,
|
|
);
|
|
entry.summary = result.summary.clone();
|
|
entry.relationships = result.relationships.clone();
|
|
|
|
// Store in database
|
|
let file_id = store.upsert_file_index(&entry)
|
|
.map_err(|e| IndexerError::StoreError(e.to_string()))?;
|
|
|
|
// Convert and store symbols
|
|
let symbols: Vec<SymbolIndexEntry> = result.symbols.iter().map(|s| {
|
|
SymbolIndexEntry {
|
|
id: None,
|
|
file_id,
|
|
name: s.name.clone(),
|
|
kind: s.kind.clone(),
|
|
start_line: s.start_line,
|
|
end_line: s.end_line,
|
|
description: s.description.clone(),
|
|
}
|
|
}).collect();
|
|
|
|
store.set_file_symbols(file_id, &symbols)
|
|
.map_err(|e| IndexerError::StoreError(e.to_string()))?;
|
|
|
|
info!("Indexed {} with {} symbols", result.file_path, symbols.len());
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Check if a file needs re-indexing
|
|
pub fn needs_indexing(&self, file_path: &Path, store: &DocumentStore) -> Result<bool, IndexerError> {
|
|
let path_str = file_path.to_string_lossy().to_string();
|
|
|
|
// Read file and calculate hash
|
|
let content = std::fs::read_to_string(file_path)
|
|
.map_err(|e| IndexerError::FileRead(path_str.clone(), e.to_string()))?;
|
|
let current_hash = hash_content(&content);
|
|
|
|
// Check against stored hash
|
|
store.is_file_stale(&self.config.realm, &self.config.repo, &path_str, ¤t_hash)
|
|
.map_err(|e| IndexerError::StoreError(e.to_string()))
|
|
}
|
|
}
|
|
|
|
/// Generate the indexing prompt
|
|
fn generate_index_prompt(file_path: &str, content: &str, is_partial: bool) -> String {
|
|
let partial_note = if is_partial {
|
|
"\n\nNote: This is a large file. Only the first 1000 lines are shown. Include a note about this in the summary."
|
|
} else {
|
|
""
|
|
};
|
|
|
|
format!(
|
|
r#"Analyze this source file and provide structured information about it.
|
|
|
|
File: {file_path}{partial_note}
|
|
|
|
```
|
|
{content}
|
|
```
|
|
|
|
Provide your analysis as YAML with this exact structure:
|
|
|
|
```yaml
|
|
summary: "One sentence describing what this file does"
|
|
|
|
relationships: |
|
|
Describe how this file relates to other files.
|
|
List imports, dependencies, and what uses this file.
|
|
Be specific about file names when visible.
|
|
|
|
symbols:
|
|
- name: "SymbolName"
|
|
kind: "function|struct|class|enum|const|trait|interface|type|method"
|
|
start_line: 10
|
|
end_line: 25
|
|
description: "What this symbol does"
|
|
```
|
|
|
|
Rules:
|
|
- Summary must be ONE sentence
|
|
- Relationships should mention specific file names when imports are visible
|
|
- Only include significant symbols (skip trivial helpers, private internals)
|
|
- Line numbers must be accurate
|
|
- Kind must be one of: function, struct, class, enum, const, trait, interface, type, method
|
|
- Output valid YAML only"#
|
|
)
|
|
}
|
|
|
|
/// Parsed response from the LLM
|
|
#[derive(Debug, Default)]
|
|
struct ParsedResponse {
|
|
summary: Option<String>,
|
|
relationships: Option<String>,
|
|
symbols: Vec<ParsedSymbol>,
|
|
error: Option<String>,
|
|
}
|
|
|
|
/// Parse the YAML response from the LLM
|
|
fn parse_index_response(response: &str) -> ParsedResponse {
|
|
// Try to find YAML block
|
|
let yaml_content = if let Some(start) = response.find("```yaml") {
|
|
let after_marker = &response[start + 7..];
|
|
if let Some(end) = after_marker.find("```") {
|
|
after_marker[..end].trim()
|
|
} else {
|
|
after_marker.trim()
|
|
}
|
|
} else if let Some(start) = response.find("summary:") {
|
|
// No code fence, but starts with summary
|
|
response[start..].trim()
|
|
} else {
|
|
response.trim()
|
|
};
|
|
|
|
// Parse YAML
|
|
match serde_yaml::from_str::<serde_yaml::Value>(yaml_content) {
|
|
Ok(value) => {
|
|
let summary = value.get("summary")
|
|
.and_then(|v| v.as_str())
|
|
.map(|s| s.to_string());
|
|
|
|
let relationships = value.get("relationships")
|
|
.and_then(|v| v.as_str())
|
|
.map(|s| s.trim().to_string());
|
|
|
|
let symbols = value.get("symbols")
|
|
.and_then(|v| v.as_sequence())
|
|
.map(|seq| {
|
|
seq.iter().filter_map(|item| {
|
|
let name = item.get("name")?.as_str()?.to_string();
|
|
let kind = item.get("kind")?.as_str()?.to_string();
|
|
|
|
Some(ParsedSymbol {
|
|
name,
|
|
kind,
|
|
start_line: item.get("start_line")
|
|
.and_then(|v| v.as_i64())
|
|
.map(|n| n as i32),
|
|
end_line: item.get("end_line")
|
|
.and_then(|v| v.as_i64())
|
|
.map(|n| n as i32),
|
|
description: item.get("description")
|
|
.and_then(|v| v.as_str())
|
|
.map(|s| s.to_string()),
|
|
})
|
|
}).collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
ParsedResponse {
|
|
summary,
|
|
relationships,
|
|
symbols,
|
|
error: None,
|
|
}
|
|
}
|
|
Err(e) => {
|
|
warn!("Failed to parse YAML response: {}", e);
|
|
debug!("Response was: {}", yaml_content);
|
|
|
|
ParsedResponse {
|
|
summary: None,
|
|
relationships: None,
|
|
symbols: vec![],
|
|
error: Some(format!("YAML parse error: {}", e)),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Hash file content into a lowercase hexadecimal string.
///
/// Feeds `content` to the standard library's `DefaultHasher` and formats the
/// 64-bit result with `{:x}` (no zero padding).
///
/// NOTE(review): the standard library does not specify `DefaultHasher`'s
/// algorithm across Rust releases, so stored hashes may stop matching after
/// a toolchain upgrade. Confirm that is acceptable for staleness checks.
fn hash_content(content: &str) -> String {
    let mut state = DefaultHasher::new();
    Hash::hash(content, &mut state);
    let digest = state.finish();
    format!("{digest:x}")
}
|
|
|
|
/// Indexer errors
|
|
#[derive(Debug, thiserror::Error)]
|
|
pub enum IndexerError {
|
|
#[error("Failed to read file '{0}': {1}")]
|
|
FileRead(String, String),
|
|
|
|
#[error("LLM error: {0}")]
|
|
LlmError(#[from] LlmError),
|
|
|
|
#[error("Store error: {0}")]
|
|
StoreError(String),
|
|
|
|
#[error("Index error: {0}")]
|
|
Other(String),
|
|
}
|
|
|
|
/// Report whether `path` has one of the file extensions we index.
///
/// The comparison is case-sensitive. Paths without an extension, or whose
/// extension is not valid UTF-8, are not indexable.
pub fn is_indexable_file(path: &Path) -> bool {
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => matches!(
            ext,
            "rs" | "py" | "js" | "ts" | "tsx" | "jsx" | "go" | "java" | "c" | "cpp" | "h" | "hpp"
                | "rb" | "php" | "swift" | "kt" | "scala" | "clj" | "ex" | "exs" | "erl" | "hs"
                | "ml" | "mli" | "sql" | "sh" | "bash" | "zsh" | "yaml" | "yml" | "toml" | "json"
        ),
        None => false,
    }
}
|
|
|
|
/// Report whether a directory with this name is skipped when indexing.
///
/// A directory is skipped when its name is on the fixed list below or
/// starts with a dot.
pub fn should_skip_dir(name: &str) -> bool {
    let is_listed = matches!(
        name,
        "node_modules"
            | "target"
            | ".git"
            | "__pycache__"
            | "venv"
            | ".venv"
            | "dist"
            | "build"
            | ".next"
            | ".nuxt"
            | "vendor"
            | ".cargo"
            | ".blue"
    );
    let is_hidden = name.starts_with('.');

    is_listed || is_hidden
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Equal content hashes equally; different content hashes differently.
    #[test]
    fn test_hash_content() {
        let hash1 = hash_content("hello");
        let hash2 = hash_content("hello");
        let hash3 = hash_content("world");

        assert_eq!(hash1, hash2);
        assert_ne!(hash1, hash3);
    }

    /// Source extensions are accepted; documentation and images are not.
    #[test]
    fn test_is_indexable_file() {
        assert!(is_indexable_file(Path::new("foo.rs")));
        assert!(is_indexable_file(Path::new("bar.py")));
        assert!(is_indexable_file(Path::new("baz.ts")));
        assert!(!is_indexable_file(Path::new("readme.md")));
        assert!(!is_indexable_file(Path::new("image.png")));
    }

    /// Listed and hidden directories are skipped; ordinary ones are kept.
    #[test]
    fn test_should_skip_dir() {
        assert!(should_skip_dir("node_modules"));
        assert!(should_skip_dir("target"));
        assert!(should_skip_dir(".git"));
        assert!(should_skip_dir(".hidden"));
        assert!(!should_skip_dir("src"));
        assert!(!should_skip_dir("lib"));
    }

    /// A fenced YAML reply yields the summary, relationships and symbols.
    #[test]
    fn test_parse_index_response_valid() {
        let response = r#"```yaml
summary: "This file handles user authentication"

relationships: |
  Imports auth module from ./auth.rs
  Used by main.rs for login flow

symbols:
  - name: "authenticate"
    kind: "function"
    start_line: 10
    end_line: 25
    description: "Validates user credentials"
```"#;

        let parsed = parse_index_response(response);
        assert_eq!(
            parsed.summary,
            Some("This file handles user authentication".to_string())
        );
        assert!(parsed.relationships.is_some());
        assert_eq!(parsed.symbols.len(), 1);
        assert_eq!(parsed.symbols[0].name, "authenticate");
        assert_eq!(parsed.symbols[0].kind, "function");
        assert_eq!(parsed.symbols[0].start_line, Some(10));
    }

    /// A reply without a code fence is parsed from `summary:` onwards.
    #[test]
    fn test_parse_index_response_no_fence() {
        let response = r#"summary: "Test file"

relationships: |
  No dependencies

symbols: []"#;

        let parsed = parse_index_response(response);
        assert_eq!(parsed.summary, Some("Test file".to_string()));
        assert!(parsed.symbols.is_empty());
    }

    /// Malformed YAML is reported through the `error` field.
    #[test]
    fn test_parse_index_response_invalid() {
        // An unterminated flow sequence is a genuine YAML syntax error.
        // Free-form text such as "not valid yaml { broken }" is a valid
        // plain scalar and would not exercise the parse-error path.
        let response = "summary: [this flow sequence is never closed";
        let parsed = parse_index_response(response);
        assert!(parsed.error.is_some());
    }

    /// The prompt embeds the path and content and omits the large-file note.
    #[test]
    fn test_generate_index_prompt() {
        let prompt = generate_index_prompt("test.rs", "fn main() {}", false);
        assert!(prompt.contains("test.rs"));
        assert!(prompt.contains("fn main()"));
        assert!(!prompt.contains("large file"));
    }

    /// Partial files get the large-file note.
    #[test]
    fn test_generate_index_prompt_partial() {
        let prompt = generate_index_prompt("test.rs", "fn main() {}", true);
        assert!(prompt.contains("large file"));
    }
}
|