diff --git a/Cargo.lock b/Cargo.lock index cde8a34..0328383 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -498,6 +498,7 @@ dependencies = [ "tree-sitter", "tree-sitter-cpp", "tree-sitter-java", + "tree-sitter-python", "tree-sitter-rust-orchard", ] @@ -931,6 +932,16 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8" +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bf85fd39652e740bf60f46f4cda9492c3a9ad75880575bf14960f775cb74a1c" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-rust-orchard" version = "0.12.0" diff --git a/Cargo.toml b/Cargo.toml index 7322382..55b8e89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ tree-sitter = "0.25.3" tree-sitter-cpp = "0.23.4" tree-sitter-rust-orchard = "0.12.0" tree-sitter-java = "0.23.5" +tree-sitter-python = "0.25.0" rayon = "1.11.0" miette = { version = "7.6.0", features = ["fancy"] } diff --git a/docs/Tasks.md b/docs/Tasks.md index 5d8aa44..d33f739 100644 --- a/docs/Tasks.md +++ b/docs/Tasks.md @@ -1,19 +1,23 @@ # Tasks -1. Handle running with no log format. -1. Extract a thread id from log when available and associate with source ref. -1. Generate call stack from exceptions. -1. Support multiple source roots from CLI. -1. Serialize state for re-use on subsequent executions +- [ ] Handle running CLI with no log format. + - TSS: Doesn't this work already? I echo + the body of the log message into log2src + and it can find the message. +- [ ] Extract a thread id from log when available and associate with source ref. +- [ ] Generate call stack from exceptions. +- [ ] Support multiple source roots from CLI. +- [ ] Serialize state for re-use on subsequent executions ## Extension -1. Work with non .log extension (.json, etc). -1. Basic test coverage -1. Support src -> log breakpoints +- [ ] Work with non .log extension (.json, etc). +- [ ] Basic test coverage +- [ ] Support src -> log breakpoints ## Languages -1. Python -1. Go -1. JavaScript +- [X] Python +- [ ] Go +- [ ] JavaScript +- [ ] Typescript diff --git a/src/lib.rs b/src/lib.rs index a4778c9..5aa5804 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,15 +1,15 @@ use itertools::Itertools; use miette::Diagnostic; use rayon::prelude::*; -use regex::RegexSet; +use regex::{Captures, Regex, RegexSet}; use serde::Serialize; use std::collections::HashMap; use std::ffi::OsStr; use std::fs::File; use std::io; -use std::ops::RangeBounds; +use std::ops::{Deref, RangeBounds}; use std::path::{Path, PathBuf}; -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use thiserror::Error; use tree_sitter::Language; @@ -254,6 +254,7 @@ pub enum SourceLanguage { Java, #[serde(rename = "C++")] Cpp, + Python, } impl From for Language { @@ -262,6 +263,7 @@ impl From for Language { SourceLanguage::Rust => tree_sitter_rust_orchard::LANGUAGE.into(), SourceLanguage::Java => tree_sitter_java::LANGUAGE.into(), SourceLanguage::Cpp => tree_sitter_cpp::LANGUAGE.into(), + SourceLanguage::Python => tree_sitter_python::LANGUAGE.into(), } } } @@ -270,12 +272,30 @@ const IDENTS_RS: &[&str] = &["debug", "info", "warn"]; const IDENTS_JAVA: &[&str] = &["logger", "log", "fine", "debug", "info", "warn", "trace"]; const IDENTS_CPP: &[&str] = &["debug", "info", "warn", "trace"]; +const IDENTS_PYTHON: &[&str] = &["debug", "info", "warn", "trace"]; + +static RUST_PLACEHOLDER_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r#"\{(?:([a-zA-Z_][a-zA-Z0-9_.]*)|(\d+))?\s*(?::[^}]*)?}"#).unwrap() +}); + +static JAVA_PLACEHOLDER_REGEX: LazyLock = + LazyLock::new(|| Regex::new(r#"\{.*}|\\\{(.*)}"#).unwrap()); + +static CPP_PLACEHOLDER_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r#"%[-+ #0]*\d*(?:\.\d+)?[hlLzjt]*[diuoxXfFeEgGaAcspn%]|\{(?:([a-zA-Z_][a-zA-Z0-9_.]*)|(\d+))?\s*(?::[^}]*)?}"#).unwrap() +}); + +static PYTHON_PLACEHOLDER_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r#"%[-+ #0]*\d*(?:\.\d+)?[hlLzjt]*[diuoxXfFeEgGaAcspn%]"#).unwrap() +}); + impl SourceLanguage { pub fn as_str(&self) -> &'static str { match self { SourceLanguage::Rust => "Rust", SourceLanguage::Java => "Java", SourceLanguage::Cpp => "C++", + SourceLanguage::Python => "Python", } } @@ -284,6 +304,7 @@ impl SourceLanguage { Some("rs") => Some(Self::Rust), Some("java") => Some(Self::Java), Some("h" | "hh" | "hpp" | "hxx" | "tpp" | "cc" | "cpp" | "cxx") => Some(Self::Cpp), + Some("py") => Some(Self::Python), None | Some(_) => None, } } @@ -339,6 +360,20 @@ impl SourceLanguage { ) "# } + SourceLanguage::Python => { + r#" + ( + (expression_statement + (call + function: (_) @func + arguments: (argument_list . + (string) @args + ) + ) + ) + ) + "# + } } } @@ -347,7 +382,34 @@ impl SourceLanguage { SourceLanguage::Rust => IDENTS_RS, SourceLanguage::Java => IDENTS_JAVA, SourceLanguage::Cpp => IDENTS_CPP, + SourceLanguage::Python => IDENTS_PYTHON, + } + } + + fn get_placeholder_regex(&self) -> &'static Regex { + match self { + SourceLanguage::Rust => RUST_PLACEHOLDER_REGEX.deref(), + SourceLanguage::Java => JAVA_PLACEHOLDER_REGEX.deref(), + SourceLanguage::Cpp => CPP_PLACEHOLDER_REGEX.deref(), + SourceLanguage::Python => PYTHON_PLACEHOLDER_REGEX.deref(), + } + } + + fn captures_to_format_arg(&self, caps: &Captures) -> FormatArgument { + for (index, cap) in caps.iter().skip(1).enumerate() { + if let Some(cap) = cap { + return match (self, index) { + (SourceLanguage::Rust | SourceLanguage::Java | SourceLanguage::Cpp, 0) => { + FormatArgument::Named(cap.as_str().to_string()) + } + (SourceLanguage::Rust | SourceLanguage::Cpp, 1) => { + FormatArgument::Positional(cap.as_str().parse().unwrap()) + } + _ => unreachable!(), + }; + } } + FormatArgument::Placeholder } } @@ -513,7 +575,7 @@ pub fn extract_logging_guarded(sources: &[CodeSource], guard: &WorkGuard) -> Vec for result in results { // println!("node.kind()={:?} range={:?}", result.kind, result.range); match result.kind.as_str() { - "string_literal" => { + "string_literal" | "string" => { if let Some(src_ref) = SourceRef::new(code, result) { patterns.push(src_ref.pattern.clone()); matched.push(src_ref); @@ -852,4 +914,32 @@ fn main() { },] ); } + + const PYTHON_SOURCE: &str = r#" +def main(args): + logger.info("foo %s \N{greek small letter pi}", test_var) + logging.info(f'Hello, {args[1]}!') + logger.warning(f"warning message:\nlow disk space") + logger.info(rf"""info message: +processing started -- {args[0]}""") +"#; + + #[test] + fn test_basic_python() { + let log_ref = LogRef::new("foo bar π"); + let code = CodeSource::from_string(&Path::new("in-mem.py"), PYTHON_SOURCE); + let src_refs = extract_logging(&[code], &ProgressTracker::new()) + .pop() + .unwrap() + .log_statements; + assert_yaml_snapshot!(src_refs); + let vars = extract_variables(&log_ref, &src_refs[0]); + assert_eq!( + vars, + vec![VariablePair { + expr: "test_var".to_string(), + value: "bar".to_string() + },] + ); + } } diff --git a/src/snapshots/log2src__tests__basic_python.snap b/src/snapshots/log2src__tests__basic_python.snap new file mode 100644 index 0000000..bfe8ba0 --- /dev/null +++ b/src/snapshots/log2src__tests__basic_python.snap @@ -0,0 +1,52 @@ +--- +source: src/lib.rs +expression: src_refs +--- +- sourcePath: in-mem.py + language: Python + lineNumber: 3 + endLineNumber: 3 + column: 16 + name: main + text: "\"foo %s \\N{greek small letter pi}\"" + quality: 5 + pattern: "(?s)^foo (.+) \\w$" + args: + - Placeholder + vars: + - test_var +- sourcePath: in-mem.py + language: Python + lineNumber: 4 + endLineNumber: 4 + column: 17 + name: main + text: "f'Hello, {args[1]}!'" + quality: 7 + pattern: "(?s)^Hello, (.+)!$" + args: + - Named: "args[1]" + vars: [] +- sourcePath: in-mem.py + language: Python + lineNumber: 5 + endLineNumber: 5 + column: 19 + name: main + text: "f\"warning message:\\nlow disk space\"" + quality: 29 + pattern: "(?s)^warning message:\\nlow disk space$" + args: [] + vars: [] +- sourcePath: in-mem.py + language: Python + lineNumber: 6 + endLineNumber: 7 + column: 16 + name: main + text: "rf\"\"\"info message:\nprocessing started -- {args[0]}\"\"\"" + quality: 33 + pattern: "(?s)^info message:\\nprocessing started -- (.+)$" + args: + - Named: "args[0]" + vars: [] diff --git a/src/source_query.rs b/src/source_query.rs index 613c42c..3a62315 100644 --- a/src/source_query.rs +++ b/src/source_query.rs @@ -3,6 +3,7 @@ use tree_sitter::{ Language, Node, Parser, Point, Query, QueryCursor, Range as TSRange, StreamingIterator, Tree, }; +use crate::source_ref::FormatArgument; use crate::CodeSource; pub struct SourceQuery<'a> { @@ -15,6 +16,9 @@ pub(crate) struct QueryResult { pub kind: String, pub range: TSRange, pub name_range: Range, + pub pattern: Option, + pub args: Vec, + pub raw: bool, } impl<'a> SourceQuery<'a> { @@ -44,21 +48,60 @@ impl<'a> SourceQuery<'a> { let mut got_string_literal = false; for capture in m.captures { let mut child = capture.node; - if child.kind() == "string_literal" { - // only return results after the format string literal, other captures - // are not relevant. - got_string_literal = true; - } else if !got_string_literal { - continue; + match child.kind() { + "string_literal" | "string" => { + // only return results after the format string literal, other captures + // are not relevant. + got_string_literal = true; + } + _ => { + if !got_string_literal { + continue; + } + } } let mut arg_start: Option<(usize, Point)> = None; if filter_idx.is_none() || filter_idx.is_some_and(|f| f == capture.index) { + let qr_index = results.len(); results.push(QueryResult { kind: capture.node.kind().to_string(), range: capture.node.range(), name_range: Self::find_fn_range(child), + pattern: None, + args: vec![], + raw: false, }); + let mut pattern = String::new(); + if child.kind() == "string" { + // The Python tree-sitter outputs string nodes that contain details about + // the string, like interpolation expressions. + let mut child_cursor = child.walk(); + for string_child in child.children(&mut child_cursor) { + let range = string_child.start_byte()..string_child.end_byte(); + match string_child.kind() { + "string_start" => { + // Check for a python raw string literal. + if self.source[range].contains("r") { + results[qr_index].raw = true; + } + } + "string_content" => pattern.push_str(self.source[range].as_ref()), + "interpolation" => { + // Swap in a Python placeholder for the interpolation + // expression. + pattern.push_str("%s"); + let expr = + string_child.child_by_field_name("expression").unwrap(); + results[qr_index].args.push(FormatArgument::Named( + self.source[expr.start_byte()..expr.end_byte()].to_string(), + )) + } + _ => {} + } + } + results[qr_index].pattern = Some(pattern); + } while let Some(next_child) = child.next_sibling() { if matches!(next_child.kind(), "," | ")") { if let Some(start) = arg_start { @@ -72,6 +115,9 @@ impl<'a> SourceQuery<'a> { end_point: next_child.start_position(), }, name_range: Self::find_fn_range(child), + pattern: None, + args: vec![], + raw: false, }); } } @@ -94,7 +140,13 @@ impl<'a> SourceQuery<'a> { range.start_byte..range.end_byte } "function_definition" => { - let range = node.child_by_field_name("declarator").unwrap().range(); + let range = if let Some(decl) = node.child_by_field_name("declarator") { + decl.range() + } else if let Some(name) = node.child_by_field_name("name") { + name.range() + } else { + unreachable!(); + }; range.start_byte..range.end_byte } "method_declaration" => { diff --git a/src/source_ref.rs b/src/source_ref.rs index 871d519..1e75b9a 100644 --- a/src/source_ref.rs +++ b/src/source_ref.rs @@ -2,7 +2,6 @@ use crate::{CodeSource, QueryResult, SourceLanguage}; use core::fmt; use regex::{Captures, Regex}; use serde::Serialize; -use std::ops::Deref; use std::sync::LazyLock; #[derive(Clone, Debug, Serialize, Eq, PartialEq)] @@ -53,16 +52,22 @@ impl SourceRef { if start == range.end_byte { end = range.end_byte; } - let unquoted = &source[start..end].to_string(); - // println!("{} line {}", code.filename, line); + let unquoted = if let Some(pat) = result.pattern { + pat + } else { + source[start..end].to_string() + }; if let Some(MessageMatcher { matcher, pattern, - args, + mut args, quality, - }) = build_matcher(unquoted, code.info.language) + }) = build_matcher(result.raw, &unquoted, code.info.language) { let name = source[result.name_range].to_string(); + if !result.args.is_empty() { + args = result.args; + } Some(SourceRef { source_path: code.filename.clone(), language: code.info.language, @@ -107,45 +112,21 @@ impl PartialEq for SourceRef { } } -static RUST_PLACEHOLDER_REGEX: LazyLock = LazyLock::new(|| { - Regex::new(r#"\{(?:([a-zA-Z_][a-zA-Z0-9_.]*)|(\d+))?\s*(?::[^}]*)?}"#).unwrap() -}); - -static JAVA_PLACEHOLDER_REGEX: LazyLock = - LazyLock::new(|| Regex::new(r#"\{.*}|\\\{(.*)}"#).unwrap()); - -static CPP_PLACEHOLDER_REGEX: LazyLock = LazyLock::new(|| { - Regex::new(r#"%[-+ #0]*\d*(?:\.\d+)?[hlLzjt]*[diuoxXfFeEgGaAcspn%]|\{(?:([a-zA-Z_][a-zA-Z0-9_.]*)|(\d+))?\s*(?::[^}]*)?}"#).unwrap() -}); - -fn placeholder_regex_for(language: SourceLanguage) -> &'static Regex { - match language { - SourceLanguage::Rust => RUST_PLACEHOLDER_REGEX.deref(), - SourceLanguage::Java => JAVA_PLACEHOLDER_REGEX.deref(), - SourceLanguage::Cpp => CPP_PLACEHOLDER_REGEX.deref(), - } -} - -fn build_matcher(text: &str, language: SourceLanguage) -> Option { +fn build_matcher(raw: bool, text: &str, language: SourceLanguage) -> Option { let mut args = Vec::new(); let mut last_end = 0; let mut pattern = "(?s)^".to_string(); let mut quality = 0; - for cap in placeholder_regex_for(language).captures_iter(text) { + for cap in language.get_placeholder_regex().captures_iter(text) { let placeholder = cap.get(0).unwrap(); - let text = escape_ignore_newlines(&text[last_end..placeholder.start()]); + let text = escape_ignore_newlines(raw, &text[last_end..placeholder.start()]); quality += text.chars().filter(|c| !c.is_whitespace()).count(); pattern.push_str(text.as_str()); last_end = placeholder.end(); pattern.push_str("(.+)"); - args.push(match (cap.get(1), cap.get(2)) { - (Some(expr), None) => FormatArgument::Named(expr.as_str().to_string()), - (None, Some(pos)) => FormatArgument::Positional(pos.as_str().parse().unwrap_or(0)), - (Some(_), Some(_)) => unreachable!(), - (None, None) => FormatArgument::Placeholder, - }); + args.push(language.captures_to_format_arg(&cap)); } - let text = escape_ignore_newlines(&text[last_end..]); + let text = escape_ignore_newlines(raw, &text[last_end..]); quality += text.chars().filter(|c| !c.is_whitespace()).count(); if quality == 0 { None @@ -161,21 +142,65 @@ fn build_matcher(text: &str, language: SourceLanguage) -> Option } } +static ESCAPE_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r#"([.*+?^${}()|\[\]])|([\n\r\t])|(\\[0-7]{3}|\\0)|(\\N\{[^}]+})"#).unwrap() +}); + +// A regex for raw strings that doesn't try to interpret escape sequences. +static RAW_ESCAPE_REGEX: LazyLock = LazyLock::new(|| { + Regex::new(r#"([.*+?^${}()|\[\]])|([\n\r\t])|(\\)"#).unwrap() +}); + /// Escape special chars except newlines and carriage returns in order to support multiline strings -fn escape_ignore_newlines(segment: &str) -> String { +fn escape_ignore_newlines(raw: bool, segment: &str) -> String { + const HEX_CHARS: [char; 16] = [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + ]; + let mut result = String::with_capacity(segment.len() * 2); - for c in segment.chars() { - match c { - '\n' => result.push_str(r"\n"), // Use actual newline in regex - '\r' => result.push_str(r"\r"), // Handle carriage returns too - // Escape regex special chars - '.' | '+' | '*' | '?' | '^' | '$' | '(' | ')' | '[' | ']' | '{' | '}' | '|' => { + let mut last_end = 0; + let regex = if raw { + &RAW_ESCAPE_REGEX + } else { + &ESCAPE_REGEX + }; + for cap in regex.captures_iter(segment) { + let overall_range = cap.get(0).unwrap().range(); + result.push_str(segment[last_end..overall_range.start].as_ref()); + last_end = overall_range.end; + if let Some(c) = cap.get(1) { + result.push('\\'); + result.push_str(c.as_str()); + } else if let Some(c) = cap.get(2) { + match c.as_str() { + "\n" => result.push_str("\\n"), + "\r" => result.push_str("\\r"), + "\t" => result.push_str("\\t"), + _ => unreachable!(), + } + } else if let Some(c) = cap.get(3) { + if raw { + result.push('\\'); + result.push_str(c.as_str()); + } else { + let c = c.as_str(); + let c = &c[1..]; + let c = u8::from_str_radix(c, 8).unwrap(); result.push('\\'); - result.push(c); + result.push('x'); + result.push(HEX_CHARS[(c >> 4) as usize]); + result.push(HEX_CHARS[(c & 0xf) as usize]); } - _ => result.push(c), + } else if let Some(_c) = cap.get(4) { + // XXX This is the fancy Python "\N{...}" escape sequence. Ideally, we'd interpret the + // name of the escape, but that seems like a lot of work. So, we'll just match any + // character. + result.push_str("\\w"); + } else { + unreachable!(); } } + result.push_str(segment[last_end..].as_ref()); result } @@ -190,9 +215,11 @@ mod tests { pattern: _pat, args: _args, .. - } = build_matcher("{}) {}, {}", SourceLanguage::Rust).unwrap(); + } = build_matcher(false, "{}) {}, {} \\033", SourceLanguage::Rust).unwrap(); assert_eq!( - Regex::new(r#"(?s)^(.+)\) (.+), (.+)$"#).unwrap().as_str(), + Regex::new(r#"(?s)^(.+)\) (.+), (.+) \x1B$"#) + .unwrap() + .as_str(), matcher.as_str() ); } @@ -200,7 +227,7 @@ mod tests { #[test] fn test_build_matcher_named() { let MessageMatcher { matcher, .. } = - build_matcher("abc {main_path:?} def", SourceLanguage::Rust).unwrap(); + build_matcher(false, "abc {main_path:?} def", SourceLanguage::Rust).unwrap(); assert_eq!( Regex::new(r#"(?s)^abc (.+) def$"#).unwrap().as_str(), matcher.as_str() @@ -210,7 +237,7 @@ mod tests { #[test] fn test_build_matcher_mix() { let MessageMatcher { matcher, args, .. } = - build_matcher("{}) {:?}, {foo.bar}", SourceLanguage::Rust).unwrap(); + build_matcher(false, "{}) {:?}, {foo.bar}", SourceLanguage::Rust).unwrap(); assert_eq!( Regex::new(r#"(?s)^(.+)\) (.+), (.+)$"#).unwrap().as_str(), matcher.as_str() @@ -221,7 +248,7 @@ mod tests { #[test] fn test_build_matcher_positional() { let MessageMatcher { matcher, args, .. } = - build_matcher("second={2}", SourceLanguage::Rust).unwrap(); + build_matcher(false, "second={2}", SourceLanguage::Rust).unwrap(); assert_eq!( Regex::new(r#"(?s)^second=(.+)$"#).unwrap().as_str(), matcher.as_str() @@ -232,7 +259,7 @@ mod tests { #[test] fn test_build_matcher_cpp() { let MessageMatcher { matcher, args, .. } = - build_matcher("they are %d years old", SourceLanguage::Cpp).unwrap(); + build_matcher(false, "they are %d years old", SourceLanguage::Cpp).unwrap(); assert_eq!( Regex::new(r#"(?s)^they are (.+) years old$"#) .unwrap() @@ -245,7 +272,7 @@ mod tests { #[test] fn test_build_matcher_cpp_spdlog() { let MessageMatcher { matcher, args, .. } = - build_matcher("they are {0:d} years old", SourceLanguage::Cpp).unwrap(); + build_matcher(false, "they are {0:d} years old", SourceLanguage::Cpp).unwrap(); assert_eq!( Regex::new(r#"(?s)^they are (.+) years old$"#) .unwrap() @@ -257,13 +284,14 @@ mod tests { #[test] fn test_build_matcher_none() { - let build_res = build_matcher("%s", SourceLanguage::Cpp); + let build_res = build_matcher(false, "%s", SourceLanguage::Cpp); assert!(build_res.is_none()); } #[test] fn test_build_matcher_multiline() { let MessageMatcher { matcher, .. } = build_matcher( + false, "you're only as funky\n as your last cut", SourceLanguage::Rust, )