Skip to content

Commit cebc73d

Browse files
committed
Changed: Refactor parser into directory module with dedicated preprocessor
- Split monolithic parser.rs into parser/mod.rs and parser/preprocessor.rs
- Moved YAML frontmatter preprocessing logic and tests to dedicated submodule
- Improved module documentation and preprocessor code readability
- Extracted first-pass detection helper for block scalar rewrite
1 parent a65145d commit cebc73d

2 files changed

Lines changed: 280 additions & 256 deletions

File tree

src/llm-coding-tools-agents/src/parser.rs renamed to src/llm-coding-tools-agents/src/parser/mod.rs

Lines changed: 10 additions & 256 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
1-
//! Agent markdown parser for files with YAML frontmatter headers.
1+
//! Agent markdown parser for files with YAML frontmatter.
2+
//!
3+
//! Parses markdown that starts with `---` frontmatter and returns deserialized
4+
//! frontmatter data plus normalized body content (LF line endings, trimmed).
5+
//! YAML frontmatter is preprocessed by the `preprocessor` module before
6+
//! deserialization to handle unquoted colon-containing values safely.
7+
8+
mod preprocessor;
29

310
use crlf_to_lf_inplace::crlf_to_lf_inplace;
11+
use preprocessor::preprocess_frontmatter_yaml;
412
use serde::de::DeserializeOwned;
513
use thiserror::Error;
614

@@ -41,7 +49,7 @@ pub(crate) fn parse_agent<T: DeserializeOwned>(
4149
// Process YAML while we can still borrow content
4250
let yaml = &content[offsets.yaml_start..offsets.yaml_end];
4351
let yaml_preprocessed = preprocess_frontmatter_yaml(yaml);
44-
let data: T = serde_yaml::from_str(yaml_preprocessed.as_str()).map_err(|e| {
52+
let data: T = serde_yaml::from_str(yaml_preprocessed.as_ref()).map_err(|e| {
4553
AgentParseError::InvalidYaml {
4654
message: e.to_string(),
4755
}
@@ -137,265 +145,11 @@ fn extract_body_inplace(content: &mut String, body_start: usize) -> String {
137145
body
138146
}
139147

140-
/// Reports whether `key` looks like a plain identifier-style mapping key.
///
/// A key qualifies when it is non-empty, starts with an ASCII letter or `_`,
/// and every following byte is an ASCII letter, an ASCII digit, `_`, or `-`.
/// Any non-ASCII byte therefore disqualifies the key.
#[inline]
fn is_valid_key(key: &str) -> bool {
    match key.as_bytes() {
        // An empty key can never be valid.
        [] => false,
        [head, tail @ ..] => {
            let head_ok = matches!(*head, b'a'..=b'z' | b'A'..=b'Z' | b'_');
            head_ok
                && tail.iter().all(|&b| {
                    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-')
                })
        }
    }
}
152-
153-
/// Checks if a YAML line contains an unquoted colon in the value that needs
154-
/// block scalar conversion.
155-
///
156-
/// Returns `Some((key, value))` if the line should be converted to block scalar
157-
/// format, `None` if it's already safe for YAML parsing.
158-
///
159-
/// # Returns `None` (no conversion needed) when:
160-
///
161-
/// - Line is empty or a comment (`# ...`)
162-
/// - Line is indented (continuation of a block scalar)
163-
/// - No colon found (not a key-value pair)
164-
/// - Key is not a valid YAML identifier
165-
/// - Value is empty or already a block scalar indicator (`|`, `>`, `|-`, `>-`)
166-
/// - Value is quoted (`"..."` or `'...'`)
167-
/// - Value is a flow sequence (`[...]`) or mapping (`{...}`)
168-
/// - Value doesn't contain a colon (no ambiguity to fix)
169-
#[inline]
170-
fn block_scalar_parts(line: &str) -> Option<(&str, &str)> {
171-
let trimmed = line.trim();
172-
if trimmed.is_empty() || trimmed.starts_with('#') {
173-
return None;
174-
}
175-
176-
let first = *line.as_bytes().first()?;
177-
if first == b' ' || first == b'\t' {
178-
return None;
179-
}
180-
181-
let colon_pos = line.find(':')?;
182-
let key = line[..colon_pos].trim();
183-
if !is_valid_key(key) {
184-
return None;
185-
}
186-
187-
let value = line[colon_pos + 1..].trim();
188-
if value.is_empty() || value == ">" || value == "|" || value == "|-" || value == ">-" {
189-
return None;
190-
}
191-
192-
let first_value = value.as_bytes().first().copied();
193-
if matches!(first_value, Some(b'"') | Some(b'\'')) {
194-
return None;
195-
}
196-
197-
if matches!(first_value, Some(b'{') | Some(b'[')) {
198-
return None;
199-
}
200-
201-
if !value.contains(':') {
202-
return None;
203-
}
204-
205-
Some((key, value))
206-
}
207-
208-
/// Preprocesses YAML frontmatter to handle inline `key: value:with:colons`.
209-
/// The input is the YAML slice only (no `---` delimiters).
210-
///
211-
/// # Problem
212-
///
213-
/// YAML interprets colons as key-value separators. A value like `provider/model:tag`
214-
/// would be misparsed as a nested mapping. This function converts such lines to
215-
/// block scalar format, which treats the entire value as a literal string.
216-
///
217-
/// # Transformations
218-
///
219-
/// **Converted to block scalar** (value contains unquoted colon):
220-
///
221-
/// ```text
222-
/// Input:
223-
/// model: provider/model:tag
224-
/// api_url: http://localhost:8080
225-
///
226-
/// Output:
227-
/// model: |-
228-
/// provider/model:tag
229-
/// api_url: |-
230-
/// http://localhost:8080
231-
/// ```
232-
///
233-
/// **Preserved unchanged** (already safe for YAML parsing):
234-
///
235-
/// ```text
236-
/// Input:
237-
/// # comment: with:colon # Comments are ignored
238-
/// description: No colons here # No colon in value
239-
/// model: "provider/model:tag" # Double-quoted
240-
/// model: 'provider/model:tag' # Single-quoted
241-
/// content: | # Block scalar indicator
242-
/// line:with:colon
243-
/// items: ["a:b", "c:d"] # Flow array syntax
244-
/// config: { "key": "a:b" } # Flow mapping syntax
245-
///
246-
/// Output: (identical to input)
247-
/// ```
248-
///
249-
/// # Notes
250-
///
251-
/// - Uses `|-` (literal block, strip chomp) to avoid trailing newlines in values.
252-
/// - Input is expected to be LF-normalized.
253-
/// - Output uses LF line endings.
254-
/// - This matches OpenCode's `preprocessFrontmatter` behavior.
255-
fn preprocess_frontmatter_yaml(input: &str) -> YamlPreprocessed<'_> {
256-
if input.is_empty() {
257-
return YamlPreprocessed::Borrowed(input);
258-
}
259-
260-
let converted = convert_block_scalars(input);
261-
match converted {
262-
Some(output) => YamlPreprocessed::Owned(output),
263-
None => YamlPreprocessed::Borrowed(input),
264-
}
265-
}
266-
267-
/// Result of frontmatter preprocessing: either the untouched input slice or a
/// freshly built replacement string.
enum YamlPreprocessed<'a> {
    /// The input needed no changes and is passed through by reference.
    Borrowed(&'a str),
    /// The input was rewritten into a new, owned string.
    Owned(String),
}

impl YamlPreprocessed<'_> {
    /// Returns the YAML text as a string slice, whichever variant holds it.
    #[inline]
    fn as_str(&self) -> &str {
        match *self {
            Self::Borrowed(text) => text,
            Self::Owned(ref text) => text,
        }
    }
}
281-
282-
/// Converts lines with unquoted colons in values to block scalar format.
283-
/// Returns `None` when no conversion is needed.
284-
fn convert_block_scalars(input: &str) -> Option<String> {
285-
let input_len = input.len();
286-
let mut output: Option<String> = None;
287-
let mut need_newline = false;
288-
let mut offset = 0usize;
289-
290-
for line in input.split_terminator('\n') {
291-
if let Some(out) = output.as_mut() {
292-
if need_newline {
293-
out.push('\n');
294-
}
295-
if let Some((key, value)) = block_scalar_parts(line) {
296-
out.push_str(key);
297-
out.push_str(": |-\n ");
298-
out.push_str(value);
299-
} else {
300-
out.push_str(line);
301-
}
302-
need_newline = true;
303-
} else if let Some((key, value)) = block_scalar_parts(line) {
304-
let mut out = String::with_capacity(input_len + 3);
305-
if offset > 0 {
306-
out.push_str(&input[..offset]);
307-
}
308-
out.push_str(key);
309-
out.push_str(": |-\n ");
310-
out.push_str(value);
311-
output = Some(out);
312-
need_newline = true;
313-
}
314-
315-
offset += line.len();
316-
if offset < input_len {
317-
offset += 1;
318-
}
319-
}
320-
321-
output
322-
}
323-
324148
#[cfg(test)]
325149
mod tests {
326150
use super::*;
327151
use crate::config::RawFrontmatter;
328152

329-
/// A plain value containing a colon is moved onto a `|-` block scalar line.
#[test]
fn preprocess_handles_colons_in_value() {
    let rewritten = preprocess_frontmatter_yaml("model: provider/model:tag");
    let text = rewritten.as_str();

    assert!(text.contains("model: |-"));
    assert!(text.contains(" provider/model:tag"));
}
336-
337-
/// A double-quoted value keeps its original inline form.
#[test]
fn preprocess_preserves_quoted_values() {
    let rewritten = preprocess_frontmatter_yaml("model: \"provider/model:tag\"");
    let text = rewritten.as_str();

    assert!(text.contains("model: \"provider/model:tag\""));
}
343-
344-
/// An entry that is already a block scalar comes back byte-for-byte unchanged.
#[test]
fn preprocess_preserves_block_scalars() {
    let source = "desc: |\n multiline";
    let rewritten = preprocess_frontmatter_yaml(source);

    assert_eq!(source, rewritten.as_str());
}
350-
351-
/// Comment lines are passed through even when they contain colons.
#[test]
fn preprocess_skips_comments() {
    let rewritten = preprocess_frontmatter_yaml("# comment: with:colon\nmode: subagent");
    let text = rewritten.as_str();

    assert!(text.contains("# comment: with:colon"));
}
357-
358-
/// Flow mappings (`{ ... }`) are left in their inline form.
#[test]
fn preprocess_skips_flow_mappings() {
    let rewritten = preprocess_frontmatter_yaml("task: { \"*\": \"deny\" }");
    let text = rewritten.as_str();

    assert!(text.contains("task: { \"*\": \"deny\" }"));
}
364-
365-
/// Flow sequences (`[ ... ]`) are left in their inline form.
#[test]
fn preprocess_skips_flow_arrays() {
    let rewritten = preprocess_frontmatter_yaml("items: [\"a:b\", \"c:d\"]");
    let text = rewritten.as_str();

    assert!(text.contains("items: [\"a:b\", \"c:d\"]"));
}
371-
372-
/// Whitespace between the key and its colon is dropped when the entry is rewritten.
#[test]
fn preprocess_handles_key_with_whitespace_around_colon() {
    let rewritten = preprocess_frontmatter_yaml("model : provider/model:tag");
    let text = rewritten.as_str();

    assert!(text.contains("model: |-"));
    assert!(text.contains(" provider/model:tag"));
}
379-
380-
/// CRLF input, once normalized to LF, has every colon-bearing entry rewritten
/// and carries no carriage returns into the output.
#[test]
fn preprocess_handles_crlf_line_endings() {
    let mut input = "model: provider/model:tag\r\napi_url: http://localhost:8080".to_string();
    // The preprocessor expects LF-only input, so normalize first.
    crlf_to_lf_inplace(&mut input);

    let output = preprocess_frontmatter_yaml(&input);
    let text = output.as_str();

    assert!(text.contains("model: |-"));
    assert!(text.contains(" provider/model:tag"));
    // The entry after the former CRLF boundary must be rewritten as well.
    assert!(text.contains("api_url: |-"));
    assert!(text.contains(" http://localhost:8080"));
    // No carriage return may leak into the rewritten YAML.
    assert!(!text.contains('\r'));
}
388-
389-
/// Indented lines continue the previous value and must never be rewritten,
/// even when they contain colons.
#[test]
fn preprocess_skips_indented_lines() {
    let rewritten = preprocess_frontmatter_yaml("desc: |\n line:with:colons");
    let text = rewritten.as_str();

    // The continuation line survives as written...
    assert!(text.contains(" line:with:colons"));
    // ...and no nested block scalar header was produced for it.
    assert!(!text.contains(" line: |-"));
}
398-
399153
#[test]
400154
fn parse_extracts_frontmatter_and_content() {
401155
let input = "---\nmode: subagent\ndescription: Test agent\n---\n\nPrompt body here.";

0 commit comments

Comments
 (0)