|
1 | | -//! Agent markdown parser for files with YAML frontmatter headers. |
| 1 | +//! Agent markdown parser for files with YAML frontmatter. |
| 2 | +//! |
| 3 | +//! Parses markdown that starts with `---` frontmatter and returns deserialized |
| 4 | +//! frontmatter data plus normalized body content (LF line endings, trimmed). |
| 5 | +//! YAML frontmatter is preprocessed by the `preprocessor` module before |
| 6 | +//! deserialization to handle unquoted colon-containing values safely. |
| 7 | +
|
| 8 | +mod preprocessor; |
2 | 9 |
|
3 | 10 | use crlf_to_lf_inplace::crlf_to_lf_inplace; |
| 11 | +use preprocessor::preprocess_frontmatter_yaml; |
4 | 12 | use serde::de::DeserializeOwned; |
5 | 13 | use thiserror::Error; |
6 | 14 |
|
@@ -41,7 +49,7 @@ pub(crate) fn parse_agent<T: DeserializeOwned>( |
41 | 49 | // Process YAML while we can still borrow content |
42 | 50 | let yaml = &content[offsets.yaml_start..offsets.yaml_end]; |
43 | 51 | let yaml_preprocessed = preprocess_frontmatter_yaml(yaml); |
44 | | - let data: T = serde_yaml::from_str(yaml_preprocessed.as_str()).map_err(|e| { |
| 52 | + let data: T = serde_yaml::from_str(yaml_preprocessed.as_ref()).map_err(|e| { |
45 | 53 | AgentParseError::InvalidYaml { |
46 | 54 | message: e.to_string(), |
47 | 55 | } |
@@ -137,265 +145,11 @@ fn extract_body_inplace(content: &mut String, body_start: usize) -> String { |
137 | 145 | body |
138 | 146 | } |
139 | 147 |
|
/// Reports whether `key` looks like a simple frontmatter identifier.
///
/// A key is accepted when it is non-empty, starts with an ASCII letter or
/// `_`, and every following byte is an ASCII letter, ASCII digit, `_`, or `-`.
/// Any non-ASCII byte makes the key invalid.
#[inline]
fn is_valid_key(key: &str) -> bool {
    match key.as_bytes() {
        // The empty string is never a key.
        [] => false,
        [head, tail @ ..] => {
            let head_ok = matches!(*head, b'a'..=b'z' | b'A'..=b'Z' | b'_');
            head_ok
                && tail.iter().all(|&b| {
                    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-')
                })
        }
    }
}
152 | | - |
153 | | -/// Checks if a YAML line contains an unquoted colon in the value that needs |
154 | | -/// block scalar conversion. |
155 | | -/// |
156 | | -/// Returns `Some((key, value))` if the line should be converted to block scalar |
157 | | -/// format, `None` if it's already safe for YAML parsing. |
158 | | -/// |
159 | | -/// # Returns `None` (no conversion needed) when: |
160 | | -/// |
161 | | -/// - Line is empty or a comment (`# ...`) |
162 | | -/// - Line is indented (continuation of a block scalar) |
163 | | -/// - No colon found (not a key-value pair) |
164 | | -/// - Key is not a valid YAML identifier |
165 | | -/// - Value is empty or already a block scalar indicator (`|`, `>`, `|-`, `>-`) |
166 | | -/// - Value is quoted (`"..."` or `'...'`) |
167 | | -/// - Value is a flow sequence (`[...]`) or mapping (`{...}`) |
168 | | -/// - Value doesn't contain a colon (no ambiguity to fix) |
169 | | -#[inline] |
170 | | -fn block_scalar_parts(line: &str) -> Option<(&str, &str)> { |
171 | | - let trimmed = line.trim(); |
172 | | - if trimmed.is_empty() || trimmed.starts_with('#') { |
173 | | - return None; |
174 | | - } |
175 | | - |
176 | | - let first = *line.as_bytes().first()?; |
177 | | - if first == b' ' || first == b'\t' { |
178 | | - return None; |
179 | | - } |
180 | | - |
181 | | - let colon_pos = line.find(':')?; |
182 | | - let key = line[..colon_pos].trim(); |
183 | | - if !is_valid_key(key) { |
184 | | - return None; |
185 | | - } |
186 | | - |
187 | | - let value = line[colon_pos + 1..].trim(); |
188 | | - if value.is_empty() || value == ">" || value == "|" || value == "|-" || value == ">-" { |
189 | | - return None; |
190 | | - } |
191 | | - |
192 | | - let first_value = value.as_bytes().first().copied(); |
193 | | - if matches!(first_value, Some(b'"') | Some(b'\'')) { |
194 | | - return None; |
195 | | - } |
196 | | - |
197 | | - if matches!(first_value, Some(b'{') | Some(b'[')) { |
198 | | - return None; |
199 | | - } |
200 | | - |
201 | | - if !value.contains(':') { |
202 | | - return None; |
203 | | - } |
204 | | - |
205 | | - Some((key, value)) |
206 | | -} |
207 | | - |
208 | | -/// Preprocesses YAML frontmatter to handle inline `key: value:with:colons`. |
209 | | -/// The input is the YAML slice only (no `---` delimiters). |
210 | | -/// |
211 | | -/// # Problem |
212 | | -/// |
213 | | -/// YAML interprets colons as key-value separators. A value like `provider/model:tag` |
214 | | -/// would be misparsed as a nested mapping. This function converts such lines to |
215 | | -/// block scalar format, which treats the entire value as a literal string. |
216 | | -/// |
217 | | -/// # Transformations |
218 | | -/// |
219 | | -/// **Converted to block scalar** (value contains unquoted colon): |
220 | | -/// |
221 | | -/// ```text |
222 | | -/// Input: |
223 | | -/// model: provider/model:tag |
224 | | -/// api_url: http://localhost:8080 |
225 | | -/// |
226 | | -/// Output: |
227 | | -/// model: |- |
228 | | -/// provider/model:tag |
229 | | -/// api_url: |- |
230 | | -/// http://localhost:8080 |
231 | | -/// ``` |
232 | | -/// |
233 | | -/// **Preserved unchanged** (already safe for YAML parsing): |
234 | | -/// |
235 | | -/// ```text |
236 | | -/// Input: |
237 | | -/// # comment: with:colon # Comments are ignored |
238 | | -/// description: No colons here # No colon in value |
239 | | -/// model: "provider/model:tag" # Double-quoted |
240 | | -/// model: 'provider/model:tag' # Single-quoted |
241 | | -/// content: | # Block scalar indicator |
242 | | -/// line:with:colon |
243 | | -/// items: ["a:b", "c:d"] # Flow array syntax |
244 | | -/// config: { "key": "a:b" } # Flow mapping syntax |
245 | | -/// |
246 | | -/// Output: (identical to input) |
247 | | -/// ``` |
248 | | -/// |
249 | | -/// # Notes |
250 | | -/// |
251 | | -/// - Uses `|-` (literal block, strip chomp) to avoid trailing newlines in values. |
252 | | -/// - Input is expected to be LF-normalized. |
253 | | -/// - Output uses LF line endings. |
254 | | -/// - This matches OpenCode's `preprocessFrontmatter` behavior. |
255 | | -fn preprocess_frontmatter_yaml(input: &str) -> YamlPreprocessed<'_> { |
256 | | - if input.is_empty() { |
257 | | - return YamlPreprocessed::Borrowed(input); |
258 | | - } |
259 | | - |
260 | | - let converted = convert_block_scalars(input); |
261 | | - match converted { |
262 | | - Some(output) => YamlPreprocessed::Owned(output), |
263 | | - None => YamlPreprocessed::Borrowed(input), |
264 | | - } |
265 | | -} |
266 | | - |
/// Result of frontmatter preprocessing.
///
/// Holds either the untouched input slice or a newly built string, so callers
/// only pay for an allocation when a rewrite actually happened.
enum YamlPreprocessed<'a> {
    /// The input needed no changes and is passed through by reference.
    Borrowed(&'a str),
    /// At least one line was rewritten; this is the new text.
    Owned(String),
}

impl YamlPreprocessed<'_> {
    /// Returns the YAML text, whichever variant holds it.
    #[inline]
    fn as_str(&self) -> &str {
        match *self {
            Self::Borrowed(slice) => slice,
            Self::Owned(ref buffer) => buffer.as_str(),
        }
    }
}
281 | | - |
282 | | -/// Converts lines with unquoted colons in values to block scalar format. |
283 | | -/// Returns `None` when no conversion is needed. |
284 | | -fn convert_block_scalars(input: &str) -> Option<String> { |
285 | | - let input_len = input.len(); |
286 | | - let mut output: Option<String> = None; |
287 | | - let mut need_newline = false; |
288 | | - let mut offset = 0usize; |
289 | | - |
290 | | - for line in input.split_terminator('\n') { |
291 | | - if let Some(out) = output.as_mut() { |
292 | | - if need_newline { |
293 | | - out.push('\n'); |
294 | | - } |
295 | | - if let Some((key, value)) = block_scalar_parts(line) { |
296 | | - out.push_str(key); |
297 | | - out.push_str(": |-\n "); |
298 | | - out.push_str(value); |
299 | | - } else { |
300 | | - out.push_str(line); |
301 | | - } |
302 | | - need_newline = true; |
303 | | - } else if let Some((key, value)) = block_scalar_parts(line) { |
304 | | - let mut out = String::with_capacity(input_len + 3); |
305 | | - if offset > 0 { |
306 | | - out.push_str(&input[..offset]); |
307 | | - } |
308 | | - out.push_str(key); |
309 | | - out.push_str(": |-\n "); |
310 | | - out.push_str(value); |
311 | | - output = Some(out); |
312 | | - need_newline = true; |
313 | | - } |
314 | | - |
315 | | - offset += line.len(); |
316 | | - if offset < input_len { |
317 | | - offset += 1; |
318 | | - } |
319 | | - } |
320 | | - |
321 | | - output |
322 | | -} |
323 | | - |
324 | 148 | #[cfg(test)] |
325 | 149 | mod tests { |
326 | 150 | use super::*; |
327 | 151 | use crate::config::RawFrontmatter; |
328 | 152 |
|
#[test]
fn preprocess_handles_colons_in_value() {
    // An unquoted value containing a colon must be moved into a block scalar.
    let rendered = preprocess_frontmatter_yaml("model: provider/model:tag");
    let text = rendered.as_str();
    assert!(text.contains("model: |-"));
    assert!(text.contains(" provider/model:tag"));
}
336 | | - |
#[test]
fn preprocess_preserves_quoted_values() {
    // Double-quoted values already protect their colons.
    let rendered = preprocess_frontmatter_yaml("model: \"provider/model:tag\"");
    let text = rendered.as_str();
    assert!(text.contains("model: \"provider/model:tag\""));
}
343 | | - |
#[test]
fn preprocess_preserves_block_scalars() {
    // An existing block scalar must come back unchanged.
    let source = "desc: |\n multiline";
    let rendered = preprocess_frontmatter_yaml(source);
    assert_eq!(source, rendered.as_str());
}
350 | | - |
#[test]
fn preprocess_skips_comments() {
    // Colons inside a comment line are not a reason to rewrite it.
    let rendered = preprocess_frontmatter_yaml("# comment: with:colon\nmode: subagent");
    let text = rendered.as_str();
    assert!(text.contains("# comment: with:colon"));
}
357 | | - |
#[test]
fn preprocess_skips_flow_mappings() {
    // Flow mappings keep their own syntax and are passed through.
    let rendered = preprocess_frontmatter_yaml("task: { \"*\": \"deny\" }");
    let text = rendered.as_str();
    assert!(text.contains("task: { \"*\": \"deny\" }"));
}
364 | | - |
#[test]
fn preprocess_skips_flow_arrays() {
    // Flow sequences keep their own syntax and are passed through.
    let rendered = preprocess_frontmatter_yaml("items: [\"a:b\", \"c:d\"]");
    let text = rendered.as_str();
    assert!(text.contains("items: [\"a:b\", \"c:d\"]"));
}
371 | | - |
#[test]
fn preprocess_handles_key_with_whitespace_around_colon() {
    // Whitespace between the key and the colon is trimmed before rewriting.
    let rendered = preprocess_frontmatter_yaml("model : provider/model:tag");
    let text = rendered.as_str();
    assert!(text.contains("model: |-"));
    assert!(text.contains(" provider/model:tag"));
}
379 | | - |
#[test]
fn preprocess_handles_crlf_line_endings() {
    // The preprocessor is given input that was normalized to LF first.
    let mut source = String::from("model: provider/model:tag\r\napi_url: http://localhost:8080");
    crlf_to_lf_inplace(&mut source);
    let rendered = preprocess_frontmatter_yaml(&source);
    let text = rendered.as_str();
    assert!(text.contains("model: |-"));
    assert!(text.contains(" provider/model:tag"));
}
388 | | - |
#[test]
fn preprocess_skips_indented_lines() {
    // Indented lines continue the previous value and must be left alone.
    let rendered = preprocess_frontmatter_yaml("desc: |\n line:with:colons");
    let text = rendered.as_str();
    // The continuation line is still present as written...
    assert!(text.contains(" line:with:colons"));
    // ...and was not turned into a nested block scalar.
    assert!(!text.contains(" line: |-"));
}
398 | | - |
399 | 153 | #[test] |
400 | 154 | fn parse_extracts_frontmatter_and_content() { |
401 | 155 | let input = "---\nmode: subagent\ndescription: Test agent\n---\n\nPrompt body here."; |
|
0 commit comments