fix: improve section extraction robustness (case-insensitive, H3, code blocks)

This commit is contained in:
康熙
2026-02-16 20:48:24 +08:00
committed by Peter Steinberger
parent 90476d465d
commit d0b33f23eb
2 changed files with 116 additions and 15 deletions

View File

@@ -96,4 +96,74 @@ Ignore this.
expect(result).not.toBeNull();
expect(result).toContain("[truncated]");
});
it("matches section names case-insensitively", async () => {
const content = `# Rules
## session startup
Read WORKFLOW_AUTO.md
## Other
`;
fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
const result = await readPostCompactionContext(tmpDir);
expect(result).not.toBeNull();
expect(result).toContain("WORKFLOW_AUTO.md");
});
it("matches H3 headings", async () => {
const content = `# Rules
### Session Startup
Read these files.
### Other
`;
fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
const result = await readPostCompactionContext(tmpDir);
expect(result).not.toBeNull();
expect(result).toContain("Read these files");
});
it("skips sections inside code blocks", async () => {
const content = `# Rules
\`\`\`markdown
## Session Startup
This is inside a code block and should NOT be extracted.
\`\`\`
## Red Lines
Real red lines here.
## Other
`;
fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
const result = await readPostCompactionContext(tmpDir);
expect(result).not.toBeNull();
expect(result).toContain("Real red lines here");
expect(result).not.toContain("inside a code block");
});
it("includes sub-headings within a section", async () => {
const content = `## Red Lines
### Rule 1
Never do X.
### Rule 2
Never do Y.
## Other Section
`;
fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
const result = await readPostCompactionContext(tmpDir);
expect(result).not.toBeNull();
expect(result).toContain("Rule 1");
expect(result).toContain("Rule 2");
expect(result).not.toContain("Other Section");
});
});

View File

@@ -44,8 +44,10 @@ export async function readPostCompactionContext(workspaceDir: string): Promise<s
}
/**
* Extract named H2 sections from markdown content.
* Matches "## SectionName" and captures until the next "## " or end of string.
* Extract named sections from markdown content.
* Matches H2 (##) or H3 (###) headings case-insensitively.
* Skips content inside fenced code blocks.
* Captures until the next heading of same or higher level, or end of string.
*/
function extractSections(content: string, sectionNames: string[]): string[] {
const results: string[] = [];
@@ -54,21 +56,54 @@ function extractSections(content: string, sectionNames: string[]): string[] {
for (const name of sectionNames) {
let sectionLines: string[] = [];
let inSection = false;
let sectionLevel = 0;
let inCodeBlock = false;
for (const line of lines) {
// Check if this is the start of our target section
if (line.match(new RegExp(`^##\\s+${escapeRegExp(name)}\\s*$`))) {
inSection = true;
sectionLines = [line];
// Track fenced code blocks
if (line.trimStart().startsWith("```")) {
inCodeBlock = !inCodeBlock;
if (inSection) {
sectionLines.push(line);
}
continue;
}
// If we're in the section, check if we've hit another H2 heading
if (inSection) {
if (line.match(/^##\s+/)) {
// Hit another H2 heading, stop collecting
break;
// Skip heading detection inside code blocks
if (inCodeBlock) {
if (inSection) {
sectionLines.push(line);
}
continue;
}
// Check if this line is a heading
const headingMatch = line.match(/^(#{2,3})\s+(.+?)\s*$/);
if (headingMatch) {
const level = headingMatch[1].length; // 2 or 3
const headingText = headingMatch[2];
if (!inSection) {
// Check if this is our target section (case-insensitive)
if (headingText.toLowerCase() === name.toLowerCase()) {
inSection = true;
sectionLevel = level;
sectionLines = [line];
continue;
}
} else {
// We're in section — stop if we hit a heading of same or higher level
if (level <= sectionLevel) {
break;
}
// Lower-level heading (e.g., ### inside ##) — include it
sectionLines.push(line);
continue;
}
}
if (inSection) {
sectionLines.push(line);
}
}
@@ -80,7 +115,3 @@ function extractSections(content: string, sectionNames: string[]): string[] {
return results;
}
/**
 * Backslash-escape every regular-expression metacharacter in `str`.
 *
 * The escaped text can be interpolated into a `new RegExp(...)` pattern and
 * will match the original characters literally instead of being interpreted
 * as regex syntax.
 *
 * @param str - Raw text to be treated as a literal inside a pattern.
 * @returns A copy of `str` with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed
 *          by a backslash; all other characters are left unchanged.
 */
function escapeRegExp(str: string): string {
  // Character class listing every character that has special meaning in a pattern.
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  // "$&" re-inserts the matched character, so each hit becomes "\<char>".
  const escaped = str.replace(metaCharacters, "\\$&");
  return escaped;
}