fix: improve section extraction robustness (case-insensitive, H3, code blocks)
This commit is contained in:
@@ -96,4 +96,74 @@ Ignore this.
|
||||
expect(result).not.toBeNull();
|
||||
expect(result).toContain("[truncated]");
|
||||
});
|
||||
|
||||
// The fixture writes its heading in lower case ("## session startup"); the test
// passes only if that section's body ("WORKFLOW_AUTO.md") still shows up in the
// value returned by readPostCompactionContext.
it("matches section names case-insensitively", async () => {
  // Markdown lines stay at column 0 so each heading begins at the start of its line.
  const content = `# Rules

## session startup

Read WORKFLOW_AUTO.md

## Other
`;
  fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
  const result = await readPostCompactionContext(tmpDir);
  expect(result).not.toBeNull();
  expect(result).toContain("WORKFLOW_AUTO.md");
});
|
||||
|
||||
// The target section is introduced with a level-3 heading ("### Session Startup")
// instead of a level-2 one; the test requires its body text to be returned.
it("matches H3 headings", async () => {
  // Markdown lines stay at column 0 so each heading begins at the start of its line.
  const content = `# Rules

### Session Startup

Read these files.

### Other
`;
  fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
  const result = await readPostCompactionContext(tmpDir);
  expect(result).not.toBeNull();
  expect(result).toContain("Read these files");
});
|
||||
|
||||
// The first "## Session Startup" heading sits inside a fenced ```markdown block.
// The test requires that the text under it is absent from the result, while the
// body of the real "## Red Lines" section that follows the fence is present.
it("skips sections inside code blocks", async () => {
  // Backticks are escaped because the fixture itself is a template literal.
  const content = `# Rules

\`\`\`markdown
## Session Startup
This is inside a code block and should NOT be extracted.
\`\`\`

## Red Lines

Real red lines here.

## Other
`;
  fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
  const result = await readPostCompactionContext(tmpDir);
  expect(result).not.toBeNull();
  expect(result).toContain("Real red lines here");
  expect(result).not.toContain("inside a code block");
});
|
||||
|
||||
// "## Red Lines" contains two "###" sub-headings. Both must appear in the result,
// and the next level-2 heading ("## Other Section") must not.
it("includes sub-headings within a section", async () => {
  // Markdown lines stay at column 0 so each heading begins at the start of its line.
  const content = `## Red Lines

### Rule 1
Never do X.

### Rule 2
Never do Y.

## Other Section
`;
  fs.writeFileSync(path.join(tmpDir, "AGENTS.md"), content);
  const result = await readPostCompactionContext(tmpDir);
  expect(result).not.toBeNull();
  expect(result).toContain("Rule 1");
  expect(result).toContain("Rule 2");
  expect(result).not.toContain("Other Section");
});
|
||||
});
|
||||
|
||||
@@ -44,8 +44,10 @@ export async function readPostCompactionContext(workspaceDir: string): Promise<s
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract named H2 sections from markdown content.
|
||||
* Matches "## SectionName" and captures until the next "## " or end of string.
|
||||
* Extract named sections from markdown content.
|
||||
* Matches H2 (##) or H3 (###) headings case-insensitively.
|
||||
* Skips content inside fenced code blocks.
|
||||
* Captures until the next heading of same or higher level, or end of string.
|
||||
*/
|
||||
function extractSections(content: string, sectionNames: string[]): string[] {
|
||||
const results: string[] = [];
|
||||
@@ -54,21 +56,54 @@ function extractSections(content: string, sectionNames: string[]): string[] {
|
||||
for (const name of sectionNames) {
|
||||
let sectionLines: string[] = [];
|
||||
let inSection = false;
|
||||
let sectionLevel = 0;
|
||||
let inCodeBlock = false;
|
||||
|
||||
for (const line of lines) {
|
||||
// Check if this is the start of our target section
|
||||
if (line.match(new RegExp(`^##\\s+${escapeRegExp(name)}\\s*$`))) {
|
||||
inSection = true;
|
||||
sectionLines = [line];
|
||||
// Track fenced code blocks
|
||||
if (line.trimStart().startsWith("```")) {
|
||||
inCodeBlock = !inCodeBlock;
|
||||
if (inSection) {
|
||||
sectionLines.push(line);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we're in the section, check if we've hit another H2 heading
|
||||
if (inSection) {
|
||||
if (line.match(/^##\s+/)) {
|
||||
// Hit another H2 heading, stop collecting
|
||||
break;
|
||||
// Skip heading detection inside code blocks
|
||||
if (inCodeBlock) {
|
||||
if (inSection) {
|
||||
sectionLines.push(line);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this line is a heading
|
||||
const headingMatch = line.match(/^(#{2,3})\s+(.+?)\s*$/);
|
||||
|
||||
if (headingMatch) {
|
||||
const level = headingMatch[1].length; // 2 or 3
|
||||
const headingText = headingMatch[2];
|
||||
|
||||
if (!inSection) {
|
||||
// Check if this is our target section (case-insensitive)
|
||||
if (headingText.toLowerCase() === name.toLowerCase()) {
|
||||
inSection = true;
|
||||
sectionLevel = level;
|
||||
sectionLines = [line];
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// We're in section — stop if we hit a heading of same or higher level
|
||||
if (level <= sectionLevel) {
|
||||
break;
|
||||
}
|
||||
// Lower-level heading (e.g., ### inside ##) — include it
|
||||
sectionLines.push(line);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (inSection) {
|
||||
sectionLines.push(line);
|
||||
}
|
||||
}
|
||||
@@ -80,7 +115,3 @@ function extractSections(content: string, sectionNames: string[]): string[] {
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
 * Escape every regular-expression metacharacter in `str` so the result can be
 * embedded in a `RegExp` pattern and match the input text literally.
 *
 * @param str - Text to escape.
 * @returns `str` with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(str: string): string {
  // "$&" re-inserts the matched character right after the added backslash.
  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
|
||||
|
||||
Reference in New Issue
Block a user