diff --git a/Creed CIM.pdf b/Creed CIM.pdf new file mode 100644 index 0000000..88dee62 Binary files /dev/null and b/Creed CIM.pdf differ diff --git a/backend/src/services/optimizedAgenticRAGProcessor.ts b/backend/src/services/optimizedAgenticRAGProcessor.ts index 34b7edc..ec8348d 100644 --- a/backend/src/services/optimizedAgenticRAGProcessor.ts +++ b/backend/src/services/optimizedAgenticRAGProcessor.ts @@ -700,11 +700,12 @@ Use specific investment terminology: "investment thesis", "value creation levers } // Calculate similarity for each chunk - // We'll use a simplified approach: search for similar chunks and filter by documentId + // Pass documentId to optimize search and prevent cross-document matches const similarChunks = await vectorDatabaseService.searchSimilar( queryEmbedding, Math.min(allChunks.length, 30), // Increased from 20 to 30 to get more chunks - 0.4 // Lower threshold from 0.5 to 0.4 to get more chunks + 0.4, // Lower threshold from 0.5 to 0.4 to get more chunks + documentId // Pass documentId to filter search to this document only ); // Filter to only chunks from this document and sort by similarity @@ -1278,17 +1279,77 @@ Use specific investment terminology: "investment thesis", "value creation levers deterministicPinnedChunks ); - logger.info('Pass 2: Market Analysis + Business Operations (Combined)'); - const pass2CombinedResult = await this.extractPass2CombinedMarketBusiness(documentId, text, chunks); + // Pass 1.5: Financial Validation - Conditional (skip if deterministic parser found data) + // Only run validation if deterministic parser didn't find structured data + let validationResult: { hasIssues: boolean; issues: string[]; correctedData?: Partial; apiCalls: number } = { hasIssues: false, issues: [], apiCalls: 0 }; - logger.info('Pass 3: Investment Thesis'); - const pass3Result = await this.extractPass5InvestmentThesis(documentId, text, chunks); - - const sequentialTime = Date.now() - sequentialStart; - logger.info('Sequential extraction 
completed', { + if (deterministicFinancials && this.hasStructuredFinancialData(deterministicFinancials)) { + logger.info('Pass 1.5: Skipping financial validation (deterministic parser found structured data)', { documentId }); + } else { + logger.info('Pass 1.5: Financial Validation'); + validationResult = await this.validateFinancialExtraction( documentId, - sequentialTimeMs: sequentialTime, - sequentialTimeSec: (sequentialTime / 1000).toFixed(1) + text, + chunks, + pass1CombinedResult.data + ); + + // If validation found issues, merge corrected data + if (validationResult.hasIssues && validationResult.correctedData) { + logger.info('Financial validation found issues, applying corrections', { + documentId, + issuesFound: validationResult.issues.length, + issues: validationResult.issues.slice(0, 5) + }); + // Merge corrected financial data + Object.assign(pass1CombinedResult.data, validationResult.correctedData); + } else { + logger.info('Financial validation passed', { documentId }); + } + } + totalApiCalls += validationResult.apiCalls; + + // OPTIMIZATION: Run Pass 2 and Pass 3 in parallel (they're independent after Pass 1) + logger.info('Pass 2 & 3: Running Market Analysis and Investment Thesis in parallel'); + let pass2CombinedResult: { data: Partial; apiCalls: number }; + let pass3Result: { data: Partial; apiCalls: number }; + + try { + const [pass2Result, pass3ResultValue] = await Promise.all([ + this.extractPass2CombinedMarketBusiness(documentId, text, chunks), + this.extractPass5InvestmentThesis(documentId, text, chunks) + ]); + pass2CombinedResult = pass2Result; + pass3Result = pass3ResultValue; + } catch (error) { + // If parallel execution fails, log error but don't fail entire processing + // This allows partial results to be used + logger.error('Parallel Pass 2/3 execution failed, attempting sequential fallback', { + documentId, + error: error instanceof Error ? 
error.message : String(error) + }); + + // Fallback to sequential execution + try { + pass2CombinedResult = await this.extractPass2CombinedMarketBusiness(documentId, text, chunks); + pass3Result = await this.extractPass5InvestmentThesis(documentId, text, chunks); + logger.info('Sequential fallback for Pass 2/3 completed successfully', { documentId }); + } catch (fallbackError) { + // If fallback also fails, re-throw to be caught by outer try-catch + logger.error('Sequential fallback for Pass 2/3 also failed', { + documentId, + error: fallbackError instanceof Error ? fallbackError.message : String(fallbackError) + }); + throw fallbackError; + } + } + + const extractionTime = Date.now() - sequentialStart; + logger.info('Multi-pass extraction completed', { + documentId, + extractionTimeMs: extractionTime, + extractionTimeSec: (extractionTime / 1000).toFixed(1), + note: 'Pass 2 and Pass 3 ran in parallel for faster processing' }); partialResults.push(pass1CombinedResult.data); @@ -1330,14 +1391,25 @@ Use specific investment terminology: "investment thesis", "value creation levers // CRITICAL: Always attempt gap-filling, but limit to top 20 most important fields if (missingFields.length > 0) { - // Prioritize important fields: deal overview, business description, market analysis + // Identify low confidence fields (fields with uncertain values) + const lowConfidenceFields = this.identifyLowConfidenceFields(mergedData); + + // Prioritize: 1) Low confidence fields, 2) Important missing fields, 3) Other missing fields const priorityFields = missingFields.filter(f => f.startsWith('dealOverview.') || f.startsWith('businessDescription.') || - f.startsWith('marketIndustryAnalysis.') + f.startsWith('marketIndustryAnalysis.') || + f.startsWith('financialSummary.') ); - const fieldsToFill = priorityFields.length > 0 - ? 
priorityFields.slice(0, 20) // Top 20 priority fields + + // Combine low confidence and priority fields, remove duplicates + const highPriorityFields = [ + ...lowConfidenceFields.filter(f => missingFields.includes(f)), + ...priorityFields.filter(f => !lowConfidenceFields.includes(f)) + ]; + + const fieldsToFill = highPriorityFields.length > 0 + ? highPriorityFields.slice(0, 20) // Top 20 priority fields : missingFields.slice(0, 20); // Or top 20 overall if no priority fields if (fieldsToFill.length > 0) { @@ -1554,6 +1626,262 @@ IMPORTANT EXTRACTION RULES: }); } + /** + * Validate financial extraction and trigger re-extraction if issues found + * Implements Recommendation 5: Multi-Pass Financial Validation + */ + private async validateFinancialExtraction( + documentId: string, + text: string, + chunks: ProcessingChunk[], + extractedData: Partial + ): Promise<{ + hasIssues: boolean; + issues: string[]; + correctedData?: Partial; + apiCalls: number; + }> { + const issues: string[] = []; + let apiCalls = 0; + const financials = extractedData.financialSummary?.financials; + + if (!financials) { + logger.warn('No financial data found for validation', { documentId }); + return { hasIssues: false, issues: [], apiCalls: 0 }; + } + + // Validation checks + const periods = ['fy3', 'fy2', 'fy1', 'ltm'] as const; + const revenueValues: { period: string; value: string }[] = []; + const ebitdaValues: { period: string; value: string }[] = []; + + // Extract and parse financial values + for (const period of periods) { + const periodData = financials[period]; + if (periodData) { + const revenue = periodData.revenue; + const ebitda = periodData.ebitda; + + if (revenue && revenue !== 'Not specified in CIM') { + revenueValues.push({ period, value: revenue }); + } + if (ebitda && ebitda !== 'Not specified in CIM') { + ebitdaValues.push({ period, value: ebitda }); + } + } + } + + // 1. 
Magnitude Validation + for (const rev of revenueValues) { + const numValue = this.parseFinancialValue(rev.value); + if (numValue !== null && numValue < 10) { + issues.push(`Revenue magnitude check failed: ${rev.period} revenue = ${rev.value} (< $10M threshold). May be extracting from wrong table.`); + } + } + + for (const ebitda of ebitdaValues) { + const numValue = this.parseFinancialValue(ebitda.value); + if (numValue !== null && numValue < 1 && numValue > 0) { + issues.push(`EBITDA magnitude check: ${ebitda.period} EBITDA = ${ebitda.value} (< $1M threshold). Verify this is correct.`); + } + if (numValue !== null && numValue < 0) { + issues.push(`EBITDA negative: ${ebitda.period} EBITDA = ${ebitda.value}. Verify this is correct.`); + } + } + + // 2. Trend Validation + if (revenueValues.length >= 2) { + const sortedRevs = revenueValues.sort((a, b) => { + const order = { fy3: 0, fy2: 1, fy1: 2, ltm: 3 }; + return order[a.period as keyof typeof order] - order[b.period as keyof typeof order]; + }); + + for (let i = 1; i < sortedRevs.length; i++) { + const prev = this.parseFinancialValue(sortedRevs[i - 1].value); + const curr = this.parseFinancialValue(sortedRevs[i].value); + + if (prev !== null && curr !== null) { + const change = ((curr - prev) / prev) * 100; + if (change < -50) { + issues.push(`Revenue trend check failed: ${sortedRevs[i - 1].period} to ${sortedRevs[i].period} shows ${change.toFixed(1)}% drop. May indicate column misalignment.`); + } + if (change > 200) { + issues.push(`Revenue trend check failed: ${sortedRevs[i - 1].period} to ${sortedRevs[i].period} shows ${change.toFixed(1)}% increase. May indicate column misalignment.`); + } + } + } + } + + // 3. 
Cross-Period Consistency (Growth Rate Validation) + for (const period of periods) { + const periodData = financials[period]; + if (periodData?.revenue && periodData?.revenueGrowth) { + const revenue = periodData.revenue; + const growth = periodData.revenueGrowth; + + if (growth !== 'N/A' && growth !== 'Not specified in CIM' && !growth.includes('N/A')) { + // Find prior period revenue + const periodIndex = periods.indexOf(period); + if (periodIndex > 0) { + const priorPeriod = periods[periodIndex - 1]; + const priorData = financials[priorPeriod]; + + if (priorData?.revenue && priorData.revenue !== 'Not specified in CIM') { + const currRev = this.parseFinancialValue(revenue); + const priorRev = this.parseFinancialValue(priorData.revenue); + + if (currRev !== null && priorRev !== null && priorRev > 0) { + const calculatedGrowth = ((currRev - priorRev) / priorRev) * 100; + const statedGrowth = this.parsePercentage(growth); + + if (statedGrowth !== null && Math.abs(calculatedGrowth - statedGrowth) > 5) { + issues.push(`Growth rate mismatch: ${period} stated growth ${growth} but calculated ${calculatedGrowth.toFixed(1)}%. Verify extraction.`); + } + } + } + } + } + } + } + + // 4. 
Margin Validation + for (const period of periods) { + const periodData = financials[period]; + if (periodData?.revenue && periodData?.ebitdaMargin) { + const revenue = periodData.revenue; + const ebitda = periodData.ebitda; + const margin = periodData.ebitdaMargin; + + if (revenue !== 'Not specified in CIM' && ebitda !== 'Not specified in CIM' && margin !== 'Not specified in CIM') { + const revValue = this.parseFinancialValue(revenue); + const ebitdaValue = this.parseFinancialValue(ebitda); + const marginValue = this.parsePercentage(margin); + + if (revValue !== null && ebitdaValue !== null && revValue > 0 && marginValue !== null) { + const calculatedMargin = (ebitdaValue / revValue) * 100; + + if (Math.abs(calculatedMargin - marginValue) > 2) { + issues.push(`EBITDA margin mismatch: ${period} stated margin ${margin} but calculated ${calculatedMargin.toFixed(1)}%. Verify extraction.`); + } + + // Check margin is in reasonable range + if (marginValue < 5 || marginValue > 50) { + issues.push(`EBITDA margin out of typical range: ${period} margin ${margin} (typical range 5-50%). Verify extraction.`); + } + } + } + } + } + + // If issues found, trigger targeted re-extraction + if (issues.length > 0) { + logger.warn('Financial validation found issues, triggering re-extraction', { + documentId, + issueCount: issues.length, + issues: issues.slice(0, 5) + }); + + // Create focused re-extraction prompt + const validationPrompt = `RE-EXTRACT AND VALIDATE FINANCIAL DATA: + +The following validation issues were found in the initial extraction. Please re-check and correct: + +${issues.slice(0, 10).map((issue, idx) => `${idx + 1}. ${issue}`).join('\n')} + +**RE-EXTRACTION REQUIREMENTS**: +1. Re-locate the PRIMARY historical financial table +2. Verify column alignment - ensure values match their period columns +3. Re-calculate growth rates and margins to verify consistency +4. Cross-reference with executive summary financial highlights +5. 
If discrepancies exist, use the most authoritative source (typically detailed table) + +**CRITICAL CHECKS**: +- Revenue should be $10M+ for target companies +- Revenue trends should be stable or increasing (not sudden drops >50% or increases >200%) +- Growth rates should match: ((Current - Prior) / Prior) * 100 +- Margins should match: (EBITDA / Revenue) * 100 +- EBITDA margins should be 5-50% (typical range) + +Extract ONLY the financial summary data and verify all calculations.`; + + try { + const { chunks: relevantChunks } = await findRelevantChunks( + documentId, + 'PRIMARY historical financial table revenue EBITDA margins', + chunks, + 30000 + ); + + const reducedText = relevantChunks + .slice(0, 15) + .map((chunk, index) => { + const separator = index > 0 ? '\n\n---\n\n' : ''; + return `${separator}[Section ${chunk.chunkIndex + 1}]\n${chunk.content}`; + }) + .join('\n\n'); + + const llmService = (await import('./llmService')).llmService; + const result = await llmService.processCIMDocument(reducedText, 'BPCP CIM Review Template'); + apiCalls++; + + if (result.success && result.jsonOutput?.financialSummary) { + return { + hasIssues: true, + issues, + correctedData: { financialSummary: result.jsonOutput.financialSummary }, + apiCalls + }; + } + } catch (error) { + logger.error('Financial validation re-extraction failed', { + documentId, + error: error instanceof Error ? 
error.message : String(error) + }); + } + } + + return { + hasIssues: issues.length > 0, + issues, + apiCalls + }; + } + + /** + * Parse a financial value string into a number of millions (e.g., "$64.2M" -> 64.2, "($1.5M)" -> -1.5). + * Suffix "k"/"b" is scaled to millions; "m" or no suffix is taken as millions. A leading "-" or enclosing parentheses mean negative. + * @returns the value in millions, or null when the string is empty, "Not specified in CIM", or not a plain number + */ + private parseFinancialValue(value: string): number | null { + if (!value || value === 'Not specified in CIM') return null; + // Strip "$" and thousands separators; negatives arrive as "-5.2M" or accounting-style "(5.2M)" / "(5.2)M" + let cleaned = value.replace(/[$,]/g, '').trim(); + const negative = cleaned.startsWith('-') || /^\(.*\)[kmb]?$/i.test(cleaned); + cleaned = cleaned.replace(/^-/, '').replace(/^\((.*)\)([kmb]?)$/i, '$1$2'); + // Require a well-formed decimal so "." or "1.2.3" yield null instead of NaN or a truncated number + const match = cleaned.match(/^(\d+(?:\.\d*)?|\.\d+)([kmb]?)$/i); + if (!match) return null; + + const num = parseFloat(match[1]); + const suffix = match[2].toLowerCase(); + // Normalise to millions: "k" = thousands, "b" = billions, "m" or no suffix = millions + const millions = suffix === 'k' ? num / 1000 : suffix === 'b' ? num * 1000 : num; + return negative ? -millions : millions; + } + + /** + * Parse a percentage string into a number (e.g., "12.5%" -> 12.5, "(4.4)%" -> -4.4). + * @returns the numeric percentage, or null for empty / "Not specified in CIM" / "N/A" / non-numeric input + */ + private parsePercentage(value: string): number | null { + if (!value || value === 'Not specified in CIM' || value === 'N/A') return null; + // Accounting notation wraps negatives in parentheses: "(4.4)%" and "(4.4%)" both mean -4.4 + const parenNegative = /^\(.*\)%?$/.test(value.trim()); + const num = parseFloat(value.replace(/[()%]/g, '').trim()); + if (isNaN(num)) return null; + return parenNegative ? -Math.abs(num) : num; + } + /** * Prioritize chunks that likely contain financial data */ @@ -2133,7 +2461,56 @@ Provide 3-4 key reasons supporting the recommendation, focusing on: - Most compelling investment attractions - Most significant risks or concerns - Strategic fit and alignment -- Value creation potential`; +- Value creation potential + +**EXAMPLE: HIGH-QUALITY INVESTMENT THESIS** (Follow this format): + +**Key Attractions**: +1. Market-leading position with 25% market share in $2.5B TAM, providing pricing power and competitive moat. Revenue grew 15% CAGR over 3 years to $64M, demonstrating strong execution. This market position supports 2-3x revenue growth potential through geographic expansion and product line extensions. + +2. 
Strong management team with CEO having 20+ years industry experience and track record of scaling businesses from $30M to $100M+. Management committed to stay post-transaction with equity rollover, reducing execution risk. Team depth includes experienced CFO and COO with complementary skills. + +3. Recurring revenue model with 70% of revenue from multi-year contracts averaging 3-year terms, providing predictable cash flow and low churn (<5% annually). Top 10 customers represent 45% of revenue with average 8-year relationship tenure, demonstrating strong customer loyalty. + +4. Clear value creation opportunities through BPCP's operational playbook: (a) Margin expansion of 200-300 bps through shared services consolidation and procurement optimization, adding $1.5-2.3M EBITDA, (b) Add-on M&A strategy in fragmented market with 15+ potential targets identified, (c) Technology enablement to automate manual processes, reducing SG&A by 150 bps. + +5. Strong financial performance with EBITDA margins expanding from 10.5% to 12.8% over 3 years, demonstrating operational leverage. Free cash flow conversion >90% with minimal capital intensity (<3% of revenue), supporting debt capacity and dividend potential. + +**Value Creation Levers**: +1. **Margin Expansion**: Reduce SG&A by 150 bps through shared services consolidation and procurement optimization, adding $1.5M EBITDA within 12-18 months. Leverage BPCP's procurement expertise and shared services platform used successfully in portfolio companies. + +2. **Add-on M&A**: Execute roll-up strategy in fragmented market with 15+ potential add-on targets identified. Target 2-3 acquisitions over 3 years, adding $15-25M revenue and $2-4M EBITDA. Use platform's customer relationships and operational infrastructure to integrate acquisitions efficiently. + +3. **Revenue Growth**: Expand into adjacent geographic markets where company has limited presence but strong brand recognition. 
Target 20% revenue growth through geographic expansion and new product launches, supported by existing sales infrastructure. + +**Potential Risks**: +1. **Customer Concentration Risk (Medium Probability, High Impact)**: Top 3 customers represent 35% of revenue, creating dependency risk. Mitigation: Diversify customer base through new customer acquisition and expand relationships with existing customers. Not a deal-breaker given long-term relationships and contract terms. + +2. **Management Retention Risk (Low Probability, High Impact)**: Key person risk with CEO being critical to business. Mitigation: Strong retention incentives with equity rollover and long-term incentive plan. Management committed to stay and has succession plan in place. + +**EXAMPLE: LOW-QUALITY INVESTMENT THESIS** (AVOID - Too vague, lacks specificity): + +**Key Attractions**: +1. Strong market position. [TOO VAGUE - lacks specificity, quantification, investment impact] +2. Good management team. [TOO GENERIC - no details, no track record, no investment significance] +3. Growing business. [NO QUANTIFICATION - what growth rate? over what period?] +4. Good financials. [NO SPECIFICS - what metrics? what trends?] + +**Value Creation Levers**: +1. Operational improvements. [TOO VAGUE - what improvements? quantified impact?] +2. Growth opportunities. [NO SPECIFICS - what opportunities? how to execute?] + +**Potential Risks**: +1. Some risks exist. [NO DETAILS - what risks? probability? impact? mitigation?] 
+ +**CRITICAL QUALITY REQUIREMENTS**: +- **Specificity**: Use exact numbers, percentages, and metrics (e.g., "25% market share", "15% CAGR", "$64M revenue") +- **Quantification**: Include quantified impact for value creation (e.g., "adding $1.5M EBITDA", "200-300 bps margin expansion") +- **Investment Impact**: Explain why each point matters for the investment decision +- **Evidence-Based**: Base all statements on information from the CIM document +- **Strategic Context**: Connect attractions to BPCP's investment strategy and value creation playbook +- **Risk Assessment**: Provide probability, impact, and mitigation for each risk +- **Actionable**: Value creation levers should be specific and executable, not generic`; const targetFields = [ 'preliminaryInvestmentThesis.keyAttractions', @@ -2284,8 +2661,45 @@ Provide 3-4 key reasons supporting the recommendation, focusing on: const hasAppendices = selectedChunks.some(chunk => chunk.content.match(/\b(Appendix|Exhibit|Attachment)\b/i) ); - + + // Detect document type for context-aware adaptation + const allChunkText = selectedChunks.map(chunk => chunk.content).join(' ').toLowerCase(); + const documentType = this.detectDocumentType(allChunkText, selectedChunks); + let enhancedInstructions = baseQuery + '\n\n'; + + // Add document type-specific adaptations + if (documentType === 'bank-prepared') { + enhancedInstructions += `**DOCUMENT TYPE: Bank-Prepared CIM** +- This CIM was prepared by an investment bank or M&A advisor +- Emphasize cross-referencing executive summary with detailed financial tables +- Executive summary may use adjusted/pro forma numbers - verify against historical tables +- Financial tables are typically well-structured and authoritative +- Look for investment bank branding/logos on cover page or headers +- Deal source should be clearly identified (investment bank name) + +`; + } else if (documentType === 'company-prepared') { + enhancedInstructions += `**DOCUMENT TYPE: Company-Prepared CIM** +- This 
CIM was prepared by the company itself (not an investment bank) +- Emphasize narrative text extraction - information may be in descriptive sections +- Financial tables may be less structured - verify carefully +- Look for company branding/logos instead of investment bank branding +- May have more detailed operational and business description sections +- Competitive positioning may be more detailed in narrative text + +`; + } else if (documentType === 'auction') { + enhancedInstructions += `**DOCUMENT TYPE: Auction Process CIM** +- This CIM is part of a competitive auction process +- Emphasize competitive positioning and differentiation +- May include multiple valuation scenarios or strategic options +- Timeline information may be critical (bid deadlines, process milestones) +- Competitive dynamics and market position are particularly important +- May include more detailed risk disclosures + +`; + } // Add field-specific instruction templates if (financialFields.length > 0) { @@ -2391,6 +2805,95 @@ Provide 3-4 key reasons supporting the recommendation, focusing on: return enhancedInstructions; } + /** + * Detect document type for context-aware prompt adaptation + * Implements Recommendation 11: Context-Aware Prompt Adaptation + */ + private detectDocumentType(text: string, chunks: ProcessingChunk[]): 'bank-prepared' | 'company-prepared' | 'auction' | 'unknown' { + const lowerText = text.toLowerCase(); + + // Bank-prepared indicators + const bankIndicators = [ + /investment\s+bank/i, + /m&a\s+advisor/i, + /financial\s+advisor/i, + /transaction\s+advisor/i, + /prepared\s+by.*bank/i, + /harris\s+williams/i, + /capstone\s+partners/i, + /raymond\s+james/i, + /jefferies/i, + /piper\s+sandler/i, + /william\s+blair/i, + /stifel/i, + /baird/i, + /lincoln\s+international/i, + /duff\s+&\s+phelps/i, + /houlihan\s+lokey/i, + /moelis/i, + /lazard/i, + /goldman\s+sachs/i, + /morgan\s+stanley/i, + /jpmorgan/i + ]; + + // Auction process indicators + const auctionIndicators = [ + 
/auction\s+process/i, + /competitive\s+process/i, + /bid\s+deadline/i, + /bid\s+process/i, + /competitive\s+auction/i, + /multiple\s+bidders/i, + /stalking\s+horse/i, + /qualifying\s+bids/i + ]; + + // Company-prepared indicators (less formal, company branding) + const companyIndicators = [ + /prepared\s+by.*company/i, + /company\s+prepared/i, + /internal\s+preparation/i, + /management\s+presentation/i + ]; + + // Check for bank-prepared (highest priority - most common) + const hasBankIndicators = bankIndicators.some(pattern => pattern.test(lowerText)) || + chunks.some(chunk => { + const content = chunk.content.toLowerCase(); + return bankIndicators.some(pattern => pattern.test(content)); + }); + + if (hasBankIndicators) { + return 'bank-prepared'; + } + + // Check for auction process + const hasAuctionIndicators = auctionIndicators.some(pattern => pattern.test(lowerText)) || + chunks.some(chunk => { + const content = chunk.content.toLowerCase(); + return auctionIndicators.some(pattern => pattern.test(content)); + }); + + if (hasAuctionIndicators) { + return 'auction'; + } + + // Check for company-prepared + const hasCompanyIndicators = companyIndicators.some(pattern => pattern.test(lowerText)) || + chunks.some(chunk => { + const content = chunk.content.toLowerCase(); + return companyIndicators.some(pattern => pattern.test(content)); + }); + + if (hasCompanyIndicators) { + return 'company-prepared'; + } + + // Default: assume bank-prepared (most common type) + return 'bank-prepared'; + } + private hasStructuredFinancialData(financials?: ParsedFinancials | null): boolean { if (!financials) return false; const periods: Array = ['fy3', 'fy2', 'fy1', 'ltm']; @@ -2596,6 +3099,68 @@ Provide 3-4 key reasons supporting the recommendation, focusing on: return missing; } + /** + * Identify fields with low confidence indicators + * Implements Recommendation 12: Confidence Scoring and Uncertainty Handling + */ + private identifyLowConfidenceFields(data: Partial): string[] { + 
const lowConfidence: string[] = []; + + // Low confidence indicators in field values + const lowConfidencePatterns = [ + /approximately/i, + /estimated/i, + /roughly/i, + /about/i, + /~/, // Tilde indicates approximation + /around/i, + /likely/i, + /probably/i, + /possibly/i, + /may be/i, + /could be/i, + /seems to be/i, + /appears to be/i, + /suggest/i, + /indicate/i, + /inferred/i, + /uncertain/i, + /unclear/i, + /ambiguous/i, + /\d+-\d+/, // Ranges like "15-20%" indicate uncertainty + /between.*and/i, + /or so/i, + /give or take/i + ]; + + const checkObject = (obj: any, prefix: string = ''): void => { + for (const key in obj) { + const value = obj[key]; + const path = prefix ? `${prefix}.${key}` : key; + + if (typeof value === 'string' && value !== 'Not specified in CIM') { + // Check if value contains low confidence indicators + const hasLowConfidence = lowConfidencePatterns.some(pattern => pattern.test(value)); + + if (hasLowConfidence) { + lowConfidence.push(path); + } + } else if (typeof value === 'object' && value !== null && !Array.isArray(value)) { + checkObject(value, path); + } + } + }; + + checkObject(data); + + logger.info('Low confidence fields identified', { + count: lowConfidence.length, + fields: lowConfidence.slice(0, 10) + }); + + return lowConfidence; + } + /** * Pass 6: Gap-Filling - Make targeted queries for missing fields */ @@ -2791,16 +3356,95 @@ Provide 3-4 key reasons supporting the recommendation, focusing on: query += `- Check footnotes, appendices, and exhibits for additional detail\n`; query += `- Look for tables, charts, and graphs that may contain the information\n\n`; - // Inference rules - query += `**INFERENCE RULES**:\n`; - if (financialFields.some(f => f.includes('revenueGrowth'))) { - query += `- If revenue for two periods is available, calculate growth: ((Current - Prior) / Prior) * 100\n`; + // Inference rules - Comprehensive field-specific calculation formulas + query += `**INFERENCE RULES** (Calculate derived fields when 
base data is available):\n\n`; + + // Financial field inference rules + if (financialFields.length > 0) { + query += `**FINANCIAL FIELD INFERENCE RULES**:\n`; + + if (financialFields.some(f => f.includes('revenueGrowth'))) { + query += `- revenueGrowth: If revenue for 2 periods is available, calculate: ((Current - Prior) / Prior) * 100\n`; + query += ` Example: If FY-3 revenue = $64M and FY-2 revenue = $71M, then revenueGrowth = ((71 - 64) / 64) * 100 = 10.9%\n`; + } + + if (financialFields.some(f => f.includes('ebitdaMargin'))) { + query += `- ebitdaMargin: If revenue and EBITDA are available, calculate: (EBITDA / Revenue) * 100\n`; + query += ` Example: If revenue = $71M and EBITDA = $8.5M, then ebitdaMargin = (8.5 / 71) * 100 = 12.0%\n`; + } + + if (financialFields.some(f => f.includes('grossMargin'))) { + query += `- grossMargin: If revenue and grossProfit are available, calculate: (Gross Profit / Revenue) * 100\n`; + query += ` Example: If revenue = $71M and grossProfit = $28.4M, then grossMargin = (28.4 / 71) * 100 = 40.0%\n`; + } + + if (financialFields.some(f => f.includes('CAGR') || f.includes('cagr'))) { + query += `- CAGR (Compound Annual Growth Rate): If multiple periods available, calculate: ((End/Start)^(1/Periods) - 1) * 100\n`; + query += ` Example: If FY-3 revenue = $64M and FY-1 revenue = $76M over 2 periods, then CAGR = ((76/64)^(1/2) - 1) * 100 = 9.0%\n`; + } + + if (financialFields.some(f => f.includes('margin') || f.includes('Margin'))) { + query += `- Margin calculations: For any margin field, if numerator and revenue available, calculate: (Numerator / Revenue) * 100\n`; + query += ` - EBITDA margin: (EBITDA / Revenue) * 100\n`; + query += ` - Gross margin: (Gross Profit / Revenue) * 100\n`; + query += ` - Operating margin: (Operating Income / Revenue) * 100\n`; + } + + query += `\n`; } - if (financialFields.some(f => f.includes('Margin'))) { - query += `- If revenue and profit metric available, calculate margin: (Metric / Revenue) * 100\n`; 
+ + // Market field inference rules + if (marketFields.length > 0) { + query += `**MARKET FIELD INFERENCE RULES**:\n`; + + if (marketFields.some(f => f.includes('market') && f.includes('share'))) { + query += `- Market share: If TAM and company revenue are available, calculate: (Company Revenue / TAM) * 100\n`; + query += ` Example: If company revenue = $71M and TAM = $2.5B, then market share = (71 / 2500) * 100 = 2.8%\n`; + } + + if (marketFields.some(f => f.includes('market') && (f.includes('growth') || f.includes('Growth')))) { + query += `- Market growth rate: If TAM or market size for 2 periods is available, calculate growth: ((Current - Prior) / Prior) * 100\n`; + query += ` Example: If TAM Year 1 = $2.0B and TAM Year 2 = $2.5B, then market growth = ((2.5 - 2.0) / 2.0) * 100 = 25.0%\n`; + } + + if (marketFields.some(f => f.includes('CAGR') || f.includes('cagr'))) { + query += `- Market CAGR: If market size for multiple periods available, calculate: ((End/Start)^(1/Periods) - 1) * 100\n`; + } + + query += `\n`; } - query += `- Do NOT infer values - only calculate if base data is available\n`; - query += `- If calculation is possible, use calculated value; otherwise use "Not specified in CIM"\n\n`; + + // Business field inference rules + if (businessFields.length > 0) { + query += `**BUSINESS FIELD INFERENCE RULES**:\n`; + + if (businessFields.some(f => f.includes('customer') && (f.includes('concentration') || f.includes('Concentration')))) { + query += `- Customer concentration: If top customers mentioned with percentages, sum the percentages\n`; + query += ` Example: If top 3 customers are 15%, 12%, and 10% of revenue, then top 3 concentration = 15% + 12% + 10% = 37%\n`; + } + + if (businessFields.some(f => f.includes('recurring') || f.includes('Recurring'))) { + query += `- Recurring revenue %: If MRR/ARR and total revenue are available, calculate: (Recurring Revenue / Total Revenue) * 100\n`; + query += ` Example: If ARR = $42M and total revenue = $71M, 
then recurring revenue % = (42 / 71) * 100 = 59.2%\n`; + } + + if (businessFields.some(f => f.includes('customer') && (f.includes('retention') || f.includes('Retention')))) { + query += `- Customer retention rate: If churn rate is available, calculate: 100 - Churn Rate\n`; + query += ` Example: If annual churn = 5%, then retention rate = 100 - 5 = 95%\n`; + } + + query += `\n`; + } + + // General inference rules + query += `**GENERAL INFERENCE RULES**:\n`; + query += `- Do NOT infer values - only calculate if base data is explicitly available in the document\n`; + query += `- Always verify calculations match stated values if both are present (use stated value if discrepancy)\n`; + query += `- Format calculated values consistently: percentages as "XX.X%", currency as "$XX.XM" or "$XX.XB"\n`; + query += `- If calculation is possible and base data is available, use calculated value\n`; + query += `- If base data is not available, use "Not specified in CIM"\n`; + query += `- Round percentages to one decimal place (e.g., 12.0%, not 11.956%)\n`; + query += `- Round currency to appropriate precision ($64.2M, not $64.156M)\n\n`; // Cross-section search query += `**CROSS-SECTION SEARCH**:\n`; diff --git a/backend/src/services/rag/chunkProcessing.ts b/backend/src/services/rag/chunkProcessing.ts index a1b8596..f457709 100644 --- a/backend/src/services/rag/chunkProcessing.ts +++ b/backend/src/services/rag/chunkProcessing.ts @@ -42,9 +42,9 @@ export async function processChunksInBatches( // Process batch with concurrency control const batchPromises = batch.map(async (chunk, batchIndex) => { try { - // Add delay to respect API rate limits + // Add delay to respect API rate limits (reduced from 100ms to 50ms for faster processing) if (batchIndex > 0) { - await new Promise(resolve => setTimeout(resolve, 100)); + await new Promise(resolve => setTimeout(resolve, 50)); } // Enrich metadata if enabled diff --git a/backend/src/services/rag/embeddingService.ts 
b/backend/src/services/rag/embeddingService.ts index 97b5009..9e2a430 100644 --- a/backend/src/services/rag/embeddingService.ts +++ b/backend/src/services/rag/embeddingService.ts @@ -3,7 +3,7 @@ import { vectorDatabaseService } from '../vectorDatabaseService'; import { VectorDatabaseModel } from '../../models/VectorDatabaseModel'; import type { ProcessingChunk } from './types'; -const MAX_CONCURRENT_EMBEDDINGS = 5; +const MAX_CONCURRENT_EMBEDDINGS = 10; // Increased from 5 to 10 for faster processing const STORE_BATCH_SIZE = 20; /** @@ -22,9 +22,9 @@ export async function generateEmbeddingsWithRateLimit( const batchPromises = batch.map(async (chunk, batchIndex) => { try { - // Add delay between API calls + // Add delay between API calls (reduced from 200ms to 50ms for faster processing) if (batchIndex > 0) { - await new Promise(resolve => setTimeout(resolve, 200)); + await new Promise(resolve => setTimeout(resolve, 50)); } const embedding = await vectorDatabaseService.generateEmbeddings(chunk.content);