Add multi-tenant architecture and advanced document parsing capabilities

2025-08-07 16:22:28 -04:00
parent fbfe940a45
commit a4877aaa7d
7 changed files with 875 additions and 12 deletions
--- a/app/models/tenant.py
+++ b/app/models/tenant.py
@@ -0,0 +1,129 @@
+"""
+Tenant models for multi-company support in the Virtual Board Member AI System.
+"""
+from datetime import datetime
+from typing import Optional
+from sqlalchemy import Column, String, DateTime, Boolean, Text, Integer, ForeignKey
+from sqlalchemy.dialects.postgresql import UUID, JSONB
+from sqlalchemy.orm import relationship
+import uuid
+import enum
+from app.core.database import Base
+
+
+class TenantStatus(str, enum.Enum):
+    """Tenant status enumeration; the ``str`` mixin makes members compare equal to their string values."""
+    ACTIVE = "active"
+    SUSPENDED = "suspended"
+    PENDING = "pending"
+    INACTIVE = "inactive"
+
+
+class TenantTier(str, enum.Enum):
+    """Tenant subscription tier; the ``str`` mixin makes members compare equal to their string values."""
+    BASIC = "basic"
+    PROFESSIONAL = "professional"
+    ENTERPRISE = "enterprise"
+    CUSTOM = "custom"
+
+
+class Tenant(Base):
+    """Tenant (company) record that users, documents, commitments and audit logs hang off.
+
+    ``settings`` and ``features_enabled`` are plain JSONB dicts, which the ORM does not watch
+    for in-place changes; the mutator helpers therefore always assign a fresh dict.
+    """
+    __tablename__ = "tenants"
+
+    # Primary key
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+
+    # Tenant identification
+    name = Column(String(255), nullable=False, unique=True)
+    slug = Column(String(100), nullable=False, unique=True)  # URL-friendly identifier
+    domain = Column(String(255), nullable=True, unique=True)  # Custom domain
+
+    # Company information
+    company_name = Column(String(255), nullable=False)
+    company_description = Column(Text, nullable=True)
+    industry = Column(String(100), nullable=True)
+    company_size = Column(String(50), nullable=True)  # small, medium, large, enterprise
+
+    # Contact information
+    primary_contact_name = Column(String(255), nullable=False)
+    primary_contact_email = Column(String(255), nullable=False)
+    primary_contact_phone = Column(String(50), nullable=True)
+
+    # Subscription and billing
+    tier = Column(String(50), default=TenantTier.BASIC, nullable=False)
+    status = Column(String(50), default=TenantStatus.PENDING, nullable=False)
+    subscription_start_date = Column(DateTime, nullable=True)
+    subscription_end_date = Column(DateTime, nullable=True)
+
+    # Configuration
+    settings = Column(JSONB, nullable=True)  # Tenant-specific settings
+    features_enabled = Column(JSONB, nullable=True)  # Feature flags
+    storage_quota_gb = Column(Integer, default=10, nullable=False)
+    user_limit = Column(Integer, default=10, nullable=False)
+
+    # Security and compliance
+    data_retention_days = Column(Integer, default=2555, nullable=False)  # 7 years default
+    encryption_level = Column(String(50), default="standard", nullable=False)
+    compliance_frameworks = Column(JSONB, nullable=True)  # SOX, GDPR, etc.
+
+    # Timestamps
+    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    activated_at = Column(DateTime, nullable=True)
+
+    # Relationships
+    users = relationship("User", back_populates="tenant", cascade="all, delete-orphan")
+    documents = relationship("Document", back_populates="tenant", cascade="all, delete-orphan")
+    commitments = relationship("Commitment", back_populates="tenant", cascade="all, delete-orphan")
+    audit_logs = relationship("AuditLog", back_populates="tenant", cascade="all, delete-orphan")
+
+    def __repr__(self):
+        return f"<Tenant(id={self.id}, name='{self.name}', company='{self.company_name}')>"
+
+    @property
+    def is_active(self) -> bool:
+        """Check if tenant is active (``status`` equals ``TenantStatus.ACTIVE``)."""
+        return self.status == TenantStatus.ACTIVE
+
+    @property
+    def is_suspended(self) -> bool:
+        """Check if tenant is suspended (``status`` equals ``TenantStatus.SUSPENDED``)."""
+        return self.status == TenantStatus.SUSPENDED
+
+    @property
+    def has_expired_subscription(self) -> bool:
+        """Check if the subscription end date has passed; no end date means not expired."""
+        end_date = self.subscription_end_date
+        return end_date is not None and datetime.utcnow() > end_date
+
+    def get_setting(self, key: str, default=None):
+        """Get a tenant-specific setting, or ``default`` when it (or the whole dict) is missing."""
+        return (self.settings or {}).get(key, default)
+
+    def set_setting(self, key: str, value):
+        """Set a tenant-specific setting."""
+        # Assign a new dict rather than mutating the loaded one: an in-place change
+        # to a plain JSONB value is not detected and would not be written on flush.
+        self.settings = {**(self.settings or {}), key: value}
+
+    def is_feature_enabled(self, feature: str) -> bool:
+        """Check if a feature is enabled for this tenant; unknown features are disabled."""
+        return bool((self.features_enabled or {}).get(feature, False))
+
+    def enable_feature(self, feature: str):
+        """Enable a feature for this tenant."""
+        self._set_feature(feature, True)
+
+    def disable_feature(self, feature: str):
+        """Disable a feature for this tenant."""
+        self._set_feature(feature, False)
+
+    def _set_feature(self, feature: str, enabled: bool):
+        """Store one feature flag by assigning a new ``features_enabled`` dict."""
+        # Same reason as set_setting: rebinding the attribute marks the column dirty.
+        self.features_enabled = {**(self.features_enabled or {}), feature: enabled}
--- a/app/models/user.py
+++ b/app/models/user.py
@@ -4,8 +4,9 @@ User model for authentication and user management.

 from datetime import datetime
 from typing import Optional
-from sqlalchemy import Column, String, DateTime, Boolean, Text, Enum
+from sqlalchemy import Column, String, DateTime, Boolean, Text, Enum, ForeignKey
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship
 import uuid
 import enum

@@ -58,6 +59,9 @@ class User(Base):
    oauth_provider = Column(String(50), nullable=True)  # auth0, cognito, etc.
    oauth_id = Column(String(255), nullable=True)
    
+    # Tenant relationship
+    tenant_id = Column(UUID(as_uuid=True), ForeignKey("tenants.id"), nullable=False)
+    
    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
@@ -68,6 +72,9 @@ class User(Base):
    language = Column(String(10), default="en")
    notification_preferences = Column(Text, nullable=True)  # JSON string
    
+    # Relationships
+    tenant = relationship("Tenant", back_populates="users")
+    
    def __repr__(self) -> str:
        return f"<User(id={self.id}, email='{self.email}', role='{self.role}')>"
    
--- a/app/services/document_processor.py
+++ b/app/services/document_processor.py
@@ -0,0 +1,482 @@
+"""
+Advanced document processing service with table and graphics extraction capabilities.
+"""
+import asyncio
+import logging
+from typing import Dict, List, Optional, Tuple, Any
+from pathlib import Path
+import io
+
+import pdfplumber
+import fitz  # PyMuPDF
+import pandas as pd
+import numpy as np
+from PIL import Image
+import cv2
+import pytesseract
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
+import tabula
+import camelot
+
+from app.core.config import settings
+from app.models.document import Document, DocumentType
+from app.models.tenant import Tenant
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentProcessor:
+    """Advanced document processor with table and graphics extraction."""
+    
+    def __init__(self, tenant: Tenant):
+        """Bind the processor to ``tenant`` and register one handler per file suffix."""
+        handlers = (
+            ('.pdf', self._process_pdf), ('.pptx', self._process_powerpoint),
+            ('.xlsx', self._process_excel), ('.docx', self._process_word),
+            ('.txt', self._process_text),
+        )
+        self.tenant = tenant
+        self.supported_formats = dict(handlers)
+    
+    async def process_document(self, file_path: Path, document: Document) -> Dict[str, Any]:
+        """Dispatch ``file_path`` to the handler registered for its lower-cased suffix.
+
+        The handler's result dict is returned with ``tenant_id`` and ``tenant_name``
+        added. An unregistered suffix raises ``ValueError``; every failure is logged
+        and re-raised.
+        """
+        try:
+            suffix = file_path.suffix.lower()
+            if suffix in self.supported_formats:
+                handler = self.supported_formats[suffix]
+            else:
+                raise ValueError(f"Unsupported file format: {suffix}")
+
+            extracted = await handler(file_path, document)
+            extracted.update(tenant_id=str(self.tenant.id), tenant_name=self.tenant.name)
+        except Exception as exc:
+            logger.error(f"Error processing document {file_path}: {str(exc)}")
+            raise
+        return extracted
+    
+    async def _process_pdf(self, file_path: Path, document: Document) -> Dict[str, Any]:
+        """Extract text, tables, charts and images from a PDF.
+
+        pdfplumber is walked page by page first; the PyMuPDF, tabula and camelot
+        helpers are then awaited in that order, each receiving the same ``result``
+        dict. Any exception raised here is logged and re-raised.
+        """
+        result = {
+            section: [] for section in ('text_content', 'tables', 'charts', 'images')
+        }
+        result['metadata'] = {}
+        result['structure'] = {}
+        page_fields = (('text_content', 'text'), ('tables', 'tables'),
+                       ('charts', 'charts'), ('images', 'images'))
+
+        try:
+            # pdfplumber pass: text and table extraction
+            with pdfplumber.open(file_path) as pdf:
+                metadata = result['metadata']
+                metadata['pages'] = len(pdf.pages)
+                metadata['file_size'] = file_path.stat().st_size
+
+                for index, pdf_page in enumerate(pdf.pages):
+                    extracted = await self._extract_pdf_page_content(pdf_page, index)
+                    for target, source in page_fields:
+                        result[target].extend(extracted[source])
+
+            # Follow-up passes, awaited one after another
+            for extra_pass in (
+                self._extract_pdf_graphics,        # PyMuPDF graphics
+                self._extract_pdf_tables_tabula,   # tabula tables
+                self._extract_pdf_tables_camelot,  # camelot tables
+            ):
+                await extra_pass(file_path, result)
+        except Exception as exc:
+            logger.error(f"Error processing PDF {file_path}: {str(exc)}")
+            raise
+        return result
+    
+    async def _extract_pdf_page_content(self, page, page_num: int) -> Dict[str, Any]:
+        """Collect text, tables and image placements from one pdfplumber page.
+
+        Args:
+            page: Object providing ``extract_text()``, ``extract_tables()``,
+                ``images`` and ``bbox``.
+            page_num: Zero-based page index; reported one-based in the output.
+
+        Returns:
+            Dict with ``text``, ``tables``, ``charts`` and ``images`` lists;
+            ``charts`` is always left empty by this method.
+        """
+        page_label = page_num + 1
+        page_result = {'text': [], 'tables': [], 'charts': [], 'images': []}
+
+        text = page.extract_text()
+        if text:
+            page_result['text'].append(
+                {'page': page_label, 'content': text, 'bbox': page.bbox}
+            )
+
+        # Keep only tables with at least two rows
+        for table_num, table in enumerate(page.extract_tables(), start=1):
+            if table and len(table) > 1:
+                page_result['tables'].append({
+                    'page': page_label,
+                    'table_number': table_num,
+                    'data': table,
+                    'rows': len(table),
+                    'columns': len(table[0]),
+                    'extraction_method': 'pdfplumber'
+                })
+
+        for img_num, img in enumerate(page.images, start=1):
+            # pdfplumber image dicts carry x0/top/x1/bottom coordinates rather
+            # than a ready-made 'bbox' entry, so build the box when it is absent.
+            bbox = img.get('bbox') or (img['x0'], img['top'], img['x1'], img['bottom'])
+            page_result['images'].append({
+                'page': page_label,
+                'image_number': img_num,
+                'bbox': bbox,
+                'width': img['width'],
+                'height': img['height'],
+                'type': img.get('name', 'unknown')
+            })
+        return page_result
+    
+    async def _extract_pdf_graphics(self, file_path: Path, result: Dict[str, Any]):
+        """Append PyMuPDF-derived image and line-drawing records to ``result``.
+
+        Best effort: any failure is logged and swallowed, so records already in
+        ``result`` are kept. The document handle is closed whether or not the
+        extraction succeeds.
+        """
+        try:
+            doc = fitz.open(file_path)
+            try:
+                for page_label, page in enumerate(doc, start=1):
+                    # Embedded images
+                    for img_index, img in enumerate(page.get_images(), start=1):
+                        pix = fitz.Pixmap(doc, img[0])  # img[0] is the image xref
+                        if pix.n - pix.alpha < 4:  # GRAY or RGB
+                            # The colorspace can be None (e.g. for image masks)
+                            colorspace = pix.colorspace.name if pix.colorspace else None
+                            result['images'].append({
+                                'page': page_label,
+                                'image_number': img_index,
+                                'width': pix.width,
+                                'height': pix.height,
+                                'colorspace': colorspace,
+                                'extraction_method': 'PyMuPDF'
+                            })
+
+                    for drawing in page.get_drawings():
+                        # A path's 'type' is its fill/stroke mode; the draw commands
+                        # live in 'items', where straight lines are tagged 'l'.
+                        if any(item[0] == 'l' for item in drawing.get('items', ())):
+                            rect = drawing.get('rect')
+                            result['charts'].append({
+                                'page': page_label,
+                                'type': 'chart_element',
+                                'bbox': tuple(rect) if rect is not None else None,
+                                'extraction_method': 'PyMuPDF'
+                            })
+            finally:
+                doc.close()  # release the file handle even when extraction fails
+        except Exception as e:
+            logger.error(f"Error extracting PDF graphics: {str(e)}")
+    
+    async def _extract_pdf_tables_tabula(self, file_path: Path, result: Dict[str, Any]):
+        """Append tables found by tabula-py to ``result['tables']``; failures are logged and swallowed."""
+        try:
+            # With multiple_tables=True, read_pdf returns one flat list of
+            # DataFrames for the whole document. Iterating a DataFrame yields
+            # its column labels, so the frames must not be looped over again.
+            frames = tabula.read_pdf(str(file_path), pages='all', multiple_tables=True)
+
+            for table_num, frame in enumerate(frames, start=1):
+                if not frame.empty:
+                    result['tables'].append({
+                        'page': None,  # the flat list carries no source page number
+                        'table_number': table_num,
+                        'data': frame.to_dict('records'),
+                        'rows': len(frame),
+                        'columns': len(frame.columns),
+                        'extraction_method': 'tabula'
+                    })
+        except Exception as e:
+            logger.error(f"Error extracting tables with tabula: {str(e)}")
+    
+    async def _extract_pdf_tables_camelot(self, file_path: Path, result: Dict[str, Any]):
+        """Append camelot-detected tables to ``result['tables']``; failures are logged and swallowed."""
+        try:
+            for found in camelot.read_pdf(str(file_path), pages='all'):
+                frame = found.df
+                # Nothing to record for a missing or empty frame
+                if frame is None or frame.empty:
+                    continue
+
+                result['tables'].append({
+                    'page': found.page,
+                    'table_number': found.order,
+                    'data': frame.to_dict('records'),
+                    'rows': len(frame),
+                    'columns': len(frame.columns),
+                    'accuracy': found.accuracy,
+                    'whitespace': found.whitespace,
+                    'extraction_method': 'camelot'
+                })
+        except Exception as exc:
+            logger.error(f"Error extracting tables with camelot: {str(exc)}")
+    
+    async def _process_powerpoint(self, file_path: Path, document: Document) -> Dict[str, Any]:
+        """Extract text, tables, charts and images from every slide of a .pptx file.
+
+        Slide count and file size are stored under ``metadata``. Any exception is
+        logged and re-raised.
+        """
+        result = {
+            section: [] for section in ('text_content', 'tables', 'charts', 'images')
+        }
+        result['metadata'] = {}
+        result['structure'] = {}
+        slide_fields = (('text_content', 'text'), ('tables', 'tables'),
+                        ('charts', 'charts'), ('images', 'images'))
+
+        try:
+            deck = Presentation(file_path)
+            result['metadata']['slides'] = len(deck.slides)
+            result['metadata']['file_size'] = file_path.stat().st_size
+
+            for index, slide in enumerate(deck.slides):
+                extracted = await self._extract_powerpoint_slide_content(slide, index)
+                for target, source in slide_fields:
+                    result[target].extend(extracted[source])
+        except Exception as exc:
+            logger.error(f"Error processing PowerPoint {file_path}: {str(exc)}")
+            raise
+
+        return result
+    
+    async def _extract_powerpoint_slide_content(self, slide, slide_num: int) -> Dict[str, Any]:
+        """Sort the shapes of one slide into text, table, chart and image records.
+
+        A shape with non-blank ``text`` yields a text record; independently, a
+        table, chart or picture shape yields one record of that kind.
+
+        Args:
+            slide: Slide whose ``shapes`` are iterated.
+            slide_num: Zero-based slide index; reported one-based in the output.
+
+        Returns:
+            Dict with ``text``, ``tables``, ``charts`` and ``images`` lists.
+        """
+        slide_result = {'text': [], 'tables': [], 'charts': [], 'images': []}
+        # (shape type, result bucket, extractor) triples, checked in this order
+        extractors = (
+            (MSO_SHAPE_TYPE.TABLE, 'tables', self._extract_powerpoint_table),
+            (MSO_SHAPE_TYPE.CHART, 'charts', self._extract_powerpoint_chart),
+            (MSO_SHAPE_TYPE.PICTURE, 'images', self._extract_powerpoint_image),
+        )
+
+        for shape in slide.shapes:
+            if hasattr(shape, 'text') and shape.text.strip():
+                slide_result['text'].append({
+                    'slide': slide_num + 1,
+                    'content': shape.text.strip(),
+                    'shape_type': str(shape.shape_type),
+                    'bbox': (shape.left, shape.top, shape.width, shape.height)
+                })
+
+            for kind, bucket, extract in extractors:
+                if shape.shape_type == kind:
+                    slide_result[bucket].append(await extract(shape, slide_num))
+                    break
+
+        return slide_result
+    
+    async def _extract_powerpoint_table(self, shape, slide_num: int) -> Dict[str, Any]:
+        """Read the cell text of the table held by ``shape`` into a list of rows.
+
+        Cell text is stripped. ``table_number`` is always reported as 1.
+        """
+        grid = [
+            [cell.text.strip() for cell in row.cells]
+            for row in shape.table.rows
+        ]
+        column_count = len(grid[0]) if grid else 0
+
+        return {
+            'slide': slide_num + 1,
+            'table_number': 1,  # Assuming one table per slide for now
+            'data': grid,
+            'rows': len(grid),
+            'columns': column_count,
+            'extraction_method': 'python-pptx'
+        }
+    
+    async def _extract_powerpoint_chart(self, shape, slide_num: int) -> Dict[str, Any]:
+        """Describe the chart held by ``shape``: type, title text ('' if none) and position."""
+        chart = shape.chart
+        # The title text lives on ``chart_title.text_frame``; ``has_title`` is checked
+        # first because reading ``chart_title`` on an untitled chart adds a title.
+        has_title_text = chart.has_title and chart.chart_title.has_text_frame
+
+        chart_data = {
+            'slide': slide_num + 1,
+            'chart_type': str(chart.chart_type),
+            'title': chart.chart_title.text_frame.text if has_title_text else '',
+            'bbox': (shape.left, shape.top, shape.width, shape.height),
+            'extraction_method': 'python-pptx'
+        }
+        # Only records that a chart part exists; the plotted values are not read here
+        if hasattr(chart, 'part') and chart.part:
+            chart_data['has_data'] = True
+        return chart_data
+    
+    async def _extract_powerpoint_image(self, shape, slide_num: int) -> Dict[str, Any]:
+        """Describe the placement of a picture shape.
+
+        Only geometry is reported, so ``shape.image`` is deliberately not read:
+        the lookup is unused here and can fail for a picture without image data.
+        """
+        return {
+            'slide': slide_num + 1,
+            'image_number': 1,  # Assuming one image per shape
+            'width': shape.width,
+            'height': shape.height,
+            'bbox': (shape.left, shape.top, shape.width, shape.height),
+            'extraction_method': 'python-pptx'
+        }
+    
+    async def _process_excel(self, file_path: Path, document: Document) -> Dict[str, Any]:
+        """Turn every non-empty worksheet of a workbook into one table record.
+
+        Sheet names and file size are stored under ``metadata``. The workbook is
+        opened once and closed when done. Any exception is logged and re-raised.
+        """
+        result = {
+            'text_content': [],
+            'tables': [],
+            'charts': [],
+            'images': [],
+            'metadata': {},
+            'structure': {}
+        }
+
+        try:
+            # One shared handle: parsed per sheet, closed by the context manager
+            with pd.ExcelFile(file_path) as excel_file:
+                result['metadata']['sheets'] = excel_file.sheet_names
+                result['metadata']['file_size'] = file_path.stat().st_size
+                for sheet_name in excel_file.sheet_names:
+                    df = excel_file.parse(sheet_name)
+                    if not df.empty:
+                        result['tables'].append({
+                            'sheet': sheet_name,
+                            'table_number': 1,
+                            'data': df.to_dict('records'),
+                            'rows': len(df),
+                            'columns': len(df.columns),
+                            'extraction_method': 'pandas'
+                        })
+        except Exception as e:
+            logger.error(f"Error processing Excel {file_path}: {str(e)}")
+            raise
+
+        return result
+    
+    async def _process_word(self, file_path: Path, document: Document) -> Dict[str, Any]:
+        """Return an empty result skeleton; Word parsing is not implemented yet."""
+        # TODO: Implement Word document processing
+
+        skeleton = {
+            section: [] for section in ('text_content', 'tables', 'charts', 'images')
+        }
+        skeleton['metadata'] = {}
+        skeleton['structure'] = {}
+
+        return skeleton
+    
+    async def _process_text(self, file_path: Path, document: Document) -> Dict[str, Any]:
+        """Read a UTF-8 text file and return it as a single page-1 text entry.
+
+        Any exception (missing file, decode error, ...) is logged and re-raised.
+        """
+        try:
+            content = file_path.read_text(encoding='utf-8')
+            size_in_bytes = file_path.stat().st_size
+        except Exception as exc:
+            logger.error(f"Error processing text file {file_path}: {str(exc)}")
+            raise
+
+        result = {section: [] for section in ('text_content', 'tables', 'charts', 'images')}
+        result['text_content'].append({'content': content, 'page': 1})
+        result['metadata'] = {'file_size': size_in_bytes}
+        result['structure'] = {}
+        return result
+    
+    def analyze_table_structure(self, table_data: List[List[str]]) -> Dict[str, Any]:
+        """Summarise a table whose first row is the header.
+
+        Returns an empty dict when there are fewer than two rows. Otherwise reports
+        the header, the data-row and column counts, one inferred type per header
+        column, and whether any column is numeric or date typed.
+
+        Column types come from ``_infer_column_type`` applied to the data rows.
+        """
+        if not table_data or len(table_data) < 2:
+            return {}
+
+        header, *body = table_data
+        column_types = [
+            # Rows shorter than the header contribute nothing to the missing columns
+            self._infer_column_type([row[idx] for row in body if len(row) > idx])
+            for idx in range(len(header))
+        ]
+
+        return {
+            'header_row': header,
+            'data_rows': len(body),
+            'columns': len(header),
+            'column_types': column_types,
+            'has_numeric_data': 'numeric' in column_types,
+            'has_date_data': 'date' in column_types
+        }
+    
+    def _infer_column_type(self, values: List[str]) -> str:
+        """Classify a column as ``'numeric'``, ``'date'`` or ``'text'``.
+
+        Blank and ``None`` cells are ignored; a type wins when more than 80% of the
+        remaining cells match it, numeric first. A cell counts as a date only if it
+        parses with one of the known layouts, so text that merely contains ``/``,
+        ``-`` or ``.`` (for example ``N/A``) stays text.
+        """
+        from datetime import datetime  # function-scope: not imported by this module
+
+        date_layouts = ('%Y-%m-%d', '%Y/%m/%d', '%Y.%m.%d', '%d/%m/%Y', '%m/%d/%Y',
+                        '%d-%m-%Y', '%m-%d-%Y', '%d.%m.%Y', '%d/%m/%y', '%m/%d/%y')
+        # True when ``convert(*args)`` completes without raising ValueError
+        def parses(convert, *args) -> bool:
+            try:
+                convert(*args)
+            except ValueError:
+                return False
+            return True
+
+        cells = [v.strip() for v in (values or ()) if v and v.strip()]
+        if not cells:
+            return 'text'
+
+        numeric_count = date_count = 0
+        for cell in cells:
+            if parses(float, cell.replace(',', '').replace('$', '').replace('%', '')):
+                numeric_count += 1
+            elif any(parses(datetime.strptime, cell, layout) for layout in date_layouts):
+                date_count += 1
+
+        if numeric_count / len(cells) > 0.8:
+            return 'numeric'
+        return 'date' if date_count / len(cells) > 0.8 else 'text'