Add multi-tenant architecture and advanced document parsing capabilities

This commit is contained in:
Jonathan Pressnell
2025-08-07 16:22:28 -04:00
parent fbfe940a45
commit a4877aaa7d
7 changed files with 875 additions and 12 deletions

129
app/models/tenant.py Normal file
View File

@@ -0,0 +1,129 @@
"""
Tenant models for multi-company support in the Virtual Board Member AI System.
"""
from datetime import datetime
from typing import Optional
from sqlalchemy import Column, String, DateTime, Boolean, Text, Integer, ForeignKey
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.orm import relationship
import uuid
import enum
from app.core.database import Base
class TenantStatus(str, enum.Enum):
    """Lifecycle states a tenant can be in.

    Members are also ``str`` instances, so they compare equal to the raw
    string values listed below.
    """

    # Tenant is live.
    ACTIVE = "active"
    # Tenant access has been suspended.
    SUSPENDED = "suspended"
    # Tenant has been created but is not yet active.
    PENDING = "pending"
    # Tenant is no longer in use.
    INACTIVE = "inactive"
class TenantTier(str, enum.Enum):
    """Subscription tiers a tenant can be assigned.

    Members are also ``str`` instances, so they compare equal to the raw
    string values listed below.
    """

    BASIC = "basic"
    PROFESSIONAL = "professional"
    ENTERPRISE = "enterprise"
    CUSTOM = "custom"
class Tenant(Base):
    """Tenant model for multi-company support.

    One row per customer company. Besides identification, contact and
    subscription columns it carries three JSONB blobs (``settings``,
    ``features_enabled``, ``compliance_frameworks``) plus helper methods for
    reading and updating the first two.

    The JSONB columns are plain (non-mutable-tracked) types, so the helper
    methods always assign a *new* dict to the attribute instead of mutating
    the existing one; an in-place ``dict[key] = value`` would not be noticed
    by the SQLAlchemy unit of work and would never be written back.
    """

    __tablename__ = "tenants"

    # Primary key
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)

    # Tenant identification
    name = Column(String(255), nullable=False, unique=True)
    slug = Column(String(100), nullable=False, unique=True)  # URL-friendly identifier
    domain = Column(String(255), nullable=True, unique=True)  # Custom domain

    # Company information
    company_name = Column(String(255), nullable=False)
    company_description = Column(Text, nullable=True)
    industry = Column(String(100), nullable=True)
    company_size = Column(String(50), nullable=True)  # small, medium, large, enterprise

    # Contact information
    primary_contact_name = Column(String(255), nullable=False)
    primary_contact_email = Column(String(255), nullable=False)
    primary_contact_phone = Column(String(50), nullable=True)

    # Subscription and billing.
    # The columns are plain strings, so the defaults are the enum *values*;
    # this keeps freshly created objects consistent with rows loaded from the
    # database (both hold "basic"/"pending", never an enum member).
    tier = Column(String(50), default=TenantTier.BASIC.value, nullable=False)
    status = Column(String(50), default=TenantStatus.PENDING.value, nullable=False)
    subscription_start_date = Column(DateTime, nullable=True)
    subscription_end_date = Column(DateTime, nullable=True)

    # Configuration
    settings = Column(JSONB, nullable=True)  # Tenant-specific settings
    features_enabled = Column(JSONB, nullable=True)  # Feature flags
    storage_quota_gb = Column(Integer, default=10, nullable=False)
    user_limit = Column(Integer, default=10, nullable=False)

    # Security and compliance
    data_retention_days = Column(Integer, default=2555, nullable=False)  # 7 years default
    encryption_level = Column(String(50), default="standard", nullable=False)
    compliance_frameworks = Column(JSONB, nullable=True)  # SOX, GDPR, etc.

    # Timestamps (naive UTC, matching the naive DateTime columns)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
    activated_at = Column(DateTime, nullable=True)

    # Relationships
    users = relationship("User", back_populates="tenant", cascade="all, delete-orphan")
    documents = relationship("Document", back_populates="tenant", cascade="all, delete-orphan")
    commitments = relationship("Commitment", back_populates="tenant", cascade="all, delete-orphan")
    audit_logs = relationship("AuditLog", back_populates="tenant", cascade="all, delete-orphan")

    def __repr__(self):
        """Return a short debugging representation (id, name, company)."""
        return f"<Tenant(id={self.id}, name='{self.name}', company='{self.company_name}')>"

    @property
    def is_active(self) -> bool:
        """Check if tenant is active."""
        return self.status == TenantStatus.ACTIVE

    @property
    def is_suspended(self) -> bool:
        """Check if tenant is suspended."""
        return self.status == TenantStatus.SUSPENDED

    @property
    def has_expired_subscription(self) -> bool:
        """Check if subscription has expired.

        A tenant without an end date is treated as not expired. The
        comparison uses naive UTC "now" against the stored end date.
        """
        if not self.subscription_end_date:
            return False
        return datetime.utcnow() > self.subscription_end_date

    def get_setting(self, key: str, default=None):
        """Get a tenant-specific setting, or ``default`` when it is not set."""
        if not self.settings:
            return default
        return self.settings.get(key, default)

    def set_setting(self, key: str, value):
        """Set a tenant-specific setting.

        Assigns a fresh dict so the change is registered as an attribute
        modification and persisted on the next flush.
        """
        self.settings = {**(self.settings or {}), key: value}

    def is_feature_enabled(self, feature: str) -> bool:
        """Check if a feature is enabled for this tenant.

        Unknown features, and tenants without any feature flags, report
        ``False``. The stored flag is coerced to ``bool``.
        """
        if not self.features_enabled:
            return False
        return bool(self.features_enabled.get(feature, False))

    def _set_feature(self, feature: str, enabled: bool) -> None:
        """Store one feature flag by replacing the whole flags dict."""
        self.features_enabled = {**(self.features_enabled or {}), feature: enabled}

    def enable_feature(self, feature: str):
        """Enable a feature for this tenant."""
        self._set_feature(feature, True)

    def disable_feature(self, feature: str):
        """Disable a feature for this tenant."""
        self._set_feature(feature, False)

View File

@@ -4,8 +4,9 @@ User model for authentication and user management.
from datetime import datetime
from typing import Optional
from sqlalchemy import Column, String, DateTime, Boolean, Text, Enum
from sqlalchemy import Column, String, DateTime, Boolean, Text, Enum, ForeignKey
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship
import uuid
import enum
@@ -58,6 +59,9 @@ class User(Base):
oauth_provider = Column(String(50), nullable=True) # auth0, cognito, etc.
oauth_id = Column(String(255), nullable=True)
# Tenant relationship
tenant_id = Column(UUID(as_uuid=True), ForeignKey("tenants.id"), nullable=False)
# Timestamps
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
@@ -68,6 +72,9 @@ class User(Base):
language = Column(String(10), default="en")
notification_preferences = Column(Text, nullable=True) # JSON string
# Relationships
tenant = relationship("Tenant", back_populates="users")
def __repr__(self) -> str:
    """Return a short debugging representation (id, email, role)."""
    summary = f"<User(id={self.id}, email='{self.email}', role='{self.role}')>"
    return summary

View File

@@ -0,0 +1,482 @@
"""
Advanced document processing service with table and graphics extraction capabilities.
"""
import asyncio
import logging
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
import io
import pdfplumber
import fitz # PyMuPDF
import pandas as pd
import numpy as np
from PIL import Image
import cv2
import pytesseract
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
import tabula
import camelot
from app.core.config import settings
from app.models.document import Document, DocumentType
from app.models.tenant import Tenant
logger = logging.getLogger(__name__)
class DocumentProcessor:
"""Advanced document processor with table and graphics extraction."""
def __init__(self, tenant: Tenant):
self.tenant = tenant
self.supported_formats = {
'.pdf': self._process_pdf,
'.pptx': self._process_powerpoint,
'.xlsx': self._process_excel,
'.docx': self._process_word,
'.txt': self._process_text
}
async def process_document(self, file_path: Path, document: Document) -> Dict[str, Any]:
"""Process a document and extract all content including tables and graphics."""
try:
file_extension = file_path.suffix.lower()
if file_extension not in self.supported_formats:
raise ValueError(f"Unsupported file format: {file_extension}")
processor = self.supported_formats[file_extension]
result = await processor(file_path, document)
# Add tenant-specific processing
result['tenant_id'] = str(self.tenant.id)
result['tenant_name'] = self.tenant.name
return result
except Exception as e:
logger.error(f"Error processing document {file_path}: {str(e)}")
raise
async def _process_pdf(self, file_path: Path, document: Document) -> Dict[str, Any]:
    """Run every PDF extractor over a file and merge their findings.

    pdfplumber supplies per-page text, tables and image descriptors; the
    PyMuPDF, tabula and camelot helpers then append further images, chart
    elements and tables to the same result dict.

    Args:
        file_path: Path of the PDF to read.
        document: Document record (not used by this method).

    Returns:
        Dict with ``text_content``, ``tables``, ``charts``, ``images``,
        ``metadata`` (page count and file size) and ``structure``.

    Raises:
        Exception: Failures from pdfplumber or ``stat`` are logged and
            re-raised.
    """
    result = dict(
        text_content=[],
        tables=[],
        charts=[],
        images=[],
        metadata={},
        structure={},
    )
    # (key in the aggregate result, key in the per-page result)
    merge_plan = (
        ('text_content', 'text'),
        ('tables', 'tables'),
        ('charts', 'charts'),
        ('images', 'images'),
    )

    try:
        # Use pdfplumber for text and table extraction
        with pdfplumber.open(file_path) as pdf:
            metadata = result['metadata']
            metadata['pages'] = len(pdf.pages)
            metadata['file_size'] = file_path.stat().st_size

            for index, page in enumerate(pdf.pages):
                page_content = await self._extract_pdf_page_content(page, index)
                for target, source in merge_plan:
                    result[target].extend(page_content[source])

        # Use PyMuPDF for additional graphics extraction
        await self._extract_pdf_graphics(file_path, result)
        # Use tabula for complex table extraction
        await self._extract_pdf_tables_tabula(file_path, result)
        # Use camelot for lattice table extraction
        await self._extract_pdf_tables_camelot(file_path, result)
    except Exception as e:
        logger.error(f"Error processing PDF {file_path}: {str(e)}")
        raise

    return result
async def _extract_pdf_page_content(self, page, page_num: int) -> Dict[str, Any]:
"""Extract content from a single PDF page."""
page_result = {
'text': [],
'tables': [],
'charts': [],
'images': []
}
# Extract text
text = page.extract_text()
if text:
page_result['text'].append({
'page': page_num + 1,
'content': text,
'bbox': page.bbox
})
# Extract tables using pdfplumber
tables = page.extract_tables()
for table_num, table in enumerate(tables):
if table and len(table) > 1: # Ensure table has content
table_data = {
'page': page_num + 1,
'table_number': table_num + 1,
'data': table,
'rows': len(table),
'columns': len(table[0]) if table else 0,
'extraction_method': 'pdfplumber'
}
page_result['tables'].append(table_data)
# Extract images
images = page.images
for img_num, img in enumerate(images):
image_data = {
'page': page_num + 1,
'image_number': img_num + 1,
'bbox': img['bbox'],
'width': img['width'],
'height': img['height'],
'type': img.get('name', 'unknown')
}
page_result['images'].append(image_data)
return page_result
async def _extract_pdf_graphics(self, file_path: Path, result: Dict[str, Any]):
    """Append image and line-drawing descriptors found by PyMuPDF.

    Best-effort: any failure is logged and swallowed so the other PDF
    extractors can still contribute to ``result``.

    Args:
        file_path: Path of the PDF to open.
        result: Aggregate result dict; entries are appended to
            ``result['images']`` and ``result['charts']`` in place.
    """
    try:
        doc = fitz.open(file_path)
        try:
            for page_index, page in enumerate(doc):
                page_no = page_index + 1

                # Extract images
                for img_number, img in enumerate(page.get_images(), start=1):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:  # GRAY or RGB
                        # Some pixmaps have no colorspace; report None
                        # instead of aborting the whole extraction.
                        colorspace = pix.colorspace
                        result['images'].append({
                            'page': page_no,
                            'image_number': img_number,
                            'width': pix.width,
                            'height': pix.height,
                            'colorspace': colorspace.name if colorspace else None,
                            'extraction_method': 'PyMuPDF'
                        })
                    pix = None  # drop the pixmap promptly

                # Extract drawings and shapes.
                # A drawing's 'type' is its fill/stroke mode; the actual
                # draw commands are tuples in 'items' whose first element
                # names the command ('l' = line).
                for drawing in page.get_drawings():
                    commands = drawing.get('items') or ()
                    if any(cmd and cmd[0] == 'l' for cmd in commands):
                        result['charts'].append({
                            'page': page_no,
                            'type': 'chart_element',
                            'bbox': drawing.get('rect'),
                            'extraction_method': 'PyMuPDF'
                        })
        finally:
            # Close the document even when extraction fails part-way.
            doc.close()
    except Exception as e:
        logger.error(f"Error extracting PDF graphics: {str(e)}")
async def _extract_pdf_tables_tabula(self, file_path: Path, result: Dict[str, Any]):
    """Append tables found by tabula-py to ``result['tables']``.

    ``tabula.read_pdf`` returns one flat list of DataFrames for the whole
    document (not a list per page), so each element is handled directly as
    a table. The DataFrames carry no page information, hence ``page`` is
    reported as ``None`` and ``table_number`` is the table's one-based
    position in the returned list.

    Best-effort: any failure is logged and swallowed.

    Args:
        file_path: Path of the PDF to read.
        result: Aggregate result dict, extended in place.
    """
    try:
        tables = tabula.read_pdf(str(file_path), pages='all', multiple_tables=True)
        for table_num, table in enumerate(tables, start=1):
            if table.empty:
                continue
            result['tables'].append({
                'page': None,  # not available from tabula's DataFrame output
                'table_number': table_num,
                'data': table.to_dict('records'),
                'rows': len(table),
                'columns': len(table.columns),
                'extraction_method': 'tabula'
            })
    except Exception as e:
        logger.error(f"Error extracting tables with tabula: {str(e)}")
async def _extract_pdf_tables_camelot(self, file_path: Path, result: Dict[str, Any]):
    """Append tables found by camelot-py to ``result['tables']``.

    Each non-empty table is recorded together with the page, order,
    accuracy and whitespace values camelot reports for it. Best-effort:
    any failure is logged and swallowed.

    Args:
        file_path: Path of the PDF to read.
        result: Aggregate result dict, extended in place.
    """
    try:
        for found in camelot.read_pdf(str(file_path), pages='all'):
            frame = found.df
            if frame is None or frame.empty:
                continue
            result['tables'].append({
                'page': found.page,
                'table_number': found.order,
                'data': frame.to_dict('records'),
                'rows': len(frame),
                'columns': len(frame.columns),
                'accuracy': found.accuracy,
                'whitespace': found.whitespace,
                'extraction_method': 'camelot'
            })
    except Exception as e:
        logger.error(f"Error extracting tables with camelot: {str(e)}")
async def _process_powerpoint(self, file_path: Path, document: Document) -> Dict[str, Any]:
    """Extract text, tables, charts and images from every slide of a deck.

    Args:
        file_path: Path of the presentation to open.
        document: Document record (not used by this method).

    Returns:
        Dict with ``text_content``, ``tables``, ``charts``, ``images``,
        ``metadata`` (slide count and file size) and ``structure``.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    result = dict(
        text_content=[],
        tables=[],
        charts=[],
        images=[],
        metadata={},
        structure={},
    )
    # (key in the aggregate result, key in the per-slide result)
    merge_plan = (
        ('text_content', 'text'),
        ('tables', 'tables'),
        ('charts', 'charts'),
        ('images', 'images'),
    )

    try:
        deck = Presentation(file_path)
        metadata = result['metadata']
        metadata['slides'] = len(deck.slides)
        metadata['file_size'] = file_path.stat().st_size

        for index, slide in enumerate(deck.slides):
            slide_content = await self._extract_powerpoint_slide_content(slide, index)
            for target, source in merge_plan:
                result[target].extend(slide_content[source])
    except Exception as e:
        logger.error(f"Error processing PowerPoint {file_path}: {str(e)}")
        raise

    return result
async def _extract_powerpoint_slide_content(self, slide, slide_num: int) -> Dict[str, Any]:
    """Collect text, tables, charts and images from the shapes on a slide.

    Every shape with non-blank text yields a text entry. Independently of
    that, a shape whose type is TABLE, CHART or PICTURE is handed to the
    matching extractor.

    Args:
        slide: Slide object exposing ``shapes``.
        slide_num: Zero-based slide index; reported one-based in the output.

    Returns:
        Dict with ``text``, ``tables``, ``charts`` and ``images`` lists.
    """
    slide_result = {
        'text': [],
        'tables': [],
        'charts': [],
        'images': []
    }
    # (shape type, extractor coroutine, destination list); checked in order,
    # first match wins.
    extractors = (
        (MSO_SHAPE_TYPE.TABLE, self._extract_powerpoint_table, 'tables'),
        (MSO_SHAPE_TYPE.CHART, self._extract_powerpoint_chart, 'charts'),
        (MSO_SHAPE_TYPE.PICTURE, self._extract_powerpoint_image, 'images'),
    )

    for shape in slide.shapes:
        # Extract text
        if hasattr(shape, 'text') and shape.text.strip():
            slide_result['text'].append({
                'slide': slide_num + 1,
                'content': shape.text.strip(),
                'shape_type': str(shape.shape_type),
                'bbox': (shape.left, shape.top, shape.width, shape.height)
            })

        # Extract tables / charts / images
        shape_kind = shape.shape_type
        for kind, extract, bucket in extractors:
            if shape_kind == kind:
                slide_result[bucket].append(await extract(shape, slide_num))
                break

    return slide_result
async def _extract_powerpoint_table(self, shape, slide_num: int) -> Dict[str, Any]:
"""Extract table data from PowerPoint shape."""
table = shape.table
table_data = []
for row in table.rows:
row_data = []
for cell in row.cells:
row_data.append(cell.text.strip())
table_data.append(row_data)
return {
'slide': slide_num + 1,
'table_number': 1, # Assuming one table per slide for now
'data': table_data,
'rows': len(table_data),
'columns': len(table_data[0]) if table_data else 0,
'extraction_method': 'python-pptx'
}
async def _extract_powerpoint_chart(self, shape, slide_num: int) -> Dict[str, Any]:
"""Extract chart data from PowerPoint shape."""
chart = shape.chart
chart_data = {
'slide': slide_num + 1,
'chart_type': str(chart.chart_type),
'title': chart.chart_title.text if chart.chart_title else '',
'bbox': (shape.left, shape.top, shape.width, shape.height),
'extraction_method': 'python-pptx'
}
# Extract chart data if available
if hasattr(chart, 'part') and chart.part:
# This would require additional processing to extract actual chart data
chart_data['has_data'] = True
return chart_data
async def _extract_powerpoint_image(self, shape, slide_num: int) -> Dict[str, Any]:
"""Extract image data from PowerPoint shape."""
image = shape.image
image_data = {
'slide': slide_num + 1,
'image_number': 1, # Assuming one image per shape
'width': shape.width,
'height': shape.height,
'bbox': (shape.left, shape.top, shape.width, shape.height),
'extraction_method': 'python-pptx'
}
return image_data
async def _process_excel(self, file_path: Path, document: Document) -> Dict[str, Any]:
    """Read every sheet of a workbook and report non-empty ones as tables.

    The workbook is opened once through a context manager, so the file
    handle is released even on failure, and each sheet is read from that
    already-open workbook instead of re-opening the file per sheet.

    Args:
        file_path: Path of the workbook to read.
        document: Document record (not used by this method).

    Returns:
        Dict with ``text_content``, ``tables`` (one entry per non-empty
        sheet), ``charts``, ``images``, ``metadata`` (sheet names and file
        size) and ``structure``.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    result = {
        'text_content': [],
        'tables': [],
        'charts': [],
        'images': [],
        'metadata': {},
        'structure': {}
    }

    try:
        with pd.ExcelFile(file_path) as workbook:
            sheet_names = workbook.sheet_names
            result['metadata']['sheets'] = sheet_names
            result['metadata']['file_size'] = file_path.stat().st_size

            for sheet_name in sheet_names:
                df = pd.read_excel(workbook, sheet_name=sheet_name)
                if df.empty:
                    continue
                result['tables'].append({
                    'sheet': sheet_name,
                    'table_number': 1,
                    'data': df.to_dict('records'),
                    'rows': len(df),
                    'columns': len(df.columns),
                    'extraction_method': 'pandas'
                })
    except Exception as e:
        logger.error(f"Error processing Excel {file_path}: {str(e)}")
        raise

    return result
async def _process_word(self, file_path: Path, document: Document) -> Dict[str, Any]:
"""Process Word document."""
# TODO: Implement Word document processing
return {
'text_content': [],
'tables': [],
'charts': [],
'images': [],
'metadata': {},
'structure': {}
}
async def _process_text(self, file_path: Path, document: Document) -> Dict[str, Any]:
"""Process text file."""
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
return {
'text_content': [{'content': content, 'page': 1}],
'tables': [],
'charts': [],
'images': [],
'metadata': {'file_size': file_path.stat().st_size},
'structure': {}
}
except Exception as e:
logger.error(f"Error processing text file {file_path}: {str(e)}")
raise
def analyze_table_structure(self, table_data: List[List[str]]) -> Dict[str, Any]:
"""Analyze table structure and extract metadata."""
if not table_data or len(table_data) < 2:
return {}
analysis = {
'header_row': table_data[0] if table_data else [],
'data_rows': len(table_data) - 1,
'columns': len(table_data[0]) if table_data else 0,
'column_types': [],
'has_numeric_data': False,
'has_date_data': False
}
# Analyze column types
if len(table_data) > 1:
for col_idx in range(len(table_data[0])):
col_values = [row[col_idx] for row in table_data[1:] if len(row) > col_idx]
col_type = self._infer_column_type(col_values)
analysis['column_types'].append(col_type)
if col_type == 'numeric':
analysis['has_numeric_data'] = True
elif col_type == 'date':
analysis['has_date_data'] = True
return analysis
def _infer_column_type(self, values: List[str]) -> str:
"""Infer the data type of a column."""
if not values:
return 'text'
numeric_count = 0
date_count = 0
for value in values:
if value and value.strip():
# Check if numeric
try:
float(value.replace(',', '').replace('$', '').replace('%', ''))
numeric_count += 1
except ValueError:
pass
# Check if date (basic check)
if any(separator in value for separator in ['/', '-', '.']):
date_count += 1
total = len([v for v in values if v and v.strip()])
if total == 0:
return 'text'
numeric_ratio = numeric_count / total
date_ratio = date_count / total
if numeric_ratio > 0.8:
return 'numeric'
elif date_ratio > 0.8:
return 'date'
else:
return 'text'