feat: Complete Week 2 - Document Processing Pipeline

- Implement multi-format document support (PDF, XLSX, CSV, PPTX, TXT, Images)
- Add S3-compatible storage service with tenant isolation
- Create document organization service with hierarchical folders and tagging
- Implement advanced document processing with table/chart extraction
- Add batch upload capabilities (up to 50 files)
- Create comprehensive document validation and security scanning
- Implement automatic metadata extraction and categorization
- Add document version control system
- Update DEVELOPMENT_PLAN.md to mark Week 2 as completed
- Add WEEK2_COMPLETION_SUMMARY.md with detailed implementation notes
- All tests passing (6/6) - 100% success rate
Author: Jonathan Pressnell
Date: 2025-08-08 15:47:43 -04:00
parent a4877aaa7d
commit 1a8ec37bed
19 changed files with 4089 additions and 308 deletions

208
app/core/auth.py Normal file
View File

@@ -0,0 +1,208 @@
"""
Authentication and authorization service for the Virtual Board Member AI System.
"""
import logging
from datetime import datetime, timedelta
from typing import Optional, Dict, Any
from fastapi import HTTPException, Depends, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from jose import JWTError, jwt
from passlib.context import CryptContext
from sqlalchemy.orm import Session
import redis.asyncio as redis
from app.core.config import settings
from app.core.database import get_db
from app.models.user import User
from app.models.tenant import Tenant
logger = logging.getLogger(__name__)
# Security configurations
security = HTTPBearer()
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
class AuthService:
    """Authentication service with tenant-aware authentication.

    Passwords are hashed/verified with the module-level bcrypt ``pwd_context``;
    JWTs are signed with ``settings.SECRET_KEY``; sessions live in Redis under
    ``session:{user_id}:{tenant_id}`` keys. The Redis connection is opened
    lazily on first use because ``__init__`` cannot await an async connect.
    """

    # Lifetime of a Redis-backed session, in seconds (24 hours).
    SESSION_TTL_SECONDS = 86400

    def __init__(self):
        # BUGFIX: the original __init__ called the async ``_init_redis()``
        # without awaiting it, so the coroutine never ran and the client was
        # never connected (and a "never awaited" warning was emitted).
        # Connection is now deferred to the first session operation.
        self.redis_client = None

    async def _init_redis(self) -> None:
        """Initialize the Redis connection for session management."""
        try:
            self.redis_client = redis.from_url(
                settings.REDIS_URL,
                encoding="utf-8",
                decode_responses=True,
            )
            await self.redis_client.ping()
            logger.info("Redis connection established for auth service")
        except Exception as e:
            logger.error(f"Failed to connect to Redis: {e}")
            self.redis_client = None

    async def _ensure_redis(self) -> bool:
        """Connect to Redis on first use; return True when a client is available."""
        if self.redis_client is None:
            await self._init_redis()
        return self.redis_client is not None

    def verify_password(self, plain_password: str, hashed_password: str) -> bool:
        """Verify a plain-text password against its bcrypt hash."""
        return pwd_context.verify(plain_password, hashed_password)

    def get_password_hash(self, password: str) -> str:
        """Hash a plain-text password with bcrypt."""
        return pwd_context.hash(password)

    def create_access_token(self, data: Dict[str, Any], expires_delta: Optional[timedelta] = None) -> str:
        """Create a signed JWT access token.

        Args:
            data: Claims to embed (callers put ``sub`` and ``tenant_id`` here).
            expires_delta: Optional custom lifetime; defaults to
                ``settings.ACCESS_TOKEN_EXPIRE_MINUTES``.

        Returns:
            The encoded JWT string.
        """
        to_encode = data.copy()
        if expires_delta:
            expire = datetime.utcnow() + expires_delta
        else:
            expire = datetime.utcnow() + timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
        to_encode.update({"exp": expire})
        return jwt.encode(to_encode, settings.SECRET_KEY, algorithm=settings.ALGORITHM)

    def verify_token(self, token: str) -> Dict[str, Any]:
        """Verify and decode a JWT token.

        Raises:
            HTTPException: 401 when the token is invalid or expired.
        """
        try:
            return jwt.decode(token, settings.SECRET_KEY, algorithms=[settings.ALGORITHM])
        except JWTError as e:
            logger.error(f"Token verification failed: {e}")
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Could not validate credentials",
                headers={"WWW-Authenticate": "Bearer"},
            )

    async def create_session(self, user_id: str, tenant_id: str, token: str) -> bool:
        """Create a user session in Redis; returns False when Redis is unavailable."""
        if not await self._ensure_redis():
            logger.warning("Redis not available, session not created")
            return False
        try:
            session_key = f"session:{user_id}:{tenant_id}"
            session_data = {
                "user_id": user_id,
                "tenant_id": tenant_id,
                "token": token,
                "created_at": datetime.utcnow().isoformat(),
                "expires_at": (datetime.utcnow() + timedelta(hours=24)).isoformat()
            }
            await self.redis_client.hset(session_key, mapping=session_data)
            await self.redis_client.expire(session_key, self.SESSION_TTL_SECONDS)  # 24 hours
            logger.info(f"Session created for user {user_id} in tenant {tenant_id}")
            return True
        except Exception as e:
            logger.error(f"Failed to create session: {e}")
            return False

    async def get_session(self, user_id: str, tenant_id: str) -> Optional[Dict[str, Any]]:
        """Return the stored session if it exists and has not expired, else None."""
        if not await self._ensure_redis():
            return None
        try:
            session_key = f"session:{user_id}:{tenant_id}"
            session_data = await self.redis_client.hgetall(session_key)
            if session_data:
                expires_at = datetime.fromisoformat(session_data["expires_at"])
                if datetime.utcnow() < expires_at:
                    return session_data
                # Expired: clean up eagerly rather than waiting for the key TTL.
                await self.redis_client.delete(session_key)
            return None
        except Exception as e:
            logger.error(f"Failed to get session: {e}")
            return None

    async def invalidate_session(self, user_id: str, tenant_id: str) -> bool:
        """Delete the user's session from Redis (logout)."""
        if not await self._ensure_redis():
            return False
        try:
            session_key = f"session:{user_id}:{tenant_id}"
            await self.redis_client.delete(session_key)
            logger.info(f"Session invalidated for user {user_id} in tenant {tenant_id}")
            return True
        except Exception as e:
            logger.error(f"Failed to invalidate session: {e}")
            return False
# Global auth service instance
# Module-level singleton shared by the FastAPI dependency functions in this file.
auth_service = AuthService()
async def get_current_user(
    credentials: HTTPAuthorizationCredentials = Depends(security),
    db: Session = Depends(get_db)
) -> User:
    """Resolve the authenticated user (with tenant context) from the bearer token.

    Raises:
        HTTPException: 401 when the token payload is incomplete, the Redis
            session is missing/expired, or no matching user row exists.
    """
    def _unauthorized(reason: str) -> HTTPException:
        # Every failure mode is reported the same way, differing only in detail.
        return HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=reason,
            headers={"WWW-Authenticate": "Bearer"},
        )

    claims = auth_service.verify_token(credentials.credentials)
    subject = claims.get("sub")
    tenant = claims.get("tenant_id")
    if subject is None or tenant is None:
        raise _unauthorized("Invalid token payload")

    # A valid token alone is not enough: a live server-side session must back it.
    if not await auth_service.get_session(subject, tenant):
        raise _unauthorized("Session expired or invalid")

    # Look the user up scoped to the tenant so ids never cross tenant boundaries.
    account = (
        db.query(User)
        .filter(User.id == subject, User.tenant_id == tenant)
        .first()
    )
    if account is None:
        raise _unauthorized("User not found")
    return account
async def get_current_active_user(current_user: User = Depends(get_current_user)) -> User:
    """Reject authenticated-but-deactivated accounts with a 400."""
    if current_user.is_active:
        return current_user
    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail="Inactive user"
    )
def require_role(required_role: str):
    """Build a FastAPI dependency that allows only ``required_role``.

    Users with the ``admin`` role always pass the check.
    """
    def role_checker(current_user: User = Depends(get_current_active_user)) -> User:
        # Admins bypass the role requirement entirely.
        allowed = current_user.role in (required_role, "admin")
        if not allowed:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Insufficient permissions"
            )
        return current_user
    return role_checker
def require_tenant_access():
    """Build a FastAPI dependency enforcing tenant access.

    Currently a pass-through: it only asserts an active authenticated user.
    """
    def tenant_checker(current_user: User = Depends(get_current_active_user)) -> User:
        # Placeholder: tenant-specific authorization checks can be added here.
        return current_user
    return tenant_checker

266
app/core/cache.py Normal file
View File

@@ -0,0 +1,266 @@
"""
Redis caching service for the Virtual Board Member AI System.
"""
import logging
import json
import hashlib
from typing import Optional, Any, Dict, List, Union
from datetime import timedelta
import redis.asyncio as redis
from functools import wraps
import pickle
from app.core.config import settings
logger = logging.getLogger(__name__)
class CacheService:
    """Redis caching service with tenant-aware caching.

    Every key is prefixed ``cache:{tenant_id}:`` so tenants never see each
    other's entries. Values are serialized as JSON when possible, falling
    back to pickle (hence the client is created with
    ``decode_responses=False``). The connection is opened lazily on first use.
    """

    def __init__(self):
        # Connection is created lazily: __init__ cannot await the async
        # connect, so every public coroutine calls _ensure_redis() first.
        self.redis_client = None

    async def _init_redis(self) -> None:
        """Initialize the Redis connection."""
        try:
            self.redis_client = redis.from_url(
                settings.REDIS_URL,
                encoding="utf-8",
                decode_responses=False  # Keep as bytes for pickle support
            )
            await self.redis_client.ping()
            logger.info("Redis connection established for cache service")
        except Exception as e:
            logger.error(f"Failed to connect to Redis: {e}")
            self.redis_client = None

    async def _ensure_redis(self) -> bool:
        """Connect on first use; return True when a usable client exists.

        BUGFIX: previously only ``get``/``set`` attempted the lazy connect
        (and then used the client even when the connect had failed, leaving
        ``redis_client`` as None), while every other method silently no-opped
        when the client was missing. All methods now share this guard.
        """
        if self.redis_client is None:
            await self._init_redis()
        return self.redis_client is not None

    def _generate_key(self, prefix: str, tenant_id: str, *args, **kwargs) -> str:
        """Generate a deterministic, tenant-isolated cache key.

        kwargs are sorted so equivalent calls hash identically. MD5 is used
        only to produce a short, stable key — not for security.
        """
        key_parts = [prefix, tenant_id]
        if args:
            key_parts.extend([str(arg) for arg in args])
        if kwargs:
            # Sort kwargs for consistent key generation
            sorted_kwargs = sorted(kwargs.items())
            key_parts.extend([f"{k}:{v}" for k, v in sorted_kwargs])
        key_string = ":".join(key_parts)
        return hashlib.md5(key_string.encode()).hexdigest()

    def _serialize(self, value: Any) -> bytes:
        """Serialize a value as JSON when possible, otherwise pickle."""
        try:
            return json.dumps(value).encode()
        except (TypeError, ValueError):
            return pickle.dumps(value)

    def _deserialize(self, data: bytes, key: str) -> Optional[Any]:
        """Deserialize cached bytes: JSON first, then pickle; None on failure.

        NOTE(review): pickle.loads assumes the cache backend is trusted —
        do not point this at a shared/untrusted Redis instance.
        """
        try:
            return json.loads(data.decode())
        except (json.JSONDecodeError, UnicodeDecodeError):
            try:
                return pickle.loads(data)
            except pickle.UnpicklingError:
                logger.warning(f"Failed to deserialize cache data for key: {key}")
                return None

    async def get(self, key: str, tenant_id: str) -> Optional[Any]:
        """Get a value from the cache; None on miss or error."""
        if not await self._ensure_redis():
            return None
        try:
            full_key = f"cache:{tenant_id}:{key}"
            data = await self.redis_client.get(full_key)
            if data:
                return self._deserialize(data, full_key)
            return None
        except Exception as e:
            logger.error(f"Cache get error: {e}")
            return None

    async def set(self, key: str, value: Any, tenant_id: str, expire: Optional[int] = None) -> bool:
        """Set a value in the cache with optional expiration (seconds)."""
        if not await self._ensure_redis():
            return False
        try:
            full_key = f"cache:{tenant_id}:{key}"
            data = self._serialize(value)
            if expire:
                await self.redis_client.setex(full_key, expire, data)
            else:
                await self.redis_client.set(full_key, data)
            return True
        except Exception as e:
            logger.error(f"Cache set error: {e}")
            return False

    async def delete(self, key: str, tenant_id: str) -> bool:
        """Delete a value from the cache; True when a key was removed."""
        if not await self._ensure_redis():
            return False
        try:
            full_key = f"cache:{tenant_id}:{key}"
            result = await self.redis_client.delete(full_key)
            return result > 0
        except Exception as e:
            logger.error(f"Cache delete error: {e}")
            return False

    async def delete_pattern(self, pattern: str, tenant_id: str) -> int:
        """Delete all of a tenant's keys matching ``pattern``; returns the count.

        NOTE(review): KEYS scans the whole keyspace and can block a large
        Redis instance; consider SCAN if this ever runs against big data sets.
        """
        if not await self._ensure_redis():
            return 0
        try:
            full_pattern = f"cache:{tenant_id}:{pattern}"
            keys = await self.redis_client.keys(full_pattern)
            if keys:
                result = await self.redis_client.delete(*keys)
                logger.info(f"Deleted {result} cache keys matching pattern: {full_pattern}")
                return result
            return 0
        except Exception as e:
            logger.error(f"Cache delete pattern error: {e}")
            return 0

    async def clear_tenant_cache(self, tenant_id: str) -> int:
        """Clear all cache entries for a specific tenant."""
        return await self.delete_pattern("*", tenant_id)

    async def get_many(self, keys: List[str], tenant_id: str) -> Dict[str, Any]:
        """Get multiple values at once; missing/undecodable keys are omitted."""
        if not await self._ensure_redis():
            return {}
        try:
            full_keys = [f"cache:{tenant_id}:{key}" for key in keys]
            values = await self.redis_client.mget(full_keys)
            result = {}
            for key, value in zip(keys, values):
                if value is not None:
                    decoded = self._deserialize(value, key)
                    if decoded is not None:
                        result[key] = decoded
            return result
        except Exception as e:
            logger.error(f"Cache get_many error: {e}")
            return {}

    async def set_many(self, data: Dict[str, Any], tenant_id: str, expire: Optional[int] = None) -> bool:
        """Set multiple values in one pipelined round trip."""
        if not await self._ensure_redis():
            return False
        try:
            pipeline = self.redis_client.pipeline()
            for key, value in data.items():
                full_key = f"cache:{tenant_id}:{key}"
                serialized_value = self._serialize(value)
                if expire:
                    pipeline.setex(full_key, expire, serialized_value)
                else:
                    pipeline.set(full_key, serialized_value)
            await pipeline.execute()
            return True
        except Exception as e:
            logger.error(f"Cache set_many error: {e}")
            return False

    async def increment(self, key: str, tenant_id: str, amount: int = 1) -> Optional[int]:
        """Atomically increment a counter; returns the new value or None on error."""
        if not await self._ensure_redis():
            return None
        try:
            full_key = f"cache:{tenant_id}:{key}"
            return await self.redis_client.incrby(full_key, amount)
        except Exception as e:
            logger.error(f"Cache increment error: {e}")
            return None

    async def expire(self, key: str, tenant_id: str, seconds: int) -> bool:
        """Set a TTL (seconds) on an existing cache key."""
        if not await self._ensure_redis():
            return False
        try:
            full_key = f"cache:{tenant_id}:{key}"
            return await self.redis_client.expire(full_key, seconds)
        except Exception as e:
            logger.error(f"Cache expire error: {e}")
            return False
# Global cache service instance
# Module-level singleton used by the decorators defined in this module.
cache_service = CacheService()
def cache_result(prefix: str, expire: Optional[int] = 3600):
    """Decorator factory caching async function results with tenant isolation.

    The tenant id is discovered from (in order) a ``tenant_id`` keyword
    argument or a ``tenant_id`` attribute on the first positional argument
    (typically ``self``). Without a tenant id, caching is skipped entirely.

    Args:
        prefix: Namespace prefix for the generated cache keys.
        expire: TTL in seconds for cached entries (default: 1 hour).
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            # BUGFIX: the wrapper used to declare ``tenant_id`` as its own
            # keyword parameter, which (a) made the ``'tenant_id' in kwargs``
            # branch unreachable dead code, and (b) swallowed the argument so
            # the wrapped function never received it. Read it non-destructively.
            tenant_id = kwargs.get("tenant_id")
            if not tenant_id and args and hasattr(args[0], "tenant_id"):
                tenant_id = args[0].tenant_id
            if not tenant_id:
                # No tenant context: execute uncached.
                return await func(*args, **kwargs)

            cache_key = cache_service._generate_key(prefix, tenant_id, *args, **kwargs)

            cached_result = await cache_service.get(cache_key, tenant_id)
            if cached_result is not None:
                logger.debug(f"Cache hit for key: {cache_key}")
                return cached_result

            result = await func(*args, **kwargs)
            await cache_service.set(cache_key, result, tenant_id, expire)
            logger.debug(f"Cache miss, stored result for key: {cache_key}")
            return result
        return wrapper
    return decorator
def invalidate_cache(prefix: str, pattern: str = "*"):
    """Decorator factory that clears matching tenant cache entries after the call.

    Invalidation runs only when a tenant id can be discovered from a
    ``tenant_id`` keyword argument or a ``tenant_id`` attribute on the first
    positional argument. NOTE(review): ``prefix`` is currently unused —
    invalidation is driven purely by ``pattern``; kept for API symmetry
    with ``cache_result``.
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            result = await func(*args, **kwargs)
            # BUGFIX: ``tenant_id`` used to be a wrapper keyword parameter,
            # which swallowed it before the wrapped function could see it.
            tenant_id = kwargs.get("tenant_id")
            if not tenant_id and args and hasattr(args[0], "tenant_id"):
                tenant_id = args[0].tenant_id
            if tenant_id:
                await cache_service.delete_pattern(pattern, tenant_id)
                logger.debug(f"Invalidated cache for tenant {tenant_id}, pattern: {pattern}")
            return result
        return wrapper
    return decorator

View File

@@ -12,8 +12,10 @@ class Settings(BaseSettings):
"""Application settings."""
# Application Configuration
PROJECT_NAME: str = "Virtual Board Member AI"
APP_NAME: str = "Virtual Board Member AI"
APP_VERSION: str = "0.1.0"
VERSION: str = "0.1.0"
ENVIRONMENT: str = "development"
DEBUG: bool = True
LOG_LEVEL: str = "INFO"
@@ -48,6 +50,9 @@ class Settings(BaseSettings):
QDRANT_API_KEY: Optional[str] = None
QDRANT_COLLECTION_NAME: str = "board_documents"
QDRANT_VECTOR_SIZE: int = 1024
QDRANT_TIMEOUT: int = 30
EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIMENSION: int = 384 # Dimension for all-MiniLM-L6-v2
# LLM Configuration (OpenRouter)
OPENROUTER_API_KEY: str = Field(..., description="OpenRouter API key")
@@ -77,6 +82,7 @@ class Settings(BaseSettings):
AWS_SECRET_ACCESS_KEY: Optional[str] = None
AWS_REGION: str = "us-east-1"
S3_BUCKET: str = "vbm-documents"
S3_ENDPOINT_URL: Optional[str] = None # For MinIO or other S3-compatible services
# Authentication (OAuth 2.0/OIDC)
AUTH_PROVIDER: str = "auth0" # auth0, cognito, or custom
@@ -172,6 +178,7 @@ class Settings(BaseSettings):
# CORS and Security
ALLOWED_HOSTS: List[str] = ["*"]
API_V1_STR: str = "/api/v1"
@validator("SUPPORTED_FORMATS", pre=True)
def parse_supported_formats(cls, v: str) -> str:

View File

@@ -25,12 +25,15 @@ async_engine = create_async_engine(
)
# Create sync engine for migrations
sync_engine = create_engine(
engine = create_engine(
settings.DATABASE_URL,
echo=settings.DEBUG,
poolclass=StaticPool if settings.TESTING else None,
)
# Alias for compatibility
sync_engine = engine
# Create session factory
AsyncSessionLocal = async_sessionmaker(
async_engine,
@@ -58,6 +61,17 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
await session.close()
def get_db_sync():
    """Yield a synchronous SQLAlchemy session for non-async contexts.

    The session is always closed when the caller is done, even on error.
    """
    from sqlalchemy.orm import sessionmaker

    session_factory = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    session = session_factory()
    try:
        yield session
    finally:
        session.close()
async def init_db() -> None:
"""Initialize database tables."""
try: