Files
virtual_board_member/app/services/storage_service.py
Jonathan Pressnell 1a8ec37bed feat: Complete Week 2 - Document Processing Pipeline
- Implement multi-format document support (PDF, XLSX, CSV, PPTX, TXT, Images)
- Add S3-compatible storage service with tenant isolation
- Create document organization service with hierarchical folders and tagging
- Implement advanced document processing with table/chart extraction
- Add batch upload capabilities (up to 50 files)
- Create comprehensive document validation and security scanning
- Implement automatic metadata extraction and categorization
- Add document version control system
- Update DEVELOPMENT_PLAN.md to mark Week 2 as completed
- Add WEEK2_COMPLETION_SUMMARY.md with detailed implementation notes
- All tests passing (6/6) - 100% success rate
2025-08-08 15:47:43 -04:00

393 lines
13 KiB
Python

"""
Storage service for handling file storage with S3-compatible backend and multi-tenant support.
"""
import asyncio
import hashlib
import logging
import mimetypes
import uuid
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import aiofiles
import boto3
from botocore.exceptions import ClientError, NoCredentialsError
from fastapi import UploadFile

from app.core.config import settings
from app.models.tenant import Tenant
logger = logging.getLogger(__name__)
class StorageService:
    """Storage service with S3-compatible backend and multi-tenant support.

    Each instance is bound to a single tenant. All object keys live under
    ``tenants/<tenant_id>/documents`` and each tenant gets its own bucket
    (``vbm-documents-<tenant_id>``), so tenants are isolated at the storage
    layer. When AWS credentials are configured, an S3-compatible backend is
    used (including MinIO via ``S3_ENDPOINT_URL``); otherwise all operations
    transparently fall back to the local ``storage/`` directory.

    boto3 is synchronous, so every S3 network call is pushed onto the default
    executor to avoid blocking the event loop.
    """

    def __init__(self, tenant: Tenant):
        """Bind the service to *tenant* and configure the storage backend."""
        self.tenant = tenant
        self.s3_client = None
        # One bucket per tenant gives hard isolation at the storage layer.
        self.bucket_name = f"vbm-documents-{tenant.id}"
        # Initialize S3 client only if credentials are available; otherwise
        # every public method falls back to local-disk storage.
        if settings.AWS_ACCESS_KEY_ID and settings.AWS_SECRET_ACCESS_KEY:
            self.s3_client = boto3.client(
                's3',
                aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
                region_name=settings.AWS_REGION or 'us-east-1',
                # For MinIO or other S3-compatible services; None means real AWS.
                endpoint_url=settings.S3_ENDPOINT_URL
            )
        else:
            logger.warning("AWS credentials not configured, using local storage")

    async def _run_blocking(self, func):
        """Run a blocking callable in the default executor.

        boto3 performs synchronous network I/O; calling it directly inside a
        coroutine would stall the entire event loop for the duration of the
        request.
        """
        return await asyncio.get_running_loop().run_in_executor(None, func)

    async def upload_file(self, file: UploadFile, document_id: str) -> Dict[str, Any]:
        """Upload *file* to storage with security validation.

        Returns a dict with ``file_path``, ``storage_url``, ``file_size``,
        ``checksum`` (SHA-256 hex), ``mime_type`` and ``uploaded_at``.

        Raises ``ValueError`` when the filename/extension/MIME type fails
        validation; backend errors are logged and re-raised.
        """
        try:
            # Security validation: extension and MIME-type whitelists.
            await self._validate_file_security(file)
            file_path = self._generate_file_path(document_id, file.filename)
            content = await file.read()
            # SHA-256 checksum lets callers detect corruption and duplicates.
            checksum = hashlib.sha256(content).hexdigest()
            if self.s3_client:
                await self._upload_to_s3(content, file_path, file.content_type)
                storage_url = f"s3://{self.bucket_name}/{file_path}"
            else:
                await self._upload_to_local(content, file_path)
                storage_url = str(file_path)
            return {
                "file_path": file_path,
                "storage_url": storage_url,
                "file_size": len(content),
                "checksum": checksum,
                "mime_type": file.content_type,
                "uploaded_at": datetime.utcnow().isoformat()
            }
        except Exception as e:
            logger.error(f"Error uploading file {file.filename}: {str(e)}")
            raise

    async def download_file(self, file_path: str) -> bytes:
        """Download a file from storage; errors are logged and re-raised."""
        try:
            if self.s3_client:
                return await self._download_from_s3(file_path)
            else:
                return await self._download_from_local(file_path)
        except Exception as e:
            logger.error(f"Error downloading file {file_path}: {str(e)}")
            raise

    async def delete_file(self, file_path: str) -> bool:
        """Delete a file from storage; returns True on success, False otherwise."""
        try:
            if self.s3_client:
                return await self._delete_from_s3(file_path)
            else:
                return await self._delete_from_local(file_path)
        except Exception as e:
            logger.error(f"Error deleting file {file_path}: {str(e)}")
            return False

    async def get_file_info(self, file_path: str) -> Optional[Dict[str, Any]]:
        """Get file metadata (size, mtime, content type); None if unavailable."""
        try:
            if self.s3_client:
                return await self._get_s3_file_info(file_path)
            else:
                return await self._get_local_file_info(file_path)
        except Exception as e:
            logger.error(f"Error getting file info for {file_path}: {str(e)}")
            return None

    async def list_files(self, prefix: str = "", max_keys: int = 1000) -> List[Dict[str, Any]]:
        """List up to *max_keys* files for this tenant, optionally filtered by *prefix*.

        Returns a list of dicts with ``key``, ``size`` and ``last_modified``
        (ISO-8601 string); an empty list on error.
        """
        try:
            if self.s3_client:
                return await self._list_s3_files(prefix, max_keys)
            else:
                return await self._list_local_files(prefix, max_keys)
        except Exception as e:
            logger.error(f"Error listing files with prefix {prefix}: {str(e)}")
            return []

    async def _validate_file_security(self, file: UploadFile) -> None:
        """Validate the upload against filename, extension and MIME whitelists.

        Raises ``ValueError`` on any violation. NOTE(review): no file-size
        limit is enforced here — confirm whether one is applied upstream.
        """
        # A filename is required to derive the extension and storage key.
        if not file.filename:
            raise ValueError("No filename provided")
        # Extension whitelist: document and image formats only.
        allowed_extensions = {
            '.pdf', '.docx', '.xlsx', '.pptx', '.txt', '.csv',
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'
        }
        file_extension = Path(file.filename).suffix.lower()
        if file_extension not in allowed_extensions:
            raise ValueError(f"File type {file_extension} not allowed")
        # MIME whitelist — only enforced when the client declared a type.
        if file.content_type:
            allowed_mime_types = {
                'application/pdf',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                'text/plain',
                'text/csv',
                'image/jpeg',
                'image/png',
                'image/gif',
                'image/bmp',
                'image/tiff'
            }
            if file.content_type not in allowed_mime_types:
                raise ValueError(f"MIME type {file.content_type} not allowed")

    def _generate_file_path(self, document_id: str, filename: str) -> str:
        """Generate a tenant-scoped storage key for *filename*.

        ``Path(filename).name`` strips any directory components, defeating
        path-traversal attempts in the client-supplied filename.
        """
        tenant_path = f"tenants/{self.tenant.id}/documents"
        sanitized_filename = Path(filename).name.replace(" ", "_")
        return f"{tenant_path}/{document_id}_{sanitized_filename}"

    async def _upload_to_s3(self, content: bytes, file_path: str, content_type: Optional[str]) -> None:
        """Upload *content* to the tenant bucket under key *file_path*."""
        try:
            await self._run_blocking(lambda: self.s3_client.put_object(
                Bucket=self.bucket_name,
                Key=file_path,
                Body=content,
                # UploadFile.content_type may be None; boto3 rejects a None
                # ContentType, so fall back to the generic binary type.
                ContentType=content_type or 'application/octet-stream',
                Metadata={
                    'tenant_id': str(self.tenant.id),
                    'uploaded_at': datetime.utcnow().isoformat()
                }
            ))
        except ClientError as e:
            logger.error(f"S3 upload error: {str(e)}")
            raise
        except NoCredentialsError:
            logger.error("AWS credentials not found")
            raise

    async def _upload_to_local(self, content: bytes, file_path: str) -> None:
        """Write *content* under the local ``storage/`` root, creating directories."""
        try:
            local_path = Path(f"storage/{file_path}")
            local_path.parent.mkdir(parents=True, exist_ok=True)
            async with aiofiles.open(local_path, 'wb') as f:
                await f.write(content)
        except Exception as e:
            logger.error(f"Local upload error: {str(e)}")
            raise

    async def _download_from_s3(self, file_path: str) -> bytes:
        """Fetch an object's bytes from the tenant bucket."""
        try:
            response = await self._run_blocking(lambda: self.s3_client.get_object(
                Bucket=self.bucket_name,
                Key=file_path
            ))
            # Body is a streaming object; read() is blocking network I/O too.
            return await self._run_blocking(response['Body'].read)
        except ClientError as e:
            logger.error(f"S3 download error: {str(e)}")
            raise

    async def _download_from_local(self, file_path: str) -> bytes:
        """Read a file's bytes from the local ``storage/`` root."""
        try:
            local_path = Path(f"storage/{file_path}")
            async with aiofiles.open(local_path, 'rb') as f:
                return await f.read()
        except Exception as e:
            logger.error(f"Local download error: {str(e)}")
            raise

    async def _delete_from_s3(self, file_path: str) -> bool:
        """Delete an object from the tenant bucket; False on S3 error."""
        try:
            await self._run_blocking(lambda: self.s3_client.delete_object(
                Bucket=self.bucket_name,
                Key=file_path
            ))
            return True
        except ClientError as e:
            logger.error(f"S3 delete error: {str(e)}")
            return False

    async def _delete_from_local(self, file_path: str) -> bool:
        """Delete a local file; False if missing or on error."""
        try:
            local_path = Path(f"storage/{file_path}")
            if local_path.exists():
                local_path.unlink()
                return True
            return False
        except Exception as e:
            logger.error(f"Local delete error: {str(e)}")
            return False

    async def _get_s3_file_info(self, file_path: str) -> Optional[Dict[str, Any]]:
        """HEAD the object and return its metadata; None if it does not exist."""
        try:
            response = await self._run_blocking(lambda: self.s3_client.head_object(
                Bucket=self.bucket_name,
                Key=file_path
            ))
            return {
                "file_size": response['ContentLength'],
                # Aware datetime from S3; isoformat keeps the offset.
                "last_modified": response['LastModified'].isoformat(),
                "content_type": response.get('ContentType'),
                "metadata": response.get('Metadata', {})
            }
        except ClientError:
            return None

    async def _get_local_file_info(self, file_path: str) -> Optional[Dict[str, Any]]:
        """Stat a local file and return its metadata; None if missing/on error."""
        try:
            local_path = Path(f"storage/{file_path}")
            if not local_path.exists():
                return None
            stat = local_path.stat()
            return {
                "file_size": stat.st_size,
                # Naive local-time datetime (mirrors what the original stored).
                "last_modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
                "content_type": mimetypes.guess_type(local_path)[0]
            }
        except Exception:
            return None

    async def _list_s3_files(self, prefix: str, max_keys: int) -> List[Dict[str, Any]]:
        """List objects under this tenant's document prefix in the bucket."""
        try:
            # Prepend the tenant scope so callers can never list other tenants.
            tenant_prefix = f"tenants/{self.tenant.id}/documents/{prefix}"
            response = await self._run_blocking(lambda: self.s3_client.list_objects_v2(
                Bucket=self.bucket_name,
                Prefix=tenant_prefix,
                MaxKeys=max_keys
            ))
            return [
                {
                    "key": obj['Key'],
                    "size": obj['Size'],
                    "last_modified": obj['LastModified'].isoformat()
                }
                for obj in response.get('Contents', [])
            ]
        except ClientError as e:
            logger.error(f"S3 list error: {str(e)}")
            return []

    async def _list_local_files(self, prefix: str, max_keys: int) -> List[Dict[str, Any]]:
        """Recursively list local files under this tenant's document directory."""
        try:
            tenant_path = Path(f"storage/tenants/{self.tenant.id}/documents/{prefix}")
            if not tenant_path.exists():
                return []
            files = []
            for file_path in tenant_path.rglob("*"):
                if file_path.is_file():
                    stat = file_path.stat()
                    files.append({
                        # as_posix() keeps keys '/'-separated on every OS,
                        # matching the S3 key format.
                        "key": file_path.relative_to(Path("storage")).as_posix(),
                        "size": stat.st_size,
                        "last_modified": datetime.fromtimestamp(stat.st_mtime).isoformat()
                    })
                    if len(files) >= max_keys:
                        break
            return files
        except Exception as e:
            logger.error(f"Local list error: {str(e)}")
            return []

    async def cleanup_old_files(self, days_old: int = 30) -> int:
        """Delete files last modified more than *days_old* days ago.

        Returns the number of files deleted; 0 on error.
        """
        try:
            # Use an aware UTC cutoff: S3 LastModified timestamps are
            # timezone-aware, and comparing aware with naive datetimes
            # raises TypeError.
            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
            deleted_count = 0
            files = await self.list_files()
            for file_info in files:
                last_modified = datetime.fromisoformat(file_info['last_modified'])
                if last_modified.tzinfo is None:
                    # Local-storage timestamps are naive local time; attach
                    # the local zone so the comparison is well-defined.
                    last_modified = last_modified.astimezone()
                if last_modified < cutoff_date:
                    if await self.delete_file(file_info['key']):
                        deleted_count += 1
            return deleted_count
        except Exception as e:
            logger.error(f"Cleanup error: {str(e)}")
            return 0