- Implement multi-format document support (PDF, XLSX, CSV, PPTX, TXT, Images) - Add S3-compatible storage service with tenant isolation - Create document organization service with hierarchical folders and tagging - Implement advanced document processing with table/chart extraction - Add batch upload capabilities (up to 50 files) - Create comprehensive document validation and security scanning - Implement automatic metadata extraction and categorization - Add document version control system - Update DEVELOPMENT_PLAN.md to mark Week 2 as completed - Add WEEK2_COMPLETION_SUMMARY.md with detailed implementation notes - All tests passing (6/6) - 100% success rate
393 lines · 13 KiB · Python
"""
|
|
Storage service for handling file storage with S3-compatible backend and multi-tenant support.
|
|
"""
import asyncio
import logging
import hashlib
import mimetypes
from typing import Optional, Dict, Any, List
from pathlib import Path
import uuid
from datetime import datetime, timedelta

import boto3
from botocore.exceptions import ClientError, NoCredentialsError
import aiofiles
from fastapi import UploadFile

from app.core.config import settings
from app.models.tenant import Tenant

logger = logging.getLogger(__name__)
|
class StorageService:
    """Storage service with S3-compatible backend and multi-tenant support.

    Every object key is namespaced under ``tenants/<tenant_id>/documents/``
    and each tenant gets its own bucket (``vbm-documents-<tenant_id>``), so
    tenants are isolated at both the bucket and key-prefix level. When AWS
    credentials are not configured, all operations transparently fall back
    to the local filesystem under ``storage/``.
    """

    def __init__(self, tenant: Tenant):
        """Bind the service to a single tenant.

        Args:
            tenant: Tenant whose documents this instance manages; its ``id``
                determines the bucket name and the key prefix.
        """
        self.tenant = tenant
        self.s3_client = None
        # One bucket per tenant keeps tenant data physically isolated.
        self.bucket_name = f"vbm-documents-{tenant.id}"

        # Initialize the S3 client only when credentials are available;
        # otherwise every operation uses local-disk storage.
        if settings.AWS_ACCESS_KEY_ID and settings.AWS_SECRET_ACCESS_KEY:
            self.s3_client = boto3.client(
                's3',
                aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
                region_name=settings.AWS_REGION or 'us-east-1',
                endpoint_url=settings.S3_ENDPOINT_URL  # For MinIO or other S3-compatible services
            )
        else:
            logger.warning("AWS credentials not configured, using local storage")

    @staticmethod
    async def _run_sync(func, *args, **kwargs):
        """Run a blocking callable in the default thread-pool executor.

        boto3 is synchronous; calling it directly inside ``async`` methods
        would block the event loop, so every S3 call is funneled through
        this helper.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: func(*args, **kwargs))

    async def upload_file(self, file: UploadFile, document_id: str) -> Dict[str, Any]:
        """Upload a file to storage after security validation.

        Args:
            file: Incoming upload (filename, declared content type, stream).
            document_id: Identifier used to build a unique storage key.

        Returns:
            Dict with ``file_path``, ``storage_url``, ``file_size``,
            ``checksum`` (SHA-256 hex digest), ``mime_type`` and
            ``uploaded_at`` (ISO-8601 UTC).

        Raises:
            ValueError: If the upload fails security validation.
        """
        try:
            # Reject disallowed filenames / extensions / MIME types up front.
            await self._validate_file_security(file)

            file_path = self._generate_file_path(document_id, file.filename)

            # NOTE(review): reads the entire upload into memory; fine for the
            # current batch-upload size limits, revisit for very large files.
            content = await file.read()

            # Integrity checksum recorded alongside the stored object.
            checksum = hashlib.sha256(content).hexdigest()

            if self.s3_client:
                # S3 rejects a missing ContentType, so default it when the
                # client did not declare one.
                content_type = file.content_type or 'application/octet-stream'
                await self._upload_to_s3(content, file_path, content_type)
                storage_url = f"s3://{self.bucket_name}/{file_path}"
            else:
                await self._upload_to_local(content, file_path)
                storage_url = str(file_path)

            return {
                "file_path": file_path,
                "storage_url": storage_url,
                "file_size": len(content),
                "checksum": checksum,
                "mime_type": file.content_type,
                "uploaded_at": datetime.utcnow().isoformat()
            }

        except Exception as e:
            logger.error(f"Error uploading file {file.filename}: {str(e)}")
            raise

    async def download_file(self, file_path: str) -> bytes:
        """Return the raw bytes of a stored file.

        Raises:
            Exception: Propagates backend errors (e.g. missing key) after logging.
        """
        try:
            if self.s3_client:
                return await self._download_from_s3(file_path)
            return await self._download_from_local(file_path)

        except Exception as e:
            logger.error(f"Error downloading file {file_path}: {str(e)}")
            raise

    async def delete_file(self, file_path: str) -> bool:
        """Delete a file from storage; return True on success, False otherwise."""
        try:
            if self.s3_client:
                return await self._delete_from_s3(file_path)
            return await self._delete_from_local(file_path)

        except Exception as e:
            logger.error(f"Error deleting file {file_path}: {str(e)}")
            return False

    async def get_file_info(self, file_path: str) -> Optional[Dict[str, Any]]:
        """Return size / last-modified / content-type metadata, or None if unavailable."""
        try:
            if self.s3_client:
                return await self._get_s3_file_info(file_path)
            return await self._get_local_file_info(file_path)

        except Exception as e:
            logger.error(f"Error getting file info for {file_path}: {str(e)}")
            return None

    async def list_files(self, prefix: str = "", max_keys: int = 1000) -> List[Dict[str, Any]]:
        """List this tenant's files, optionally filtered by a key prefix.

        Returns:
            List of dicts with ``key``, ``size`` and ``last_modified``
            (ISO-8601 string); empty list on error.
        """
        try:
            if self.s3_client:
                return await self._list_s3_files(prefix, max_keys)
            return await self._list_local_files(prefix, max_keys)

        except Exception as e:
            logger.error(f"Error listing files with prefix {prefix}: {str(e)}")
            return []

    async def _validate_file_security(self, file: UploadFile) -> None:
        """Validate an upload against filename, extension and MIME allow-lists.

        Raises:
            ValueError: If no filename is provided, the extension is not
                allowed, or the declared MIME type is not allowed.
        """
        # A filename is required to derive the extension and the storage key.
        # (The original comment claimed a size check here; none exists —
        # maximum-size enforcement is presumably handled upstream. TODO confirm.)
        if not file.filename:
            raise ValueError("No filename provided")

        # Extension allow-list: documents plus common image formats.
        allowed_extensions = {
            '.pdf', '.docx', '.xlsx', '.pptx', '.txt', '.csv',
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'
        }

        file_extension = Path(file.filename).suffix.lower()
        if file_extension not in allowed_extensions:
            raise ValueError(f"File type {file_extension} not allowed")

        # Cross-check the client-declared MIME type when one was supplied.
        # NOTE(review): the client controls this header; true content
        # sniffing would be required for a hard guarantee.
        if file.content_type:
            allowed_mime_types = {
                'application/pdf',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                'text/plain',
                'text/csv',
                'image/jpeg',
                'image/png',
                'image/gif',
                'image/bmp',
                'image/tiff'
            }

            if file.content_type not in allowed_mime_types:
                raise ValueError(f"MIME type {file.content_type} not allowed")

    def _generate_file_path(self, document_id: str, filename: str) -> str:
        """Build the tenant-scoped storage key for a document.

        ``Path(filename).name`` strips any directory components so a crafted
        filename cannot escape the tenant prefix; spaces are replaced with
        underscores to keep keys URL/shell friendly.
        """
        # Tenant-specific prefix isolates documents per tenant.
        tenant_path = f"tenants/{self.tenant.id}/documents"

        sanitized_filename = Path(filename).name.replace(" ", "_")
        return f"{tenant_path}/{document_id}_{sanitized_filename}"

    async def _upload_to_s3(self, content: bytes, file_path: str, content_type: str) -> None:
        """Upload bytes to the tenant bucket (boto3 call runs off the event loop)."""
        try:
            await self._run_sync(
                self.s3_client.put_object,
                Bucket=self.bucket_name,
                Key=file_path,
                Body=content,
                ContentType=content_type,
                Metadata={
                    'tenant_id': str(self.tenant.id),
                    'uploaded_at': datetime.utcnow().isoformat()
                }
            )
        except ClientError as e:
            logger.error(f"S3 upload error: {str(e)}")
            raise
        except NoCredentialsError:
            logger.error("AWS credentials not found")
            raise

    async def _upload_to_local(self, content: bytes, file_path: str) -> None:
        """Write bytes to local storage, creating parent directories as needed."""
        try:
            local_path = Path(f"storage/{file_path}")
            local_path.parent.mkdir(parents=True, exist_ok=True)

            async with aiofiles.open(local_path, 'wb') as f:
                await f.write(content)

        except Exception as e:
            logger.error(f"Local upload error: {str(e)}")
            raise

    async def _download_from_s3(self, file_path: str) -> bytes:
        """Fetch an object's bytes from S3 (both the request and the body
        read are blocking, so the whole fetch runs in the executor)."""
        def _get() -> bytes:
            response = self.s3_client.get_object(
                Bucket=self.bucket_name,
                Key=file_path
            )
            return response['Body'].read()

        try:
            return await self._run_sync(_get)
        except ClientError as e:
            logger.error(f"S3 download error: {str(e)}")
            raise

    async def _download_from_local(self, file_path: str) -> bytes:
        """Read a file's bytes from local storage."""
        try:
            local_path = Path(f"storage/{file_path}")
            async with aiofiles.open(local_path, 'rb') as f:
                return await f.read()
        except Exception as e:
            logger.error(f"Local download error: {str(e)}")
            raise

    async def _delete_from_s3(self, file_path: str) -> bool:
        """Delete an object from S3; True on success, False on client error."""
        try:
            await self._run_sync(
                self.s3_client.delete_object,
                Bucket=self.bucket_name,
                Key=file_path
            )
            return True
        except ClientError as e:
            logger.error(f"S3 delete error: {str(e)}")
            return False

    async def _delete_from_local(self, file_path: str) -> bool:
        """Delete a local file; False when it does not exist or on error."""
        try:
            local_path = Path(f"storage/{file_path}")
            if local_path.exists():
                local_path.unlink()
                return True
            return False
        except Exception as e:
            logger.error(f"Local delete error: {str(e)}")
            return False

    async def _get_s3_file_info(self, file_path: str) -> Optional[Dict[str, Any]]:
        """HEAD an S3 object and return its metadata, or None when missing."""
        try:
            response = await self._run_sync(
                self.s3_client.head_object,
                Bucket=self.bucket_name,
                Key=file_path
            )

            return {
                "file_size": response['ContentLength'],
                "last_modified": response['LastModified'].isoformat(),
                "content_type": response.get('ContentType'),
                "metadata": response.get('Metadata', {})
            }
        except ClientError:
            return None

    async def _get_local_file_info(self, file_path: str) -> Optional[Dict[str, Any]]:
        """Stat a local file and return its metadata, or None when missing."""
        try:
            local_path = Path(f"storage/{file_path}")
            if not local_path.exists():
                return None

            stat = local_path.stat()
            return {
                "file_size": stat.st_size,
                "last_modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
                # str() keeps guess_type compatible with Python < 3.8 PathLike handling.
                "content_type": mimetypes.guess_type(str(local_path))[0]
            }
        except Exception:
            return None

    async def _list_s3_files(self, prefix: str, max_keys: int) -> List[Dict[str, Any]]:
        """List objects under this tenant's prefix in S3."""
        try:
            # Caller prefixes are always scoped under the tenant namespace.
            tenant_prefix = f"tenants/{self.tenant.id}/documents/{prefix}"

            response = await self._run_sync(
                self.s3_client.list_objects_v2,
                Bucket=self.bucket_name,
                Prefix=tenant_prefix,
                MaxKeys=max_keys
            )

            return [
                {
                    "key": obj['Key'],
                    "size": obj['Size'],
                    "last_modified": obj['LastModified'].isoformat()
                }
                for obj in response.get('Contents', [])
            ]
        except ClientError as e:
            logger.error(f"S3 list error: {str(e)}")
            return []

    async def _list_local_files(self, prefix: str, max_keys: int) -> List[Dict[str, Any]]:
        """Recursively list files under this tenant's local directory."""
        try:
            tenant_path = Path(f"storage/tenants/{self.tenant.id}/documents/{prefix}")
            if not tenant_path.exists():
                return []

            files = []
            for file_path in tenant_path.rglob("*"):
                if file_path.is_file():
                    stat = file_path.stat()
                    files.append({
                        # Keys mirror the S3 layout: path relative to storage/.
                        "key": str(file_path.relative_to(Path("storage"))),
                        "size": stat.st_size,
                        "last_modified": datetime.fromtimestamp(stat.st_mtime).isoformat()
                    })

                    if len(files) >= max_keys:
                        break

            return files
        except Exception as e:
            logger.error(f"Local list error: {str(e)}")
            return []

    async def cleanup_old_files(self, days_old: int = 30) -> int:
        """Delete files older than ``days_old`` days; return how many were deleted.

        S3 timestamps are timezone-aware (UTC offset in the ISO string) while
        local-file timestamps are naive, so aware values are normalized to
        naive UTC before comparing. The previous naive-vs-aware comparison
        raised ``TypeError`` for every S3-backed deployment.
        """
        from datetime import timezone  # local import: not in the module's import block

        try:
            cutoff_date = datetime.utcnow() - timedelta(days=days_old)
            deleted_count = 0

            for file_info in await self.list_files():
                last_modified = datetime.fromisoformat(file_info['last_modified'])
                if last_modified.tzinfo is not None:
                    # Convert aware (S3) timestamps to naive UTC to match cutoff_date.
                    last_modified = last_modified.astimezone(timezone.utc).replace(tzinfo=None)
                if last_modified < cutoff_date:
                    if await self.delete_file(file_info['key']):
                        deleted_count += 1

            return deleted_count

        except Exception as e:
            logger.error(f"Cleanup error: {str(e)}")
            return 0