#!/bin/bash
set -euo pipefail

# Storage Performance Optimizer
# Optimizes storage performance with SSD caching, database tuning, and I/O optimization
# Part of the Migration Issues Resolution Framework

# Source the error handling library (provides init_logging, register_cleanup, log_*)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"

# Configuration
readonly LOG_FILE="${SCRIPT_DIR}/../logs/storage_optimization_$(date +%Y%m%d_%H%M%S).log"
readonly CONFIG_BACKUP_DIR="${SCRIPT_DIR}/../backups/storage_configs"
readonly DOCKER_COMPOSE_DIR="${SCRIPT_DIR}/../../"

# Initialize logging
init_logging "$LOG_FILE"

#######################################
# Orchestrates every optimization phase in order.
# Globals:   SSD_DEVICES/HDD_DEVICES/ROOT_FS/DATA_MOUNT_POINTS (set by phases)
# Arguments: none
#######################################
main() {
    log_info "Starting storage performance optimization"

    # Register cleanup function (cleanup_on_exit is expected from error_handling.sh)
    register_cleanup cleanup_on_exit

    # Validate prerequisites
    validate_prerequisites

    # Analyze current storage configuration
    analyze_storage_configuration

    # Configure SSD caching
    setup_ssd_caching

    # Optimize database storage
    optimize_database_storage

    # Configure filesystem optimizations
    configure_filesystem_optimizations

    # Setup storage monitoring
    setup_storage_monitoring

    # Create performance testing tools
    create_performance_testing_tools

    # Apply Docker volume optimizations
    # NOTE(review): optimize_docker_volumes is not defined in the visible part of
    # this file — confirm it is provided further down or by a sourced library.
    optimize_docker_volumes

    log_info "Storage performance optimization completed successfully"
}

#######################################
# Verify required tools and root privileges; install fio / bcache-tools
# via apt-get when absent. Exits non-zero on a missing hard prerequisite.
#######################################
validate_prerequisites() {
    log_info "Validating storage optimization prerequisites"

    local required_commands=(
        "docker"
        "lsblk"
        "df"
        "iostat"
        "iotop"
        "smartctl"
        "tune2fs"
    )

    local cmd
    for cmd in "${required_commands[@]}"; do
        if ! command -v "$cmd" &>/dev/null; then
            log_error "Required command not found: $cmd"
            exit 1
        fi
    done

    # Check if running as root or with sudo
    if [[ $EUID -ne 0 ]]; then
        log_error "This script must be run as root or with sudo"
        exit 1
    fi

    # Install additional tools if needed (assumes a Debian/Ubuntu host)
    if ! command -v fio &>/dev/null; then
        log_info "Installing fio for storage benchmarking"
        apt-get update && apt-get install -y fio
    fi

    if ! command -v bcache-super-show &>/dev/null; then
        log_info "Installing bcache tools"
        apt-get update && apt-get install -y bcache-tools
    fi

    log_info "Prerequisites validation completed"
}

#######################################
# Write a point-in-time storage report (block devices, usage, I/O stats,
# SMART health, Docker storage) into CONFIG_BACKUP_DIR, then classify devices.
#######################################
analyze_storage_configuration() {
    log_info "Analyzing current storage configuration"

    local storage_report="${CONFIG_BACKUP_DIR}/storage_analysis_$(date +%Y%m%d_%H%M%S).txt"
    mkdir -p "$(dirname "$storage_report")"

    {
        echo "Storage Configuration Analysis Report"
        echo "Generated: $(date)"
        echo "====================================="
        echo
        echo "Block Devices:"
        lsblk -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE,UUID
        echo
        echo "Filesystem Usage:"
        df -h
        echo
        echo "Disk I/O Statistics:"
        iostat -x 1 3 || echo "iostat not available"
        echo
        echo "SMART Status for all drives:"
        local drive
        for drive in /dev/sd* /dev/nvme*; do
            # -b skips the literal glob pattern when no matching device exists
            if [[ -b "$drive" ]]; then
                echo "Drive: $drive"
                smartctl -H "$drive" 2>/dev/null || echo "SMART not available for $drive"
                echo
            fi
        done
        echo "Current mount options:"
        # '|| true': a no-match grep must not abort the report under set -e/pipefail
        grep -E "(ext4|xfs|btrfs|zfs)" /proc/mounts || true
        echo
        echo "Docker storage driver:"
        docker info | grep -E "(Storage Driver|Backing Filesystem)" || true
        echo
        echo "Docker volume list:"
        docker volume ls
        echo
        if command -v zpool &>/dev/null; then
            echo "ZFS pools:"
            zpool status || echo "No ZFS pools found"
            echo
        fi
    } > "$storage_report"

    log_info "Storage analysis report saved to: $storage_report"

    # Detect storage types
    detect_storage_configuration
}

#######################################
# Classify block devices as SSD/HDD via the sysfs rotational flag and record
# root filesystem type plus data mount points.
# Globals written: SSD_DEVICES, HDD_DEVICES (arrays), ROOT_FS, DATA_MOUNT_POINTS
#######################################
detect_storage_configuration() {
    log_info "Detecting storage configuration"

    SSD_DEVICES=()
    HDD_DEVICES=()

    local rotational_flag device_name
    for rotational_flag in /sys/block/*/queue/rotational; do
        if [[ -r "$rotational_flag" ]]; then
            # /sys/block/<name>/queue/rotational -> <name>
            device_name=${rotational_flag#/sys/block/}
            device_name=${device_name%%/*}
            if [[ $(cat "$rotational_flag") -eq 0 ]]; then
                SSD_DEVICES+=("/dev/$device_name")
                log_info "Detected SSD: /dev/$device_name"
            else
                HDD_DEVICES+=("/dev/$device_name")
                log_info "Detected HDD: /dev/$device_name"
            fi
        fi
    done

    # Detect filesystem types
    ROOT_FS=$(findmnt -n -o FSTYPE /)
    # '|| true': df exits non-zero when no filesystem matches the -t filters
    DATA_MOUNT_POINTS=$(df -t ext4 -t xfs -t btrfs -t zfs --output=target | tail -n +2 || true)

    # Bash cannot export arrays; SSD_DEVICES/HDD_DEVICES stay ordinary globals
    # for the rest of this script. Only the scalars are exported.
    export ROOT_FS DATA_MOUNT_POINTS

    log_info "Storage detection completed - SSDs: ${#SSD_DEVICES[@]}, HDDs: ${#HDD_DEVICES[@]}"
}

#######################################
# SSD caching phase: bcache guide (when both SSD and HDD are present),
# per-device tuning, and Docker storage tuning.
#######################################
setup_ssd_caching() {
    log_info "Setting up SSD caching"

    if [[ ${#SSD_DEVICES[@]} -eq 0 ]]; then
        log_warn "No SSDs detected - skipping SSD caching setup"
        return 0
    fi

    # bcache only makes sense with a cache (SSD) and a backing (HDD) device
    if [[ ${#SSD_DEVICES[@]} -gt 0 && ${#HDD_DEVICES[@]} -gt 0 ]]; then
        setup_bcache_caching
    fi

    # Configure filesystem caching optimizations
    configure_filesystem_caching

    # Setup Docker volume caching
    setup_docker_volume_caching
}

#######################################
# Bcache setup is destructive, so this only emits a reviewed setup guide with
# a per-host device recommendation instead of touching any devices.
#######################################
setup_bcache_caching() {
    log_info "Setting up bcache SSD caching"

    # Note: bcache setup requires careful consideration and testing
    # This implementation provides the framework but requires manual validation
    cat > "${CONFIG_BACKUP_DIR}/bcache_setup_guide.md" << 'EOF'
# Bcache SSD Caching Setup Guide

**WARNING**: Bcache setup can be destructive. Test in staging environment first.

## Prerequisites
- At least one SSD and one HDD
- Data backed up
- System in maintenance mode

## Setup Steps

1. **Identify devices**:
   ```bash
   # SSD for cache (will be wiped)
   CACHE_DEVICE="/dev/sdb"
   # HDD for backing device (will be wiped)
   BACKING_DEVICE="/dev/sdc"
   ```

2. **Create bcache devices**:
   ```bash
   # Make cache device
   make-bcache -C $CACHE_DEVICE
   # Make backing device
   make-bcache -B $BACKING_DEVICE
   # Register cache
   echo $CACHE_DEVICE > /sys/fs/bcache/register
   echo $BACKING_DEVICE > /sys/fs/bcache/register
   ```

3. **Attach cache to backing device**:
   ```bash
   # Find bcache device
   BCACHE_DEVICE=$(ls /dev/bcache*)
   # Attach cache
   echo $(ls /sys/fs/bcache/ | grep -E "^[0-9a-f-]+$" | head -1) > \
       /sys/block/$(basename $BCACHE_DEVICE)/bcache/attach
   ```

4. **Configure caching policy**:
   ```bash
   # Set to writeback mode for better performance
   echo writeback > /sys/block/$(basename $BCACHE_DEVICE)/bcache/cache_mode
   ```

## Current System Analysis
EOF

    # Add current system analysis to the guide
    {
        echo
        echo "### Current Storage Devices"
        echo "SSDs detected: ${SSD_DEVICES[*]}"
        echo "HDDs detected: ${HDD_DEVICES[*]}"
        echo
        echo "### Recommended Configuration"
        if [[ ${#SSD_DEVICES[@]} -gt 0 ]] && [[ ${#HDD_DEVICES[@]} -gt 0 ]]; then
            echo "- Use ${SSD_DEVICES[0]} as cache device"
            echo "- Use ${HDD_DEVICES[0]} as backing device"
            echo "- This will provide SSD-accelerated storage for Docker volumes"
        else
            echo "- Not enough devices for bcache setup"
            echo "- Consider filesystem-level optimizations instead"
        fi
    } >> "${CONFIG_BACKUP_DIR}/bcache_setup_guide.md"

    log_info "Bcache setup guide created at: ${CONFIG_BACKUP_DIR}/bcache_setup_guide.md"
    log_warn "Manual bcache setup required - see guide for details"
}

#######################################
# Tune read-ahead and I/O scheduler for one block device.
# Arguments: $1 - device path (/dev/X), $2 - class: "ssd" or "hdd"
#######################################
_tune_block_device() {
    local device=$1 class=$2
    local device_name
    device_name=$(basename "$device")

    if [[ "$class" == "ssd" ]]; then
        # SSD: smaller read-ahead since random access is fast
        echo 256 > "/sys/block/$device_name/queue/read_ahead_kb"
        log_info "Set SSD read-ahead to 256KB for $device"
    else
        # HDD: larger read-ahead for better sequential performance
        echo 1024 > "/sys/block/$device_name/queue/read_ahead_kb"
        log_info "Set HDD read-ahead to 1024KB for $device"
    fi

    # mq-deadline preferred; fall back to legacy "deadline" on older kernels
    echo "mq-deadline" > "/sys/block/$device_name/queue/scheduler" 2>/dev/null || \
        echo "deadline" > "/sys/block/$device_name/queue/scheduler" 2>/dev/null || \
        log_warn "Could not set I/O scheduler for $device"
    log_info "Set mq-deadline scheduler for ${class^^} $device"
}

#######################################
# Apply per-device read-ahead / scheduler tuning and persist via udev rules.
#######################################
configure_filesystem_caching() {
    log_info "Configuring filesystem-level caching optimizations"

    # ${arr[@]+...} keeps 'set -u' happy on bash <4.4 when an array is empty
    local device
    for device in ${SSD_DEVICES[@]+"${SSD_DEVICES[@]}"}; do
        _tune_block_device "$device" ssd
    done
    for device in ${HDD_DEVICES[@]+"${HDD_DEVICES[@]}"}; do
        _tune_block_device "$device" hdd
    done

    # Make read-ahead settings persistent
    cat > "/etc/udev/rules.d/60-storage-optimization.rules" << 'EOF'
# Storage optimization rules

# SSD read-ahead optimization
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/read_ahead_kb}="256"
ACTION=="add|change", KERNEL=="nvme[0-9]n[0-9]", ATTR{queue/read_ahead_kb}="256"

# HDD read-ahead optimization
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="1", ATTR{queue/read_ahead_kb}="1024"

# I/O scheduler optimization
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="mq-deadline"
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="1", ATTR{queue/scheduler}="mq-deadline"
ACTION=="add|change", KERNEL=="nvme[0-9]n[0-9]", ATTR{queue/scheduler}="none"
EOF

    # Reload udev rules
    udevadm control --reload-rules && udevadm trigger

    log_info "Filesystem caching optimizations applied"
}

#######################################
# Back up and merge storage settings into /etc/docker/daemon.json, generate a
# volume-optimization helper script, and restart Docker to apply.
#######################################
setup_docker_volume_caching() {
    log_info "Setting up Docker volume caching optimizations"

    # Configure Docker daemon for storage optimization
    local docker_daemon_config="/etc/docker/daemon.json"
    local backup_file="${CONFIG_BACKUP_DIR}/daemon.json.backup.$(date +%Y%m%d_%H%M%S)"
    mkdir -p "$(dirname "$backup_file")"

    if [[ -f "$docker_daemon_config" ]]; then
        cp "$docker_daemon_config" "$backup_file"
        log_info "Docker daemon config backed up to: $backup_file"
    fi

    # Merge (not overwrite) optimizations; JSON is edited in python, not sed,
    # so the file stays valid.
    python3 << 'EOF'
import json
import os

daemon_config_path = "/etc/docker/daemon.json"
config = {}

# Load existing config if it exists; fall back to empty on unreadable/corrupt file
if os.path.exists(daemon_config_path):
    try:
        with open(daemon_config_path, 'r') as f:
            config = json.load(f)
    except (ValueError, OSError):
        config = {}

# Add storage optimizations
config.update({
    "storage-driver": "overlay2",
    # NOTE(review): overlay2.override_kernel_check is obsolete on current Docker
    # releases and may be rejected - confirm against the target Docker version.
    "storage-opts": [
        "overlay2.override_kernel_check=true"
    ],
    "log-driver": "json-file",
    "log-opts": {
        "max-size": "10m",
        "max-file": "3"
    },
    "data-root": "/var/lib/docker",
    "exec-opts": ["native.cgroupdriver=systemd"],
    "live-restore": True,
    "userland-proxy": False,
    "experimental": False
})

# Write updated config
with open(daemon_config_path, 'w') as f:
    json.dump(config, f, indent=2)

print("Docker daemon configuration updated")
EOF

    # Create optimized volume mount options script
    cat > "${SCRIPT_DIR}/optimize_volume_mounts.sh" << 'EOF'
#!/bin/bash
# Docker Volume Mount Optimization Script
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"

optimize_volume_mounts() {
    log_info "Optimizing Docker volume mount options"

    # Get all Docker volumes (declaration split so a docker failure is not masked)
    local volumes
    volumes=$(docker volume ls -q)

    local volume volume_path
    for volume in $volumes; do
        volume_path=$(docker volume inspect "$volume" --format '{{.Mountpoint}}')

        if [[ -d "$volume_path" ]]; then
            # Optimize directory for database workloads if it contains database files
            if [[ "$volume" =~ (postgres|mysql|mongodb|redis|influx) ]]; then
                log_info "Optimizing database volume: $volume"

                # Set optimal ownership and permissions
                chown -R 999:999 "$volume_path" 2>/dev/null || true

                # For ext4 filesystems, set optimal attributes
                if findmnt -n -o FSTYPE "$volume_path" | grep -q ext4; then
                    # Enable extent attributes for large files
                    find "$volume_path" -type f -size +100M -exec chattr +e {} \; 2>/dev/null || true
                fi
            fi

            # Set optimal permissions for all volumes
            chmod 755 "$volume_path"
        fi
    done

    log_info "Volume mount optimization completed"
}

optimize_volume_mounts
EOF
    chmod +x "${SCRIPT_DIR}/optimize_volume_mounts.sh"

    # Restart Docker to apply changes (brief downtime for running containers)
    systemctl restart docker

    log_info "Docker volume caching optimizations completed"
}
optimize_database_storage() { log_info "Optimizing database storage configurations" # Create PostgreSQL optimization configuration create_postgresql_optimizations # Create Redis optimization configuration create_redis_optimizations # Create InfluxDB optimization configuration create_influxdb_optimizations # Setup database connection pooling setup_connection_pooling } create_postgresql_optimizations() { log_info "Creating PostgreSQL storage optimizations" local postgres_config_dir="${DOCKER_COMPOSE_DIR}/postgres" mkdir -p "$postgres_config_dir" # Get system memory for calculations local total_memory_kb=$(grep MemTotal /proc/meminfo | awk '{print $2}') local total_memory_mb=$((total_memory_kb / 1024)) # Calculate optimal PostgreSQL settings local shared_buffers=$((total_memory_mb / 4)) # 25% of total memory local effective_cache_size=$((total_memory_mb * 3 / 4)) # 75% of total memory local work_mem=$((total_memory_mb / 64)) # Conservative work_mem local maintenance_work_mem=$((total_memory_mb / 16)) # For maintenance operations cat > "${postgres_config_dir}/postgresql.conf.optimized" << EOF # PostgreSQL Optimized Configuration # Generated: $(date) # System Memory: ${total_memory_mb}MB # Memory Configuration shared_buffers = ${shared_buffers}MB effective_cache_size = ${effective_cache_size}MB work_mem = ${work_mem}MB maintenance_work_mem = ${maintenance_work_mem}MB # Storage and I/O Configuration wal_buffers = 16MB checkpoint_completion_target = 0.9 checkpoint_timeout = 15min max_wal_size = 2GB min_wal_size = 512MB # Connection Configuration max_connections = 200 shared_preload_libraries = 'pg_stat_statements' # Query Planning default_statistics_target = 100 random_page_cost = 1.1 # Optimized for SSD seq_page_cost = 1.0 # Write-Ahead Logging wal_compression = on wal_log_hints = on full_page_writes = on # Performance and Monitoring track_activities = on track_counts = on track_io_timing = on track_functions = all log_min_duration_statement = 1000 # Log queries over 1 
second # Vacuum and Autovacuum autovacuum = on autovacuum_max_workers = 3 autovacuum_naptime = 30s autovacuum_vacuum_scale_factor = 0.1 autovacuum_analyze_scale_factor = 0.05 # Replication (if using) wal_level = replica archive_mode = off # Background Writer bgwriter_delay = 200ms bgwriter_lru_maxpages = 100 bgwriter_lru_multiplier = 2.0 EOF # Create Docker Compose override for PostgreSQL cat > "${postgres_config_dir}/docker-compose.postgres-optimized.yml" << 'EOF' version: '3.8' services: postgres: image: postgres:15-alpine container_name: postgres_optimized restart: unless-stopped environment: - POSTGRES_DB=homelab - POSTGRES_USER=postgres - POSTGRES_PASSWORD_FILE=/run/secrets/postgres_password volumes: - postgres_data:/var/lib/postgresql/data - ./postgresql.conf.optimized:/etc/postgresql/postgresql.conf - /dev/shm:/dev/shm # Shared memory optimization command: postgres -c config_file=/etc/postgresql/postgresql.conf deploy: resources: limits: memory: 1G reservations: memory: 512M secrets: - postgres_password networks: - data_network healthcheck: test: ["CMD-SHELL", "pg_isready -U postgres"] interval: 30s timeout: 10s retries: 3 volumes: postgres_data: driver: local driver_opts: type: none o: bind device: /opt/homelab/postgres/data secrets: postgres_password: external: true networks: data_network: external: true EOF log_info "PostgreSQL optimization configuration created" } create_redis_optimizations() { log_info "Creating Redis storage optimizations" local redis_config_dir="${DOCKER_COMPOSE_DIR}/redis" mkdir -p "$redis_config_dir" cat > "${redis_config_dir}/redis.conf.optimized" << 'EOF' # Redis Optimized Configuration # Generated for high-performance home lab setup # Basic Configuration daemonize no port 6379 bind 0.0.0.0 protected-mode yes requirepass changeme_use_docker_secrets # Memory Management maxmemory 256mb maxmemory-policy allkeys-lru maxmemory-samples 5 # Persistence Configuration (balanced between performance and durability) save 900 1 save 300 10 
save 60 10000 stop-writes-on-bgsave-error yes rdbcompression yes rdbchecksum yes dbfilename dump.rdb # AOF Configuration (for better durability) appendonly yes appendfilename "appendonly.aof" appendfsync everysec no-appendfsync-on-rewrite no auto-aof-rewrite-percentage 100 auto-aof-rewrite-min-size 64mb # Network and Connection Configuration timeout 300 tcp-keepalive 300 tcp-backlog 511 # Performance Tuning hash-max-ziplist-entries 512 hash-max-ziplist-value 64 list-max-ziplist-size -2 list-compress-depth 0 set-max-intset-entries 512 zset-max-ziplist-entries 128 zset-max-ziplist-value 64 # Logging loglevel notice logfile "" syslog-enabled no # Advanced Configuration databases 16 latency-monitor-threshold 100 notify-keyspace-events "" # Security rename-command FLUSHDB "" rename-command FLUSHALL "" rename-command DEBUG "" rename-command CONFIG "CONFIG_a83b9c74d0e3f2a1" EOF # Create Docker Compose override for Redis cat > "${redis_config_dir}/docker-compose.redis-optimized.yml" << 'EOF' version: '3.8' services: redis: image: redis:7-alpine container_name: redis_optimized restart: unless-stopped volumes: - redis_data:/data - ./redis.conf.optimized:/etc/redis/redis.conf - /dev/shm:/dev/shm command: redis-server /etc/redis/redis.conf deploy: resources: limits: memory: 512M reservations: memory: 256M networks: - data_network healthcheck: test: ["CMD", "redis-cli", "ping"] interval: 30s timeout: 10s retries: 3 volumes: redis_data: driver: local driver_opts: type: none o: bind device: /opt/homelab/redis/data networks: data_network: external: true EOF log_info "Redis optimization configuration created" } create_influxdb_optimizations() { log_info "Creating InfluxDB storage optimizations" local influxdb_config_dir="${DOCKER_COMPOSE_DIR}/influxdb" mkdir -p "$influxdb_config_dir" cat > "${influxdb_config_dir}/influxdb.conf.optimized" << 'EOF' # InfluxDB Optimized Configuration # Generated for time-series monitoring workloads [meta] dir = "/var/lib/influxdb/meta" 
retention-autocreate = true logging-enabled = true [data] dir = "/var/lib/influxdb/data" wal-dir = "/var/lib/influxdb/wal" # Storage engine settings cache-max-memory-size = "256m" cache-snapshot-memory-size = "25m" cache-snapshot-write-cold-duration = "10m" # Compaction settings compact-full-write-cold-duration = "4h" compact-throughput = "48m" compact-throughput-burst = "48m" # WAL settings wal-fsync-delay = "0s" # Query settings query-timeout = "0s" max-concurrent-queries = 0 # Series and measurement limits max-series-per-database = 1000000 max-values-per-tag = 100000 [coordinator] write-timeout = "10s" max-concurrent-queries = 0 query-timeout = "0s" log-queries-after = "0s" max-select-point = 0 max-select-series = 0 max-select-buckets = 0 [retention] enabled = true check-interval = "30m" [shard-precreation] enabled = true check-interval = "10m" advance-period = "30m" [monitor] store-enabled = true store-database = "_internal" store-interval = "10s" [admin] enabled = false [http] enabled = true bind-address = ":8086" auth-enabled = false log-enabled = true write-tracing = false pprof-enabled = true debug-pprof-enabled = false https-enabled = false [logging] format = "auto" level = "info" suppress-logo = false [[graphite]] enabled = false [[collectd]] enabled = false [[opentsdb]] enabled = false [[udp]] enabled = false EOF # Create Docker Compose override for InfluxDB cat > "${influxdb_config_dir}/docker-compose.influxdb-optimized.yml" << 'EOF' version: '3.8' services: influxdb: image: influxdb:1.8-alpine container_name: influxdb_optimized restart: unless-stopped environment: - INFLUXDB_DB=homelab - INFLUXDB_ADMIN_USER=admin - INFLUXDB_ADMIN_PASSWORD_FILE=/run/secrets/influxdb_admin_password - INFLUXDB_USER=homelab - INFLUXDB_USER_PASSWORD_FILE=/run/secrets/influxdb_user_password volumes: - influxdb_data:/var/lib/influxdb - ./influxdb.conf.optimized:/etc/influxdb/influxdb.conf - /dev/shm:/dev/shm command: influxd -config /etc/influxdb/influxdb.conf ports: - 
"8086:8086" deploy: resources: limits: memory: 1G reservations: memory: 512M secrets: - influxdb_admin_password - influxdb_user_password networks: - monitoring_network healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8086/ping"] interval: 30s timeout: 10s retries: 3 volumes: influxdb_data: driver: local driver_opts: type: none o: bind device: /opt/homelab/influxdb/data secrets: influxdb_admin_password: external: true influxdb_user_password: external: true networks: monitoring_network: external: true EOF log_info "InfluxDB optimization configuration created" } setup_connection_pooling() { log_info "Setting up database connection pooling" local pooling_config_dir="${DOCKER_COMPOSE_DIR}/connection-pooling" mkdir -p "$pooling_config_dir" # Create PgBouncer configuration for PostgreSQL cat > "${pooling_config_dir}/pgbouncer.ini" << 'EOF' [databases] homelab = host=postgres port=5432 dbname=homelab immich = host=postgres port=5432 dbname=immich nextcloud = host=postgres port=5432 dbname=nextcloud [pgbouncer] listen_addr = 0.0.0.0 listen_port = 5432 auth_type = md5 auth_file = /etc/pgbouncer/userlist.txt admin_users = postgres stats_users = postgres # Connection pooling settings pool_mode = transaction max_client_conn = 1000 default_pool_size = 20 min_pool_size = 5 reserve_pool_size = 5 reserve_pool_timeout = 5 max_db_connections = 100 max_user_connections = 100 # Performance settings server_reset_query = DISCARD ALL server_check_query = select 1 server_check_delay = 30 max_packet_size = 2147483647 # Logging log_connections = 1 log_disconnections = 1 log_pooler_errors = 1 # Timeouts server_lifetime = 3600 server_idle_timeout = 600 client_idle_timeout = 0 client_login_timeout = 60 autodb_idle_timeout = 3600 # Security ignore_startup_parameters = extra_float_digits # Advanced application_name_add_host = 1 EOF # Create PgBouncer Docker service cat > "${pooling_config_dir}/docker-compose.pgbouncer.yml" << 'EOF' version: '3.8' services: pgbouncer: image: 
pgbouncer/pgbouncer:latest container_name: pgbouncer restart: unless-stopped environment: - DATABASES_HOST=postgres - DATABASES_PORT=5432 - POOL_MODE=transaction - DEFAULT_POOL_SIZE=20 - MAX_CLIENT_CONN=1000 volumes: - ./pgbouncer.ini:/etc/pgbouncer/pgbouncer.ini - pgbouncer_logs:/var/log/pgbouncer ports: - "6432:5432" depends_on: - postgres deploy: resources: limits: memory: 256M reservations: memory: 128M networks: - data_network healthcheck: test: ["CMD", "psql", "-h", "localhost", "-p", "5432", "-U", "postgres", "-c", "SELECT 1"] interval: 30s timeout: 10s retries: 3 volumes: pgbouncer_logs: networks: data_network: external: true EOF log_info "Database connection pooling configuration created" } configure_filesystem_optimizations() { log_info "Configuring filesystem-level optimizations" # Create filesystem optimization configuration local fs_config_dir="${CONFIG_BACKUP_DIR}/filesystem" mkdir -p "$fs_config_dir" # Optimize ext4 filesystems optimize_ext4_filesystems # Configure swap optimization configure_swap_optimization # Setup transparent huge pages configure_transparent_hugepages # Configure kernel parameters configure_kernel_parameters } optimize_ext4_filesystems() { log_info "Optimizing ext4 filesystems" # Find ext4 filesystems local ext4_filesystems ext4_filesystems=$(findmnt -t ext4 -o TARGET --noheadings) for fs in $ext4_filesystems; do local device device=$(findmnt -n -o SOURCE "$fs") if [[ -b "$device" ]]; then log_info "Optimizing ext4 filesystem: $fs ($device)" # Get current mount options local current_opts current_opts=$(findmnt -n -o OPTIONS "$fs") # Check if we need to remount with optimizations if ! 
echo "$current_opts" | grep -q "noatime"; then log_info "Adding noatime option to $fs" mount -o remount,noatime "$fs" || log_warn "Failed to add noatime to $fs" fi # Update fstab for persistence local fstab_backup="${CONFIG_BACKUP_DIR}/fstab.backup.$(date +%Y%m%d_%H%M%S)" cp /etc/fstab "$fstab_backup" # Add or update noatime in fstab sed -i "s|$device.*ext4.*defaults|$device $fs ext4 defaults,noatime|g" /etc/fstab || \ log_warn "Could not update fstab for $device" fi done log_info "ext4 filesystem optimization completed" } configure_swap_optimization() { log_info "Configuring swap optimization" # Configure swappiness for better performance local current_swappiness current_swappiness=$(cat /proc/sys/vm/swappiness) if [[ "$current_swappiness" -ne 10 ]]; then echo 10 > /proc/sys/vm/swappiness log_info "Set swappiness to 10 (was $current_swappiness)" # Make persistent echo "vm.swappiness=10" >> /etc/sysctl.conf fi # Configure vfs_cache_pressure echo 50 > /proc/sys/vm/vfs_cache_pressure echo "vm.vfs_cache_pressure=50" >> /etc/sysctl.conf log_info "Swap optimization completed" } configure_transparent_hugepages() { log_info "Configuring transparent huge pages" # Disable THP for database workloads (often better performance) if [[ -f /sys/kernel/mm/transparent_hugepage/enabled ]]; then echo never > /sys/kernel/mm/transparent_hugepage/enabled echo never > /sys/kernel/mm/transparent_hugepage/defrag # Make persistent cat >> /etc/rc.local << 'EOF' #!/bin/bash # Disable transparent huge pages for database performance echo never > /sys/kernel/mm/transparent_hugepage/enabled echo never > /sys/kernel/mm/transparent_hugepage/defrag exit 0 EOF chmod +x /etc/rc.local log_info "Disabled transparent huge pages for database workloads" fi } configure_kernel_parameters() { log_info "Configuring kernel parameters for storage performance" # Backup current sysctl local sysctl_backup="${CONFIG_BACKUP_DIR}/sysctl.conf.backup.$(date +%Y%m%d_%H%M%S)" cp /etc/sysctl.conf "$sysctl_backup" # Add 
storage performance optimizations cat >> /etc/sysctl.conf << 'EOF' # Storage Performance Optimizations # Added by storage_performance_optimizer.sh # Virtual Memory settings vm.dirty_ratio = 5 vm.dirty_background_ratio = 2 vm.dirty_expire_centisecs = 3000 vm.dirty_writeback_centisecs = 500 # Network performance (affects storage over network) net.core.rmem_max = 134217728 net.core.wmem_max = 134217728 net.ipv4.tcp_rmem = 4096 87380 134217728 net.ipv4.tcp_wmem = 4096 65536 134217728 net.ipv4.tcp_congestion_control = bbr # File system settings fs.file-max = 2097152 fs.nr_open = 1048576 # Process limits kernel.pid_max = 4194304 EOF # Apply settings sysctl -p log_info "Kernel parameters configured for storage performance" } setup_storage_monitoring() { log_info "Setting up storage performance monitoring" # Create storage monitoring script cat > "${SCRIPT_DIR}/storage_monitor.py" << 'EOF' #!/usr/bin/env python3 """ Storage Performance Monitor Provides Prometheus metrics for storage I/O performance """ import psutil import time import json from http.server import HTTPServer, BaseHTTPRequestHandler import threading import logging import subprocess import os class StorageMonitor: def __init__(self): self.metrics = {} self.update_interval = 10 def get_disk_io_metrics(self): """Get disk I/O metrics using psutil""" try: disk_io = psutil.disk_io_counters(perdisk=True) disk_usage = {} # Get disk usage for each mount point for partition in psutil.disk_partitions(): try: usage = psutil.disk_usage(partition.mountpoint) disk_usage[partition.device] = { 'total': usage.total, 'used': usage.used, 'free': usage.free, 'percent': usage.percent, 'mountpoint': partition.mountpoint, 'fstype': partition.fstype } except PermissionError: continue metrics = { 'disk_io': {}, 'disk_usage': disk_usage, 'timestamp': time.time() } for device, io_stats in disk_io.items(): metrics['disk_io'][device] = { 'read_count': io_stats.read_count, 'write_count': io_stats.write_count, 'read_bytes': 
io_stats.read_bytes, 'write_bytes': io_stats.write_bytes, 'read_time': io_stats.read_time, 'write_time': io_stats.write_time } return metrics except Exception as e: logging.error(f"Error getting disk I/O metrics: {e}") return {} def get_smart_metrics(self): """Get SMART metrics for disk health""" smart_metrics = {} try: # Get list of drives result = subprocess.run(['lsblk', '-o', 'NAME,TYPE', '-n'], capture_output=True, text=True) for line in result.stdout.strip().split('\n'): parts = line.split() if len(parts) >= 2 and parts[1] == 'disk': device = f"/dev/{parts[0]}" try: # Get SMART health status smart_result = subprocess.run( ['smartctl', '-H', device], capture_output=True, text=True ) health_status = 1 if 'PASSED' in smart_result.stdout else 0 smart_metrics[device] = {'health_status': health_status} except Exception as e: logging.warning(f"Could not get SMART data for {device}: {e}") except Exception as e: logging.error(f"Error getting SMART metrics: {e}") return smart_metrics def update_metrics(self): """Update all metrics periodically""" while True: try: self.metrics = { 'disk_metrics': self.get_disk_io_metrics(), 'smart_metrics': self.get_smart_metrics(), 'last_update': time.time() } except Exception as e: logging.error(f"Error updating metrics: {e}") time.sleep(self.update_interval) class MetricsHandler(BaseHTTPRequestHandler): def __init__(self, storage_monitor, *args, **kwargs): self.storage_monitor = storage_monitor super().__init__(*args, **kwargs) def do_GET(self): if self.path == '/metrics': self.send_response(200) self.send_header('Content-type', 'text/plain') self.end_headers() metrics_text = self.generate_prometheus_metrics() self.wfile.write(metrics_text.encode()) elif self.path == '/health': self.send_response(200) self.send_header('Content-type', 'application/json') self.end_headers() self.wfile.write(json.dumps({"status": "healthy"}).encode()) else: self.send_response(404) self.end_headers() def generate_prometheus_metrics(self): """Generate 
Prometheus format metrics""" metrics = [] try: disk_metrics = self.storage_monitor.metrics.get('disk_metrics', {}) # Disk I/O metrics disk_io = disk_metrics.get('disk_io', {}) for device, stats in disk_io.items(): device_label = device.replace('/', '_') metrics.extend([ f'# HELP disk_read_bytes_total Total bytes read from disk', f'# TYPE disk_read_bytes_total counter', f'disk_read_bytes_total{{device="{device}"}} {stats["read_bytes"]}', f'# HELP disk_write_bytes_total Total bytes written to disk', f'# TYPE disk_write_bytes_total counter', f'disk_write_bytes_total{{device="{device}"}} {stats["write_bytes"]}', f'# HELP disk_read_operations_total Total read operations', f'# TYPE disk_read_operations_total counter', f'disk_read_operations_total{{device="{device}"}} {stats["read_count"]}', f'# HELP disk_write_operations_total Total write operations', f'# TYPE disk_write_operations_total counter', f'disk_write_operations_total{{device="{device}"}} {stats["write_count"]}', f'# HELP disk_read_time_ms_total Total time spent reading (ms)', f'# TYPE disk_read_time_ms_total counter', f'disk_read_time_ms_total{{device="{device}"}} {stats["read_time"]}', f'# HELP disk_write_time_ms_total Total time spent writing (ms)', f'# TYPE disk_write_time_ms_total counter', f'disk_write_time_ms_total{{device="{device}"}} {stats["write_time"]}', ]) # Disk usage metrics disk_usage = disk_metrics.get('disk_usage', {}) for device, usage in disk_usage.items(): metrics.extend([ f'# HELP disk_usage_bytes Disk usage in bytes', f'# TYPE disk_usage_bytes gauge', f'disk_usage_bytes{{device="{device}",mountpoint="{usage["mountpoint"]}",fstype="{usage["fstype"]}",type="total"}} {usage["total"]}', f'disk_usage_bytes{{device="{device}",mountpoint="{usage["mountpoint"]}",fstype="{usage["fstype"]}",type="used"}} {usage["used"]}', f'disk_usage_bytes{{device="{device}",mountpoint="{usage["mountpoint"]}",fstype="{usage["fstype"]}",type="free"}} {usage["free"]}', f'# HELP disk_usage_percent Disk usage 
percentage',
                    f'# TYPE disk_usage_percent gauge',
                    f'disk_usage_percent{{device="{device}",mountpoint="{usage["mountpoint"]}"}} {usage["percent"]}',
                ])

            # SMART health metrics
            smart_metrics = self.storage_monitor.metrics.get('smart_metrics', {})
            for device, smart_data in smart_metrics.items():
                metrics.extend([
                    f'# HELP disk_smart_health SMART health status (1=healthy, 0=failing)',
                    f'# TYPE disk_smart_health gauge',
                    f'disk_smart_health{{device="{device}"}} {smart_data["health_status"]}',
                ])
        except Exception as e:
            logging.error(f"Error generating metrics: {e}")
            metrics.append(f'# Error generating metrics: {e}')

        return '\n'.join(metrics)


def main():
    logging.basicConfig(level=logging.INFO)
    storage_monitor = StorageMonitor()

    # Start metrics collection in background
    metrics_thread = threading.Thread(target=storage_monitor.update_metrics, daemon=True)
    metrics_thread.start()

    # Create handler with storage_monitor
    def handler(*args, **kwargs):
        return MetricsHandler(storage_monitor, *args, **kwargs)

    # Start HTTP server
    server = HTTPServer(('0.0.0.0', 9102), handler)
    print("Storage metrics server started on port 9102")
    server.serve_forever()


if __name__ == '__main__':
    main()
EOF

    chmod +x "${SCRIPT_DIR}/storage_monitor.py"

    # Create systemd service (unquoted EOF on purpose: ${SCRIPT_DIR} must expand now)
    cat > "/etc/systemd/system/storage-monitor.service" << EOF
[Unit]
Description=Storage Performance Monitor
After=network.target

[Service]
Type=simple
User=root
WorkingDirectory=${SCRIPT_DIR}
ExecStart=/usr/bin/python3 ${SCRIPT_DIR}/storage_monitor.py
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

    # Enable and start the service
    systemctl daemon-reload
    systemctl enable storage-monitor.service
    systemctl start storage-monitor.service

    log_info "Storage monitoring setup completed"
}

#######################################
# Generate standalone benchmarking and health-check helper scripts next to
# this one. Both generated scripts run under `set -euo pipefail`, so every
# command substitution that may legitimately return non-zero (grep with no
# match, fio failure) is guarded with `|| true`.
# Globals:  SCRIPT_DIR (read)
# Outputs:  ${SCRIPT_DIR}/storage_benchmark.sh, ${SCRIPT_DIR}/storage_health_check.sh
#######################################
create_performance_testing_tools() {
    log_info "Creating storage performance testing tools"

    # Create comprehensive storage benchmark script (quoted 'EOF': no expansion)
    cat > "${SCRIPT_DIR}/storage_benchmark.sh" << 'EOF'
#!/bin/bash
# Storage Performance Benchmark Tool
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"

BENCHMARK_DIR="/tmp/storage_benchmark_$(date +%Y%m%d_%H%M%S)"
RESULTS_DIR="${SCRIPT_DIR}/../results/benchmarks"

# Run all fio workloads and assemble one JSON results document.
run_storage_benchmarks() {
    log_info "Running comprehensive storage benchmarks"

    mkdir -p "$BENCHMARK_DIR" "$RESULTS_DIR"

    local results_file="${RESULTS_DIR}/storage_benchmark_$(date +%Y%m%d_%H%M%S).json"

    {
        echo "{"
        echo " \"benchmark_info\": {"
        echo " \"timestamp\": \"$(date -Iseconds)\","
        echo " \"hostname\": \"$(hostname)\","
        echo " \"benchmark_dir\": \"$BENCHMARK_DIR\""
        echo " },"
        # Sequential read/write tests
        echo " \"sequential_tests\": {"
        run_sequential_tests
        echo " },"
        # Random read/write tests
        echo " \"random_tests\": {"
        run_random_tests
        echo " },"
        # Database-like workload tests
        echo " \"database_tests\": {"
        run_database_tests
        echo " },"
        # Mixed workload tests
        echo " \"mixed_tests\": {"
        run_mixed_tests
        echo " }"
        echo "}"
    } > "$results_file"

    log_info "Benchmark results saved to: $results_file"

    # Cleanup
    rm -rf "$BENCHMARK_DIR"
}

# Sequential 1M-block read/write throughput.
run_sequential_tests() {
    log_info "Running sequential I/O tests"

    # Sequential write test. If fio fails, fall back to JSON null so the
    # assembled results file stays parseable (and set -e does not abort us).
    local seq_write_result
    seq_write_result=$(fio --name=seq-write --rw=write --bs=1M --size=1G \
        --directory="$BENCHMARK_DIR" --numjobs=1 --time_based=0 \
        --output-format=json 2>/dev/null | jq '.jobs[0].write' || true)
    seq_write_result=${seq_write_result:-null}

    # Sequential read test
    local seq_read_result
    seq_read_result=$(fio --name=seq-read --rw=read --bs=1M --size=1G \
        --directory="$BENCHMARK_DIR" --numjobs=1 --time_based=0 \
        --output-format=json 2>/dev/null | jq '.jobs[0].read' || true)
    seq_read_result=${seq_read_result:-null}

    echo " \"sequential_write\": $seq_write_result,"
    echo " \"sequential_read\": $seq_read_result"
}

# Random 4K-block IOPS-style workloads.
run_random_tests() {
    log_info "Running random I/O tests"

    # Random read test (4K blocks)
    local rand_read_result
    rand_read_result=$(fio --name=rand-read --rw=randread --bs=4K --size=500M \
        --directory="$BENCHMARK_DIR" --numjobs=4 --runtime=60 --time_based=1 \
        --output-format=json 2>/dev/null | jq '.jobs[0].read' || true)
    rand_read_result=${rand_read_result:-null}

    # Random write test (4K blocks)
    local rand_write_result
    rand_write_result=$(fio --name=rand-write --rw=randwrite --bs=4K --size=500M \
        --directory="$BENCHMARK_DIR" --numjobs=4 --runtime=60 --time_based=1 \
        --output-format=json 2>/dev/null | jq '.jobs[0].write' || true)
    rand_write_result=${rand_write_result:-null}

    echo " \"random_read_4k\": $rand_read_result,"
    echo " \"random_write_4k\": $rand_write_result"
}

# 8K-block 70/30 read/write mix approximating an OLTP database.
run_database_tests() {
    log_info "Running database-like workload tests"

    # Database-like mixed workload (70% read, 30% write)
    local db_mixed_result
    db_mixed_result=$(fio --name=db-mixed --rw=randrw --rwmixread=70 --bs=8K \
        --size=500M --directory="$BENCHMARK_DIR" --numjobs=8 --runtime=60 \
        --time_based=1 --output-format=json 2>/dev/null | jq '.jobs[0]' || true)
    db_mixed_result=${db_mixed_result:-null}

    echo " \"database_mixed_workload\": $db_mixed_result"
}

# 64K-block 60/40 mix approximating container I/O patterns.
run_mixed_tests() {
    log_info "Running mixed workload tests"

    # Simulate container I/O patterns
    local container_result
    container_result=$(fio --name=container-io --rw=randrw --rwmixread=60 \
        --bs=64K --size=500M --directory="$BENCHMARK_DIR" --numjobs=2 \
        --runtime=60 --time_based=1 --output-format=json 2>/dev/null | jq '.jobs[0]' || true)
    container_result=${container_result:-null}

    echo " \"container_workload\": $container_result"
}

# Run benchmarks if called directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    run_storage_benchmarks
fi
EOF

    chmod +x "${SCRIPT_DIR}/storage_benchmark.sh"

    # Create storage health check script
    cat > "${SCRIPT_DIR}/storage_health_check.sh" << 'EOF'
#!/bin/bash
# Storage Health Check Tool
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"

# Run every health probe and write one timestamped text report.
check_storage_health() {
    log_info "Performing storage health check"

    local health_report="${SCRIPT_DIR}/../reports/storage_health_$(date +%Y%m%d_%H%M%S).txt"
    mkdir -p "$(dirname "$health_report")"

    {
        echo "Storage Health Check Report"
        echo "Generated: $(date)"
        echo "=========================="
        echo
        check_disk_space
        check_smart_status
        check_filesystem_errors
        check_io_performance
        check_docker_volumes
    } > "$health_report"

    log_info "Health check report saved to: $health_report"
}

check_disk_space() {
    echo "=== Disk Space Check ==="
    df -h | grep -E "(Filesystem|/dev/)"
    echo

    # Check for critical space usage (>90%). $5+0 forces a numeric compare
    # after gsub turns the field into a plain string.
    local critical_mounts
    critical_mounts=$(df -h | awk 'NR>1 {gsub("%","",$5); if($5+0 > 90) print $6 " (" $5 "%)"}')

    if [[ -n "$critical_mounts" ]]; then
        echo "WARNING: Critical disk space usage detected:"
        echo "$critical_mounts"
    else
        echo "Disk space usage: OK"
    fi
    echo
}

check_smart_status() {
    echo "=== SMART Health Check ==="
    for drive in /dev/sd* /dev/nvme*; do
        # Skip partitions only (sda1, nvme0n1p2). The old "ends with a digit"
        # test wrongly excluded every NVMe namespace device (nvme0n1).
        if [[ -b "$drive" && ! "$drive" =~ (sd[a-z]+[0-9]+|p[0-9]+)$ ]]; then
            echo "Drive: $drive"
            if smartctl -H "$drive" 2>/dev/null | grep -q "PASSED"; then
                echo " Status: HEALTHY"
            else
                echo " Status: WARNING - Check SMART details"
            fi
        fi
    done
    echo
}

check_filesystem_errors() {
    echo "=== Filesystem Error Check ==="

    # Check dmesg for filesystem errors. "No match" is the good case, so the
    # grep pipeline must not abort the script under set -euo pipefail.
    local fs_errors
    fs_errors=$(dmesg | grep -i "error\|fail\|corrupt" | grep -E "(ext4|xfs|btrfs)" | tail -5 || true)

    if [[ -n "$fs_errors" ]]; then
        echo "Recent filesystem errors found:"
        echo "$fs_errors"
    else
        echo "No recent filesystem errors found"
    fi
    echo
}

check_io_performance() {
    echo "=== I/O Performance Check ==="

    # Quick I/O test. conv=fdatasync flushes to disk so the write figure is
    # not just page-cache speed; the grep pattern accepts kB/MB/GB units
    # (GNU dd picks the unit itself) and must not abort on no match.
    local test_file="/tmp/io_test_$$"
    local write_speed read_speed

    write_speed=$(dd if=/dev/zero of="$test_file" bs=1M count=100 conv=fdatasync 2>&1 | grep -o '[0-9.,]\+ [KMG]B/s' | tail -1 || true)
    read_speed=$(dd if="$test_file" of=/dev/null bs=1M 2>&1 | grep -o '[0-9.,]\+ [KMG]B/s' | tail -1 || true)

    echo "Sequential write speed: ${write_speed:-Unknown}"
    echo "Sequential read speed: ${read_speed:-Unknown}"

    rm -f "$test_file"
    echo
}

check_docker_volumes() {
    echo "=== Docker Volume Health ==="
    if command -v docker &>/dev/null; then
        echo "Docker volumes:"
        docker volume ls --format "table {{.Name}}\t{{.Driver}}\t{{.Scope}}"
        echo

        # Check for dangling volumes
        local dangling_volumes
        dangling_volumes=$(docker volume ls -qf dangling=true)

        if [[ -n "$dangling_volumes" ]]; then
            echo "Dangling volumes found: $dangling_volumes"
        else
            echo "No dangling volumes found"
        fi
    else
        echo "Docker not available"
    fi
    echo
}

# Run health check if called directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    check_storage_health
fi
EOF

    chmod +x "${SCRIPT_DIR}/storage_health_check.sh"

    log_info "Storage performance testing tools created"
}

#######################################
# Generate and immediately run a helper script that creates local-driver
# Docker volumes bound to /opt/homelab paths. Host directories are created
# and permissioned BEFORE the volume is declared, so the first container
# mount cannot fail on a missing bind target.
# Globals:  SCRIPT_DIR (read)
#######################################
optimize_docker_volumes() {
    log_info "Optimizing Docker volume configurations"

    # Create optimized volume creation script
    cat > "${SCRIPT_DIR}/create_optimized_volumes.sh" << 'EOF'
#!/bin/bash
# Create Optimized Docker Volumes
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"

create_optimized_volumes() {
    log_info "Creating optimized Docker volumes"

    # Database volumes with specific optimizations
    create_database_volumes

    # Media volumes with large file optimizations
    create_media_volumes

    # Cache volumes with performance optimizations
    create_cache_volumes

    # Backup volumes with compression support
    create_backup_volumes
}

create_database_volumes() {
    log_info "Creating optimized database volumes"

    # PostgreSQL data volume. The bind target must exist (with the right
    # owner) before any container mounts the volume, so prepare it first.
    if ! docker volume inspect postgres_data &>/dev/null; then
        mkdir -p /opt/homelab/postgres/data
        chown 999:999 /opt/homelab/postgres/data
        chmod 700 /opt/homelab/postgres/data

        docker volume create postgres_data \
            --driver local \
            --opt type=none \
            --opt o=bind \
            --opt device=/opt/homelab/postgres/data

        log_info "Created PostgreSQL data volume with optimized permissions"
    fi

    # Redis data volume
    if ! docker volume inspect redis_data &>/dev/null; then
        mkdir -p /opt/homelab/redis/data
        chown 999:999 /opt/homelab/redis/data
        chmod 755 /opt/homelab/redis/data

        docker volume create redis_data \
            --driver local \
            --opt type=none \
            --opt o=bind \
            --opt device=/opt/homelab/redis/data

        log_info "Created Redis data volume"
    fi

    # InfluxDB data volume
    if ! docker volume inspect influxdb_data &>/dev/null; then
        mkdir -p /opt/homelab/influxdb/data
        chown 1000:1000 /opt/homelab/influxdb/data
        chmod 755 /opt/homelab/influxdb/data

        docker volume create influxdb_data \
            --driver local \
            --opt type=none \
            --opt o=bind \
            --opt device=/opt/homelab/influxdb/data

        log_info "Created InfluxDB data volume"
    fi
}

create_media_volumes() {
    log_info "Creating optimized media volumes"

    # Jellyfin media volume
    if ! docker volume inspect jellyfin_config &>/dev/null; then
        mkdir -p /opt/homelab/jellyfin/config
        chown 1000:1000 /opt/homelab/jellyfin/config
        chmod 755 /opt/homelab/jellyfin/config

        docker volume create jellyfin_config \
            --driver local \
            --opt type=none \
            --opt o=bind \
            --opt device=/opt/homelab/jellyfin/config
    fi

    # Immich upload volume
    if ! docker volume inspect immich_uploads &>/dev/null; then
        mkdir -p /opt/homelab/immich/uploads
        chown 1000:1000 /opt/homelab/immich/uploads
        chmod 755 /opt/homelab/immich/uploads

        docker volume create immich_uploads \
            --driver local \
            --opt type=none \
            --opt o=bind \
            --opt device=/opt/homelab/immich/uploads
    fi
}

create_cache_volumes() {
    log_info "Creating optimized cache volumes"

    # Temporary/cache volume (could use tmpfs for performance)
    if ! docker volume inspect app_cache &>/dev/null; then
        # Use tmpfs for high-performance caching if enough RAM available
        local total_memory_kb
        total_memory_kb=$(grep MemTotal /proc/meminfo | awk '{print $2}')
        local total_memory_gb=$((total_memory_kb / 1024 / 1024))

        if [[ $total_memory_gb -gt 8 ]]; then
            # Use tmpfs for caching on systems with > 8GB RAM
            docker volume create app_cache \
                --driver local \
                --opt type=tmpfs \
                --opt device=tmpfs \
                --opt o=size=1G,uid=1000,gid=1000

            log_info "Created tmpfs cache volume (1GB)"
        else
            # Use regular bind mount for caching
            mkdir -p /opt/homelab/cache
            chmod 755 /opt/homelab/cache

            docker volume create app_cache \
                --driver local \
                --opt type=none \
                --opt o=bind \
                --opt device=/opt/homelab/cache

            log_info "Created filesystem cache volume"
        fi
    fi
}

create_backup_volumes() {
    log_info "Creating optimized backup volumes"

    # Backup volume with compression support
    if ! docker volume inspect backup_data &>/dev/null; then
        mkdir -p /opt/homelab/backups
        chmod 755 /opt/homelab/backups

        docker volume create backup_data \
            --driver local \
            --opt type=none \
            --opt o=bind \
            --opt device=/opt/homelab/backups

        log_info "Created backup data volume"
    fi
}

# Run volume creation if called directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    create_optimized_volumes
fi
EOF

    chmod +x "${SCRIPT_DIR}/create_optimized_volumes.sh"

    # Run the volume optimization
    "${SCRIPT_DIR}/create_optimized_volumes.sh"

    log_info "Docker volume optimization completed"
}

#######################################
# EXIT-trap cleanup (registered via register_cleanup in main): remove
# temporary benchmark and I/O-test files; never fail the trap itself.
#######################################
cleanup_on_exit() {
    log_info "Cleaning up storage optimization resources"

    # Remove any temporary benchmark files
    rm -rf /tmp/storage_benchmark_* /tmp/io_test_* 2>/dev/null || true

    log_info "Storage optimization cleanup completed"
}

# Execute main function
main "$@"