#!/bin/bash
#
# GPU Passthrough Optimizer for Media Services
# Configures GPU acceleration for Jellyfin and Immich
# Part of the Migration Issues Resolution Framework
#
# Must run as root. Requires: docker, nvidia-smi, lspci, modinfo.
set -euo pipefail

# Source the error handling library
# (provides init_logging, log_info/log_warn/log_error, register_cleanup)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"

# Configuration
# NOTE(review): assumes init_logging creates the logs/ directory — TODO confirm.
readonly LOG_FILE="${SCRIPT_DIR}/../logs/gpu_optimization_$(date +%Y%m%d_%H%M%S).log"
readonly CONFIG_BACKUP_DIR="${SCRIPT_DIR}/../backups/gpu_configs"
readonly DOCKER_COMPOSE_DIR="${SCRIPT_DIR}/../../"

# Initialize logging
init_logging "$LOG_FILE"

#######################################
# Orchestrates the full GPU optimization workflow.
# Globals:   GPU_TYPE, GPU_VENDOR (set by detect_gpu_hardware)
# Arguments: none
#######################################
main() {
  log_info "Starting GPU passthrough optimization for media services"

  # Register cleanup function so test containers are stopped on any exit path.
  register_cleanup cleanup_on_exit

  validate_prerequisites
  detect_gpu_hardware
  configure_docker_gpu
  configure_jellyfin_gpu
  configure_immich_gpu
  update_docker_compose_configs
  test_gpu_acceleration
  setup_gpu_monitoring

  log_info "GPU passthrough optimization completed successfully"
}

#######################################
# Verifies required commands exist, we are root, and Docker is reachable.
# Exits non-zero on any missing prerequisite.
#######################################
validate_prerequisites() {
  log_info "Validating GPU optimization prerequisites"

  local required_commands=(
    "docker"
    "nvidia-smi"
    "lspci"
    "modinfo"
  )

  local cmd
  for cmd in "${required_commands[@]}"; do
    if ! command -v "$cmd" &>/dev/null; then
      log_error "Required command not found: $cmd"
      exit 1
    fi
  done

  # Check if running as root or with sudo
  if [[ $EUID -ne 0 ]]; then
    log_error "This script must be run as root or with sudo"
    exit 1
  fi

  # Verify Docker daemon is running and accessible
  if ! docker info &>/dev/null; then
    log_error "Docker is not running or accessible"
    exit 1
  fi

  log_info "Prerequisites validation completed"
}

#######################################
# Writes a hardware detection report and classifies the GPU.
# Globals:   GPU_TYPE, GPU_VENDOR (exported: nvidia|amd|intel|software)
#######################################
detect_gpu_hardware() {
  log_info "Detecting GPU hardware configuration"

  # Create GPU detection report
  local gpu_report="${CONFIG_BACKUP_DIR}/gpu_detection_$(date +%Y%m%d_%H%M%S).txt"
  mkdir -p "$(dirname "$gpu_report")"

  {
    echo "GPU Hardware Detection Report"
    echo "Generated: $(date)"
    echo "==============================="
    echo
    echo "PCI GPU Devices:"
    lspci | grep -i vga || echo "No VGA devices found"
    lspci | grep -i nvidia || echo "No NVIDIA devices found"
    lspci | grep -i amd || echo "No AMD devices found"
    lspci | grep -i intel || echo "No Intel GPU devices found"
    echo

    if command -v nvidia-smi &>/dev/null; then
      echo "NVIDIA GPU Status:"
      nvidia-smi || echo "NVIDIA SMI not available"
      echo
    fi

    echo "GPU-related kernel modules:"
    lsmod | grep -E "(nvidia|nouveau|amdgpu|radeon|i915)" || echo "No GPU modules loaded"
    echo

    if [[ -d /dev/dri ]]; then
      echo "DRI devices:"
      ls -la /dev/dri/
      echo
    fi

    if [[ -e /dev/nvidia0 ]]; then
      echo "NVIDIA devices:"
      ls -la /dev/nvidia*
      echo
    fi
  } > "$gpu_report"

  log_info "GPU detection report saved to: $gpu_report"

  # Determine GPU type and capabilities.
  # NVIDIA is probed first via nvidia-smi (most specific signal); the lspci
  # greps are coarse and may match non-GPU devices from the same vendor.
  if nvidia-smi &>/dev/null; then
    GPU_TYPE="nvidia"
    GPU_VENDOR="NVIDIA"
    log_info "NVIDIA GPU detected"
  elif lspci | grep -qi amd; then
    GPU_TYPE="amd"
    GPU_VENDOR="AMD"
    log_info "AMD GPU detected"
  elif lspci | grep -qi intel; then
    GPU_TYPE="intel"
    GPU_VENDOR="Intel"
    log_info "Intel GPU detected"
  else
    log_warn "No supported GPU detected - using software encoding"
    GPU_TYPE="software"
    GPU_VENDOR="Software"
  fi

  export GPU_TYPE GPU_VENDOR
}

#######################################
# Dispatches to the vendor-specific Docker GPU configuration.
# Globals:   GPU_TYPE (read)
#######################################
configure_docker_gpu() {
  log_info "Configuring Docker for GPU access"

  case "$GPU_TYPE" in
    "nvidia")
      configure_nvidia_docker
      ;;
    "amd")
      configure_amd_docker
      ;;
    "intel")
      configure_intel_docker
      ;;
    *)
      log_warn "No GPU-specific Docker configuration needed"
      ;;
  esac
}

#######################################
# Installs nvidia-docker2 if missing and sets nvidia as the default runtime.
# Backs up /etc/docker/daemon.json before overwriting it.
#######################################
configure_nvidia_docker() {
  log_info "Configuring NVIDIA Docker support"

  # Check if nvidia-docker2 is installed.
  # dpkg -s (not `dpkg -l | grep`) so removed-but-not-purged packages and
  # substring matches do not count as "installed".
  if ! dpkg -s nvidia-docker2 &>/dev/null; then
    log_info "Installing NVIDIA Docker support"

    # Add NVIDIA Docker repository.
    # TODO(review): apt-key is deprecated; migrate to a signed-by keyring.
    local distribution
    distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
    curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
    curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | \
      tee /etc/apt/sources.list.d/nvidia-docker.list

    apt-get update
    apt-get install -y nvidia-docker2

    # Restart Docker so the new runtime is registered
    systemctl restart docker

    log_info "NVIDIA Docker support installed"
  fi

  # Configure Docker daemon for NVIDIA
  local docker_daemon_config="/etc/docker/daemon.json"
  local backup_file="${CONFIG_BACKUP_DIR}/daemon.json.backup.$(date +%Y%m%d_%H%M%S)"

  mkdir -p "$(dirname "$backup_file")"

  if [[ -f "$docker_daemon_config" ]]; then
    cp "$docker_daemon_config" "$backup_file"
    log_info "Docker daemon config backed up to: $backup_file"
  fi

  # Create or update daemon.json.
  # NOTE(review): this replaces (not merges) any existing daemon.json; the
  # previous content is only preserved in the backup above.
  cat > "$docker_daemon_config" << 'EOF'
{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  },
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  }
}
EOF

  # Restart Docker to apply changes
  systemctl restart docker

  # Verify NVIDIA Docker works.
  # NOTE(review): the cuda:11.0-base tag may no longer be published on Docker
  # Hub — confirm and pin a currently available tag.
  if docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi; then
    log_info "NVIDIA Docker configuration verified successfully"
  else
    log_error "NVIDIA Docker test failed"
    exit 1
  fi
}

#######################################
# Opens up /dev/dri for container use and smoke-tests device access (AMD).
#######################################
configure_amd_docker() {
  log_info "Configuring AMD GPU Docker support"

  # Ensure proper device access.
  # NOTE(review): world-writable render nodes are very permissive; consider
  # adding the container user to the 'render' group instead.
  if [[ -d /dev/dri ]]; then
    chmod 666 /dev/dri/*
    log_info "DRI device permissions configured"
  fi

  # Test AMD GPU access
  docker run --rm --device=/dev/dri ubuntu:20.04 ls -la /dev/dri/ || {
    log_error "AMD GPU device access test failed"
    exit 1
  }

  log_info "AMD GPU Docker configuration completed"
}

#######################################
# Opens up /dev/dri for container use and loads i915 (Intel Quick Sync).
#######################################
configure_intel_docker() {
  log_info "Configuring Intel GPU Docker support"

  # Ensure proper device access for Intel Quick Sync.
  # NOTE(review): see the permissions caveat in configure_amd_docker.
  if [[ -d /dev/dri ]]; then
    chmod 666 /dev/dri/*
    log_info "Intel GPU device permissions configured"
  fi

  # Load Intel GPU drivers if needed
  modprobe i915 || log_warn "Failed to load i915 module"

  log_info "Intel GPU Docker configuration completed"
}

#######################################
# Dispatches to the vendor-specific Jellyfin compose generator.
# Globals:   GPU_TYPE (read), DOCKER_COMPOSE_DIR (read)
#######################################
configure_jellyfin_gpu() {
  log_info "Configuring Jellyfin GPU acceleration"

  local jellyfin_config="${DOCKER_COMPOSE_DIR}/jellyfin"
  mkdir -p "$jellyfin_config"

  # Create Jellyfin GPU configuration
  case "$GPU_TYPE" in
    "nvidia")
      create_jellyfin_nvidia_config
      ;;
    "amd")
      create_jellyfin_amd_config
      ;;
    "intel")
      create_jellyfin_intel_config
      ;;
    *)
      create_jellyfin_software_config
      ;;
  esac
}

#######################################
# Writes the NVENC-enabled Jellyfin compose file and encoding settings.
# The here-doc is quoted so ${DOMAIN} is substituted by docker-compose from
# its environment, not by this script.
#######################################
create_jellyfin_nvidia_config() {
  log_info "Creating Jellyfin NVIDIA configuration"

  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    ports:
      - "8096:8096"
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

  # Create Jellyfin encoding configuration.
  # NOTE(review): this payload appears to have lost its XML markup in a
  # previous edit (only element values remain) — TODO restore the original
  # <EncodingOptions> document before relying on it.
  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/encoding.xml" << 'EOF'
/config/transcoding-temp /usr/share/fonts/truetype/dejavu/DejaVuSans.ttf false false 2 2048 false 180 nvenc /dev/dri/renderD128 false false hable auto 0 0.8 100 0 23 28 false yadif true true true false false false true true true
EOF

  log_info "Jellyfin NVIDIA configuration created"
}

#######################################
# Writes the VA-API (AMD) Jellyfin compose file (passes /dev/dri through).
#######################################
create_jellyfin_amd_config() {
  log_info "Creating Jellyfin AMD configuration"

  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "8096:8096"
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

  log_info "Jellyfin AMD configuration created"
}

#######################################
# Writes the Quick Sync (Intel) Jellyfin compose file (passes /dev/dri through).
#######################################
create_jellyfin_intel_config() {
  log_info "Creating Jellyfin Intel configuration"

  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "8096:8096"
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

  log_info "Jellyfin Intel configuration created"
}

#######################################
# Writes the CPU-only Jellyfin compose file with resource limits.
#######################################
create_jellyfin_software_config() {
  log_info "Creating Jellyfin software encoding configuration"

  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    ports:
      - "8096:8096"
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G
        reservations:
          cpus: '2'
          memory: 2G
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

  log_info "Jellyfin software encoding configuration created"
}

#######################################
# Dispatches to the vendor-specific Immich compose generator.
# Globals:   GPU_TYPE (read), DOCKER_COMPOSE_DIR (read)
#######################################
configure_immich_gpu() {
  log_info "Configuring Immich GPU acceleration"

  local immich_config="${DOCKER_COMPOSE_DIR}/immich"
  mkdir -p "$immich_config"

  # Create Immich GPU configuration
  case "$GPU_TYPE" in
    "nvidia")
      create_immich_nvidia_config
      ;;
    "amd"|"intel")
      create_immich_vaapi_config
      ;;
    *)
      create_immich_software_config
      ;;
  esac
}

#######################################
# Writes the NVIDIA-accelerated Immich stack (server, microservices, ML,
# redis, postgres). ${DOMAIN}/${UPLOAD_LOCATION} are left for docker-compose.
#######################################
create_immich_nvidia_config() {
  log_info "Creating Immich NVIDIA configuration"

  cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    ports:
      - "2283:3001"
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF

  log_info "Immich NVIDIA configuration created"
}

#######################################
# Writes the VA-API (AMD/Intel) Immich stack, passing /dev/dri to the
# server and microservices containers.
#######################################
create_immich_vaapi_config() {
  log_info "Creating Immich VA-API configuration"

  cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "2283:3001"
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF

  log_info "Immich VA-API configuration created"
}

#######################################
# Writes the CPU-only Immich stack with resource limits.
#######################################
create_immich_software_config() {
  log_info "Creating Immich software processing configuration"

  cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    ports:
      - "2283:3001"
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '1'
          memory: 1G
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G
        reservations:
          cpus: '2'
          memory: 2G
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 4G
        reservations:
          cpus: '1'
          memory: 2G
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF

  log_info "Immich software processing configuration created"
}

#######################################
# Backs up the main compose file, writes the GPU override file, and creates
# the deploy-with-gpu.sh helper script.
#######################################
update_docker_compose_configs() {
  log_info "Updating main Docker Compose configurations"

  # Update main docker-compose.yml to include GPU configurations
  local main_compose="${DOCKER_COMPOSE_DIR}/docker-compose.yml"
  local backup_file="${CONFIG_BACKUP_DIR}/docker-compose.yml.backup.$(date +%Y%m%d_%H%M%S)"

  if [[ -f "$main_compose" ]]; then
    mkdir -p "$(dirname "$backup_file")"
    cp "$main_compose" "$backup_file"
    log_info "Main Docker Compose backed up to: $backup_file"
  fi

  # Create GPU-enabled Docker Compose override
  cat > "${DOCKER_COMPOSE_DIR}/docker-compose.gpu-override.yml" << 'EOF'
version: '3.8'

# GPU Override Configuration
# This file extends the main docker-compose.yml with GPU acceleration
services:
  jellyfin:
    extends:
      file: ./jellyfin/docker-compose.gpu.yml
      service: jellyfin

  immich-server:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-server

  immich-microservices:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-microservices

  immich-machine-learning:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-machine-learning
EOF

  # Create deployment script with GPU support
  cat > "${DOCKER_COMPOSE_DIR}/deploy-with-gpu.sh" << 'EOF'
#!/bin/bash
# Deployment script with GPU acceleration support
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Source environment variables
if [[ -f .env ]]; then
  source .env
fi

# Deploy with GPU override
echo "Deploying services with GPU acceleration..."
docker-compose -f docker-compose.yml -f docker-compose.gpu-override.yml up -d

# Verify deployment
echo "Verifying GPU-accelerated services..."
docker-compose ps

echo "GPU-enabled deployment completed successfully!"
EOF

  chmod +x "${DOCKER_COMPOSE_DIR}/deploy-with-gpu.sh"

  log_info "Docker Compose GPU configurations updated"
}

#######################################
# Dispatches to the vendor-specific acceleration smoke test.
# Globals:   GPU_TYPE (read)
#######################################
test_gpu_acceleration() {
  log_info "Testing GPU acceleration functionality"

  case "$GPU_TYPE" in
    "nvidia")
      test_nvidia_acceleration
      ;;
    "amd"|"intel")
      test_vaapi_acceleration
      ;;
    *)
      log_info "Software encoding - no GPU tests needed"
      ;;
  esac
}

#######################################
# Smoke-tests the NVIDIA runtime and NVENC encoding in containers.
# Returns:   non-zero if the runtime test fails; the FFMPEG test only warns.
#######################################
test_nvidia_acceleration() {
  log_info "Testing NVIDIA GPU acceleration"

  # Test NVIDIA Docker runtime
  if docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi; then
    log_info "NVIDIA Docker runtime test: PASSED"
  else
    log_error "NVIDIA Docker runtime test: FAILED"
    return 1
  fi

  # Test FFMPEG with NVIDIA acceleration (10s synthetic 1080p clip -> NVENC)
  local test_output="${CONFIG_BACKUP_DIR}/nvidia_ffmpeg_test.log"
  if docker run --rm --gpus all \
    -v /tmp:/tmp \
    jrottenberg/ffmpeg:4.4-nvidia \
    -f lavfi -i testsrc2=duration=10:size=1920x1080:rate=30 \
    -c:v h264_nvenc -preset fast -f null - &> "$test_output"; then
    log_info "NVIDIA FFMPEG acceleration test: PASSED"
  else
    log_warn "NVIDIA FFMPEG acceleration test: FAILED - check $test_output"
  fi
}

#######################################
# Smoke-tests DRI device passthrough and VA-API encoding in containers.
# Returns:   non-zero if device access fails; the VAAPI test only warns.
#######################################
test_vaapi_acceleration() {
  log_info "Testing VA-API acceleration"

  # Test DRI device access
  if docker run --rm --device=/dev/dri ubuntu:20.04 ls -la /dev/dri/; then
    log_info "DRI device access test: PASSED"
  else
    log_error "DRI device access test: FAILED"
    return 1
  fi

  # Test VAAPI functionality (10s synthetic 1080p clip -> h264_vaapi)
  local test_output="${CONFIG_BACKUP_DIR}/vaapi_test.log"
  if docker run --rm \
    --device=/dev/dri \
    jrottenberg/ffmpeg:4.4-vaapi \
    -vaapi_device /dev/dri/renderD128 \
    -f lavfi -i testsrc2=duration=10:size=1920x1080:rate=30 \
    -vf 'format=nv12,hwupload' \
    -c:v h264_vaapi -f null - &> "$test_output"; then
    log_info "VA-API acceleration test: PASSED"
  else
    log_warn "VA-API acceleration test: FAILED - check $test_output"
  fi
}

#######################################
# Installs a Prometheus GPU exporter (Python, port 9101), a systemd unit for
# it, and registers the scrape target in prometheus.yml if present.
#######################################
setup_gpu_monitoring() {
  log_info "Setting up GPU monitoring"

  # Create GPU monitoring script
  cat > "${SCRIPT_DIR}/gpu_monitor.py" << 'EOF'
#!/usr/bin/env python3
"""
GPU Monitoring Script
Provides Prometheus metrics for GPU utilization
"""
import time
import subprocess
import json
from http.server import HTTPServer, BaseHTTPRequestHandler
import threading
import logging


class GPUMonitor:
    def __init__(self):
        self.gpu_type = self.detect_gpu_type()
        self.metrics = {}

    def detect_gpu_type(self):
        try:
            subprocess.run(['nvidia-smi'], capture_output=True, check=True)
            return 'nvidia'
        except (OSError, subprocess.CalledProcessError):
            pass

        # Check for AMD GPU
        result = subprocess.run(['lspci'], capture_output=True, text=True)
        if 'AMD' in result.stdout:
            return 'amd'
        elif 'Intel' in result.stdout and 'VGA' in result.stdout:
            return 'intel'
        return 'software'

    def get_nvidia_metrics(self):
        try:
            result = subprocess.run([
                'nvidia-smi',
                '--query-gpu=utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)

            lines = result.stdout.strip().split('\n')
            gpu_metrics = []
            for i, line in enumerate(lines):
                values = [x.strip() for x in line.split(',')]
                gpu_metrics.append({
                    'gpu_id': i,
                    'utilization_gpu': float(values[0]),
                    'utilization_memory': float(values[1]),
                    'memory_used': float(values[2]),
                    'memory_total': float(values[3]),
                    'temperature': float(values[4])
                })
            return gpu_metrics
        except Exception as e:
            logging.error(f"Error getting NVIDIA metrics: {e}")
            return []

    def get_system_metrics(self):
        # Fallback system metrics
        return [{
            'gpu_id': 0,
            'utilization_gpu': 0,
            'utilization_memory': 0,
            'memory_used': 0,
            'memory_total': 0,
            'temperature': 0
        }]

    def collect_metrics(self):
        if self.gpu_type == 'nvidia':
            return self.get_nvidia_metrics()
        else:
            return self.get_system_metrics()

    def update_metrics(self):
        while True:
            try:
                self.metrics = self.collect_metrics()
            except Exception as e:
                logging.error(f"Error updating metrics: {e}")
            time.sleep(10)  # Update every 10 seconds


class MetricsHandler(BaseHTTPRequestHandler):
    def __init__(self, gpu_monitor, *args, **kwargs):
        self.gpu_monitor = gpu_monitor
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path == '/metrics':
            self.send_response(200)
            self.send_header('Content-type', 'text/plain')
            self.end_headers()
            metrics_text = self.generate_prometheus_metrics()
            self.wfile.write(metrics_text.encode())
        else:
            self.send_response(404)
            self.end_headers()

    def generate_prometheus_metrics(self):
        metrics = []
        for gpu in self.gpu_monitor.metrics:
            gpu_id = gpu['gpu_id']
            metrics.extend([
                f'# HELP gpu_utilization_percent GPU utilization percentage',
                f'# TYPE gpu_utilization_percent gauge',
                f'gpu_utilization_percent{{gpu_id="{gpu_id}"}} {gpu["utilization_gpu"]}',
                f'# HELP gpu_memory_utilization_percent GPU memory utilization percentage',
                f'# TYPE gpu_memory_utilization_percent gauge',
                f'gpu_memory_utilization_percent{{gpu_id="{gpu_id}"}} {gpu["utilization_memory"]}',
                f'# HELP gpu_memory_used_mb GPU memory used in MB',
                f'# TYPE gpu_memory_used_mb gauge',
                f'gpu_memory_used_mb{{gpu_id="{gpu_id}"}} {gpu["memory_used"]}',
                f'# HELP gpu_memory_total_mb GPU total memory in MB',
                f'# TYPE gpu_memory_total_mb gauge',
                f'gpu_memory_total_mb{{gpu_id="{gpu_id}"}} {gpu["memory_total"]}',
                f'# HELP gpu_temperature_celsius GPU temperature in Celsius',
                f'# TYPE gpu_temperature_celsius gauge',
                f'gpu_temperature_celsius{{gpu_id="{gpu_id}"}} {gpu["temperature"]}',
            ])
        return '\n'.join(metrics)


def main():
    logging.basicConfig(level=logging.INFO)

    gpu_monitor = GPUMonitor()

    # Start metrics collection in background
    metrics_thread = threading.Thread(target=gpu_monitor.update_metrics, daemon=True)
    metrics_thread.start()

    # Create handler with gpu_monitor
    def handler(*args, **kwargs):
        return MetricsHandler(gpu_monitor, *args, **kwargs)

    # Start HTTP server
    server = HTTPServer(('0.0.0.0', 9101), handler)
    print("GPU metrics server started on port 9101")
    server.serve_forever()


if __name__ == '__main__':
    main()
EOF

  chmod +x "${SCRIPT_DIR}/gpu_monitor.py"

  # Create systemd service for GPU monitoring (unquoted here-doc so
  # ${SCRIPT_DIR} expands now).
  cat > "/etc/systemd/system/gpu-monitor.service" << EOF
[Unit]
Description=GPU Monitoring Service
After=network.target

[Service]
Type=simple
User=root
WorkingDirectory=${SCRIPT_DIR}
ExecStart=/usr/bin/python3 ${SCRIPT_DIR}/gpu_monitor.py
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

  # Enable and start the service
  systemctl daemon-reload
  systemctl enable gpu-monitor.service
  systemctl start gpu-monitor.service

  # Add GPU monitoring to Prometheus configuration
  if [[ -f "${SCRIPT_DIR}/../monitoring/prometheus/prometheus.yml" ]]; then
    log_info "Adding GPU monitoring to Prometheus configuration"

    # Check if GPU target already exists (append is not idempotent otherwise)
    if ! grep -q "gpu-monitor" "${SCRIPT_DIR}/../monitoring/prometheus/prometheus.yml"; then
      cat >> "${SCRIPT_DIR}/../monitoring/prometheus/prometheus.yml" << 'EOF'
  - job_name: 'gpu-monitor'
    static_configs:
      - targets: ['localhost:9101']
    scrape_interval: 15s
    metrics_path: '/metrics'
EOF
    fi
  fi

  log_info "GPU monitoring setup completed"
}

#######################################
# Stops any lingering test containers. Registered via register_cleanup.
#######################################
cleanup_on_exit() {
  log_info "Cleaning up GPU optimization resources"

  # Stop any test containers
  docker ps -q --filter "ancestor=nvidia/cuda:11.0-base" | xargs -r docker stop
  docker ps -q --filter "ancestor=jrottenberg/ffmpeg:4.4-nvidia" | xargs -r docker stop
  docker ps -q --filter "ancestor=jrottenberg/ffmpeg:4.4-vaapi" | xargs -r docker stop

  log_info "GPU optimization cleanup completed"
}

# Execute main function
main "$@"