Files
HomeAudit/migration_scripts/scripts/gpu_passthrough_optimizer.sh
2025-08-24 11:13:39 -04:00

1250 lines
35 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail
# GPU Passthrough Optimizer for Media Services
# Configures GPU acceleration for Jellyfin and Immich
# Part of the Migration Issues Resolution Framework
# Source the error handling library (provides log_info/log_warn/log_error,
# init_logging and register_cleanup used throughout this script)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"
# Configuration: timestamped log file plus a directory where every config
# this script replaces is backed up first.
readonly LOG_FILE="${SCRIPT_DIR}/../logs/gpu_optimization_$(date +%Y%m%d_%H%M%S).log"
readonly CONFIG_BACKUP_DIR="${SCRIPT_DIR}/../backups/gpu_configs"
# NOTE(review): assumes the compose project root is two levels up from this
# script -- confirm against the repository layout.
readonly DOCKER_COMPOSE_DIR="${SCRIPT_DIR}/../../"
# Initialize logging (helper from error_handling.sh; presumably creates the
# log directory -- verify, since LOG_FILE lives under ../logs/)
init_logging "$LOG_FILE"
main() {
# Orchestrates the full optimization flow. Step order matters:
# detect_gpu_hardware exports GPU_TYPE/GPU_VENDOR, which every later
# configure_*/test_* step dispatches on.
log_info "Starting GPU passthrough optimization for media services"
# Register cleanup function (invoked on exit by the error-handling library)
register_cleanup cleanup_on_exit
# Validate prerequisites (root, docker, required CLI tools)
validate_prerequisites
# Detect GPU hardware and set GPU_TYPE / GPU_VENDOR
detect_gpu_hardware
# Configure Docker for GPU access (runtime / device permissions)
configure_docker_gpu
# Configure Jellyfin GPU acceleration (compose overlay + encoding.xml)
configure_jellyfin_gpu
# Configure Immich GPU acceleration (compose overlay)
configure_immich_gpu
# Update Docker Compose configurations (override file + deploy script)
update_docker_compose_configs
# Test GPU acceleration inside containers
test_gpu_acceleration
# Configure GPU monitoring (Prometheus exporter + systemd unit)
setup_gpu_monitoring
log_info "GPU passthrough optimization completed successfully"
}
#######################################
# Validate everything this script needs before touching the system.
# Globals:   EUID (read)
# Outputs:   log messages
# Returns:   exits 1 on any missing prerequisite
#######################################
validate_prerequisites() {
  log_info "Validating GPU optimization prerequisites"

  # Root is needed for daemon.json, /dev node permissions and systemd
  # units -- check it first, before probing anything else.
  if [[ $EUID -ne 0 ]]; then
    log_error "This script must be run as root or with sudo"
    exit 1
  fi

  # nvidia-smi is intentionally NOT required here: detect_gpu_hardware()
  # falls back to the AMD / Intel / software paths when it is absent, so
  # hard-requiring it would break every non-NVIDIA host.
  local required_commands=(
    "docker" "lspci" "modinfo"
  )
  local cmd
  for cmd in "${required_commands[@]}"; do
    if ! command -v "$cmd" &>/dev/null; then
      log_error "Required command not found: $cmd"
      exit 1
    fi
  done

  # Verify the Docker daemon is actually up and reachable.
  if ! docker info &>/dev/null; then
    log_error "Docker is not running or accessible"
    exit 1
  fi

  log_info "Prerequisites validation completed"
}
#######################################
# Detect the GPU vendor and write a diagnostic report.
# Globals:   CONFIG_BACKUP_DIR (read); GPU_TYPE, GPU_VENDOR (exported)
# Outputs:   report file under CONFIG_BACKUP_DIR, log messages
#######################################
detect_gpu_hardware() {
  log_info "Detecting GPU hardware configuration"

  # Human-readable detection report for later troubleshooting.
  local gpu_report="${CONFIG_BACKUP_DIR}/gpu_detection_$(date +%Y%m%d_%H%M%S).txt"
  mkdir -p "$(dirname "$gpu_report")"
  {
    echo "GPU Hardware Detection Report"
    echo "Generated: $(date)"
    echo "==============================="
    echo
    echo "PCI GPU Devices:"
    lspci | grep -i vga || echo "No VGA devices found"
    lspci | grep -i nvidia || echo "No NVIDIA devices found"
    lspci | grep -i amd || echo "No AMD devices found"
    lspci | grep -i intel || echo "No Intel GPU devices found"
    echo
    if command -v nvidia-smi &>/dev/null; then
      echo "NVIDIA GPU Status:"
      nvidia-smi || echo "NVIDIA SMI not available"
      echo
    fi
    echo "GPU-related kernel modules:"
    lsmod | grep -E "(nvidia|nouveau|amdgpu|radeon|i915)" || echo "No GPU modules loaded"
    echo
    if [[ -d /dev/dri ]]; then
      echo "DRI devices:"
      ls -la /dev/dri/
      echo
    fi
    if [[ -e /dev/nvidia0 ]]; then
      echo "NVIDIA devices:"
      ls -la /dev/nvidia*
      echo
    fi
  } > "$gpu_report"
  log_info "GPU detection report saved to: $gpu_report"

  # Classify the GPU. Vendor matching is restricted to display-class PCI
  # devices (VGA / 3D / Display): a bare `lspci | grep -qi amd` would also
  # match an AMD CPU or chipset bridge and misreport an AMD GPU on a host
  # that has none.
  local display_devices
  display_devices="$(lspci | grep -Ei 'vga|3d|display' || true)"
  if nvidia-smi &>/dev/null; then
    GPU_TYPE="nvidia"
    GPU_VENDOR="NVIDIA"
    log_info "NVIDIA GPU detected"
  elif grep -qi amd <<<"$display_devices"; then
    GPU_TYPE="amd"
    GPU_VENDOR="AMD"
    log_info "AMD GPU detected"
  elif grep -qi intel <<<"$display_devices"; then
    GPU_TYPE="intel"
    GPU_VENDOR="Intel"
    log_info "Intel GPU detected"
  else
    log_warn "No supported GPU detected - using software encoding"
    GPU_TYPE="software"
    GPU_VENDOR="Software"
  fi
  export GPU_TYPE GPU_VENDOR
}
#######################################
# Dispatch to the vendor-specific Docker setup routine.
# Globals: GPU_TYPE (read, set by detect_gpu_hardware)
#######################################
configure_docker_gpu() {
  log_info "Configuring Docker for GPU access"
  if [[ "$GPU_TYPE" == "nvidia" ]]; then
    configure_nvidia_docker
  elif [[ "$GPU_TYPE" == "amd" ]]; then
    configure_amd_docker
  elif [[ "$GPU_TYPE" == "intel" ]]; then
    configure_intel_docker
  else
    # Software encoding: nothing Docker-side to change.
    log_warn "No GPU-specific Docker configuration needed"
  fi
}
#######################################
# Install and configure the NVIDIA container runtime.
# Globals:   CONFIG_BACKUP_DIR (read)
# Returns:   exits 1 if the post-install GPU smoke test fails
#######################################
configure_nvidia_docker() {
  log_info "Configuring NVIDIA Docker support"

  # Install nvidia-docker2 only if it is not present yet.
  if ! dpkg -l | grep -q nvidia-docker2; then
    log_info "Installing NVIDIA Docker support"
    # Add the NVIDIA Docker repository. -f makes curl fail on HTTP errors
    # so a 404/5xx page is never piped into apt-key or the sources list
    # (plain -s -L would silently install garbage).
    local distribution
    distribution="$(. /etc/os-release; echo "$ID$VERSION_ID")"
    curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
    curl -fsSL "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | \
      tee /etc/apt/sources.list.d/nvidia-docker.list
    apt-get update
    apt-get install -y nvidia-docker2
    # Restart Docker so it picks up the new runtime.
    systemctl restart docker
    log_info "NVIDIA Docker support installed"
  fi

  # Back up any existing daemon.json before replacing it.
  local docker_daemon_config="/etc/docker/daemon.json"
  local backup_file="${CONFIG_BACKUP_DIR}/daemon.json.backup.$(date +%Y%m%d_%H%M%S)"
  mkdir -p "$(dirname "$backup_file")"
  if [[ -f "$docker_daemon_config" ]]; then
    cp "$docker_daemon_config" "$backup_file"
    log_info "Docker daemon config backed up to: $backup_file"
  fi

  # NOTE(review): this REPLACES daemon.json wholesale -- any unrelated
  # pre-existing daemon settings survive only in the backup taken above.
  cat > "$docker_daemon_config" << 'EOF'
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
    "log-driver": "json-file",
    "log-opts": {
        "max-size": "10m",
        "max-file": "3"
    }
}
EOF

  # Restart Docker to apply the new daemon configuration.
  systemctl restart docker

  # Smoke-test GPU visibility from inside a container.
  # NOTE(review): the nvidia/cuda:11.0-base tag has been removed upstream;
  # confirm an available tag (e.g. 11.0.3-base-ubuntu20.04).
  if docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi; then
    log_info "NVIDIA Docker configuration verified successfully"
  else
    log_error "NVIDIA Docker test failed"
    exit 1
  fi
}
#######################################
# Prepare AMD GPU device access for containers.
# Returns: exits 1 if a container cannot see /dev/dri
#######################################
configure_amd_docker() {
  log_info "Configuring AMD GPU Docker support"

  # Open up the render/card nodes for container access. Only character
  # devices are touched: a plain `chmod 666 /dev/dri/*` would also strip
  # the execute bit from the /dev/dri/by-path DIRECTORY, making it
  # untraversable. chmod failure is non-fatal (warn) so set -e does not
  # abort mid-configuration.
  if [[ -d /dev/dri ]]; then
    find /dev/dri -maxdepth 1 -type c -exec chmod 666 {} + \
      || log_warn "Failed to adjust DRI device permissions"
    log_info "DRI device permissions configured"
  fi

  # Verify a container can actually see the DRI devices.
  docker run --rm --device=/dev/dri ubuntu:20.04 ls -la /dev/dri/ || {
    log_error "AMD GPU device access test failed"
    exit 1
  }
  log_info "AMD GPU Docker configuration completed"
}
#######################################
# Prepare Intel Quick Sync device access for containers.
#######################################
configure_intel_docker() {
  log_info "Configuring Intel GPU Docker support"

  # Quick Sync needs the render nodes readable/writable by containers.
  # Only character devices are touched so the /dev/dri/by-path directory
  # keeps its execute (traversal) bit; failure is a warning, not fatal.
  if [[ -d /dev/dri ]]; then
    find /dev/dri -maxdepth 1 -type c -exec chmod 666 {} + \
      || log_warn "Failed to adjust Intel GPU device permissions"
    log_info "Intel GPU device permissions configured"
  fi

  # Ensure the i915 driver is loaded (no-op if built-in or already loaded).
  modprobe i915 || log_warn "Failed to load i915 module"
  log_info "Intel GPU Docker configuration completed"
}
#######################################
# Generate the vendor-appropriate Jellyfin compose overlay.
# Globals: GPU_TYPE, DOCKER_COMPOSE_DIR (read)
#######################################
configure_jellyfin_gpu() {
  log_info "Configuring Jellyfin GPU acceleration"

  # Make sure the overlay directory exists before any writer runs.
  mkdir -p "${DOCKER_COMPOSE_DIR}/jellyfin"

  case "$GPU_TYPE" in
    nvidia) create_jellyfin_nvidia_config ;;
    amd)    create_jellyfin_amd_config ;;
    intel)  create_jellyfin_intel_config ;;
    *)      create_jellyfin_software_config ;;
  esac
}
#######################################
# Write the Jellyfin NVENC compose overlay and encoding preset.
# Globals: DOCKER_COMPOSE_DIR (read)
#######################################
create_jellyfin_nvidia_config() {
  log_info "Creating Jellyfin NVIDIA configuration"

  # Heredoc delimiter is quoted so ${DOMAIN} is written literally and
  # resolved later by docker compose from the environment / .env file.
  # Traefik rule backticks are written UNescaped: inside a quoted heredoc
  # a backslash survives into the YAML, and \` is an invalid escape in a
  # YAML double-quoted scalar, which breaks compose parsing.
  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    ports:
      - "8096:8096"
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

  # Pre-seed Jellyfin's encoding settings for NVENC hardware transcoding.
  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/encoding.xml" << 'EOF'
<?xml version="1.0" encoding="utf-8"?>
<EncodingOptions xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <TranscodingTempPath>/config/transcoding-temp</TranscodingTempPath>
  <FallbackFontPath>/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf</FallbackFontPath>
  <EnableFallbackFont>false</EnableFallbackFont>
  <EnableAudioVbr>false</EnableAudioVbr>
  <DownMixAudioBoost>2</DownMixAudioBoost>
  <MaxMuxingQueueSize>2048</MaxMuxingQueueSize>
  <EnableThrottling>false</EnableThrottling>
  <ThrottleDelaySeconds>180</ThrottleDelaySeconds>
  <HardwareAccelerationType>nvenc</HardwareAccelerationType>
  <EncodeMediaSourceId />
  <VaapiDevice>/dev/dri/renderD128</VaapiDevice>
  <EnableTonemapping>false</EnableTonemapping>
  <EnableVppTonemapping>false</EnableVppTonemapping>
  <TonemappingAlgorithm>hable</TonemappingAlgorithm>
  <TonemappingRange>auto</TonemappingRange>
  <TonemappingDesat>0</TonemappingDesat>
  <TonemappingThreshold>0.8</TonemappingThreshold>
  <TonemappingPeak>100</TonemappingPeak>
  <TonemappingParam>0</TonemappingParam>
  <H264Crf>23</H264Crf>
  <H265Crf>28</H265Crf>
  <DeinterlaceDoubleRate>false</DeinterlaceDoubleRate>
  <DeinterlaceMethod>yadif</DeinterlaceMethod>
  <EnableDecodingColorDepth10Hevc>true</EnableDecodingColorDepth10Hevc>
  <EnableDecodingColorDepth10Vp9>true</EnableDecodingColorDepth10Vp9>
  <EnableEnhancedNvdecDecoder>true</EnableEnhancedNvdecDecoder>
  <PreferSystemNativeHwDecoder>false</PreferSystemNativeHwDecoder>
  <EnableIntelLowPowerH264HwEncoder>false</EnableIntelLowPowerH264HwEncoder>
  <EnableIntelLowPowerHevcHwEncoder>false</EnableIntelLowPowerHevcHwEncoder>
  <EnableHardwareEncoding>true</EnableHardwareEncoding>
  <AllowHevcEncoding>true</AllowHevcEncoding>
  <EnableSubtitleExtraction>true</EnableSubtitleExtraction>
</EncodingOptions>
EOF
  log_info "Jellyfin NVIDIA configuration created"
}
#######################################
# Write the Jellyfin VA-API (AMD) compose overlay.
# Globals: DOCKER_COMPOSE_DIR (read)
#######################################
create_jellyfin_amd_config() {
  log_info "Creating Jellyfin AMD configuration"

  # Quoted delimiter: ${DOMAIN} stays literal for docker compose to
  # interpolate. Backticks in the Traefik rule are unescaped -- a \` in a
  # YAML double-quoted scalar is an invalid escape and breaks parsing.
  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "8096:8096"
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF
  log_info "Jellyfin AMD configuration created"
}
#######################################
# Write the Jellyfin Quick Sync (Intel) compose overlay.
# Globals: DOCKER_COMPOSE_DIR (read)
#######################################
create_jellyfin_intel_config() {
  log_info "Creating Jellyfin Intel configuration"

  # Quoted delimiter: ${DOMAIN} stays literal for docker compose to
  # interpolate. Backticks in the Traefik rule are unescaped -- a \` in a
  # YAML double-quoted scalar is an invalid escape and breaks parsing.
  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "8096:8096"
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF
  log_info "Jellyfin Intel configuration created"
}
#######################################
# Write the Jellyfin software-encoding compose overlay (CPU limits, no GPU).
# Globals: DOCKER_COMPOSE_DIR (read)
#######################################
create_jellyfin_software_config() {
  log_info "Creating Jellyfin software encoding configuration"

  # Quoted delimiter: ${DOMAIN} stays literal for docker compose to
  # interpolate. Backticks in the Traefik rule are unescaped -- a \` in a
  # YAML double-quoted scalar is an invalid escape and breaks parsing.
  cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    ports:
      - "8096:8096"
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G
        reservations:
          cpus: '2'
          memory: 2G
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF
  log_info "Jellyfin software encoding configuration created"
}
#######################################
# Emit the Immich compose overlay matching the detected GPU vendor.
# Globals: GPU_TYPE, DOCKER_COMPOSE_DIR (read)
#######################################
configure_immich_gpu() {
  log_info "Configuring Immich GPU acceleration"

  local overlay_dir="${DOCKER_COMPOSE_DIR}/immich"
  mkdir -p "$overlay_dir"

  if [[ "$GPU_TYPE" == "nvidia" ]]; then
    create_immich_nvidia_config
  elif [[ "$GPU_TYPE" == "amd" || "$GPU_TYPE" == "intel" ]]; then
    # AMD and Intel both go through VA-API.
    create_immich_vaapi_config
  else
    create_immich_software_config
  fi
}
#######################################
# Write the full Immich stack compose overlay with NVIDIA acceleration.
# Globals: DOCKER_COMPOSE_DIR (read)
#######################################
create_immich_nvidia_config() {
  log_info "Creating Immich NVIDIA configuration"

  # Quoted delimiter: ${UPLOAD_LOCATION} / ${DOMAIN} are left literal for
  # docker compose to interpolate. Backticks in the Traefik rule are
  # unescaped -- \` is an invalid escape in a YAML double-quoted scalar.
  cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    ports:
      - "2283:3001"
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF
  log_info "Immich NVIDIA configuration created"
}
#######################################
# Write the Immich stack compose overlay with VA-API (AMD/Intel) devices.
# Globals: DOCKER_COMPOSE_DIR (read)
#######################################
create_immich_vaapi_config() {
  log_info "Creating Immich VA-API configuration"

  # Quoted delimiter: ${UPLOAD_LOCATION} / ${DOMAIN} are left literal for
  # docker compose to interpolate. Backticks in the Traefik rule are
  # unescaped -- \` is an invalid escape in a YAML double-quoted scalar.
  cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "2283:3001"
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF
  log_info "Immich VA-API configuration created"
}
#######################################
# Write the Immich stack compose overlay for CPU-only processing.
# Globals: DOCKER_COMPOSE_DIR (read)
#######################################
create_immich_software_config() {
  log_info "Creating Immich software processing configuration"

  # Quoted delimiter: ${UPLOAD_LOCATION} / ${DOMAIN} are left literal for
  # docker compose to interpolate. Backticks in the Traefik rule are
  # unescaped -- \` is an invalid escape in a YAML double-quoted scalar.
  cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'

services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    ports:
      - "2283:3001"
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '1'
          memory: 1G
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G
        reservations:
          cpus: '2'
          memory: 2G
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 4G
        reservations:
          cpus: '1'
          memory: 2G
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF
  log_info "Immich software processing configuration created"
}
#######################################
# Back up the main compose file and write the GPU override + deploy script.
# Globals: DOCKER_COMPOSE_DIR, CONFIG_BACKUP_DIR (read)
#######################################
update_docker_compose_configs() {
  log_info "Updating main Docker Compose configurations"

  # Back up the main compose file before layering GPU overrides on it.
  local main_compose="${DOCKER_COMPOSE_DIR}/docker-compose.yml"
  local backup_file="${CONFIG_BACKUP_DIR}/docker-compose.yml.backup.$(date +%Y%m%d_%H%M%S)"
  if [[ -f "$main_compose" ]]; then
    mkdir -p "$(dirname "$backup_file")"
    cp "$main_compose" "$backup_file"
    log_info "Main Docker Compose backed up to: $backup_file"
  fi

  # GPU override file. Delimiter is quoted: nothing in the body needs
  # shell expansion, and quoting protects any future ${...} additions.
  # NOTE(review): compose `extends` rejects/ignores some keys of the
  # extended service (e.g. depends_on in older compose versions) --
  # verify against the compose version actually deployed.
  cat > "${DOCKER_COMPOSE_DIR}/docker-compose.gpu-override.yml" << 'EOF'
version: '3.8'

# GPU Override Configuration
# This file extends the main docker-compose.yml with GPU acceleration
services:
  jellyfin:
    extends:
      file: ./jellyfin/docker-compose.gpu.yml
      service: jellyfin
  immich-server:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-server
  immich-microservices:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-microservices
  immich-machine-learning:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-machine-learning
EOF

  # Convenience wrapper that deploys base + GPU override together.
  cat > "${DOCKER_COMPOSE_DIR}/deploy-with-gpu.sh" << 'EOF'
#!/bin/bash
# Deployment script with GPU acceleration support
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Source environment variables
if [[ -f .env ]]; then
  source .env
fi

# Deploy with GPU override
echo "Deploying services with GPU acceleration..."
docker-compose -f docker-compose.yml -f docker-compose.gpu-override.yml up -d

# Verify deployment
echo "Verifying GPU-accelerated services..."
docker-compose ps

echo "GPU-enabled deployment completed successfully!"
EOF
  chmod +x "${DOCKER_COMPOSE_DIR}/deploy-with-gpu.sh"
  log_info "Docker Compose GPU configurations updated"
}
#######################################
# Run the acceleration smoke test matching the detected GPU vendor.
# Globals: GPU_TYPE (read)
#######################################
test_gpu_acceleration() {
  log_info "Testing GPU acceleration functionality"
  if [[ "$GPU_TYPE" == "nvidia" ]]; then
    test_nvidia_acceleration
  elif [[ "$GPU_TYPE" == "amd" || "$GPU_TYPE" == "intel" ]]; then
    # Both vendors are exercised through VA-API.
    test_vaapi_acceleration
  else
    log_info "Software encoding - no GPU tests needed"
  fi
}
#######################################
# Smoke-test NVIDIA acceleration: runtime visibility, then an NVENC encode.
# Globals:  CONFIG_BACKUP_DIR (read)
# Returns:  1 if the runtime test fails; FFMPEG failure only warns
#######################################
test_nvidia_acceleration() {
  log_info "Testing NVIDIA GPU acceleration"

  # Can a container see the GPU at all?
  # NOTE(review): the nvidia/cuda:11.0-base tag has been removed from
  # Docker Hub; confirm a tag that still exists (e.g. 11.0.3-base-ubuntu20.04).
  if docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi; then
    log_info "NVIDIA Docker runtime test: PASSED"
  else
    log_error "NVIDIA Docker runtime test: FAILED"
    return 1
  fi

  # Encode a synthetic 1080p clip with NVENC, logging output for diagnosis.
  local test_output="${CONFIG_BACKUP_DIR}/nvidia_ffmpeg_test.log"
  # Ensure the log directory exists: the redirect below fails (and aborts
  # under set -e) if CONFIG_BACKUP_DIR was never created on this path.
  mkdir -p "$(dirname "$test_output")"
  if docker run --rm --gpus all \
      -v /tmp:/tmp \
      jrottenberg/ffmpeg:4.4-nvidia \
      -f lavfi -i testsrc2=duration=10:size=1920x1080:rate=30 \
      -c:v h264_nvenc -preset fast -f null - &> "$test_output"; then
    log_info "NVIDIA FFMPEG acceleration test: PASSED"
  else
    log_warn "NVIDIA FFMPEG acceleration test: FAILED - check $test_output"
  fi
}
#######################################
# Smoke-test VA-API acceleration: DRI visibility, then a VAAPI encode.
# Globals:  CONFIG_BACKUP_DIR (read)
# Returns:  1 if the device-access test fails; FFMPEG failure only warns
#######################################
test_vaapi_acceleration() {
  log_info "Testing VA-API acceleration"

  # Can a container see the DRI devices?
  if docker run --rm --device=/dev/dri ubuntu:20.04 ls -la /dev/dri/; then
    log_info "DRI device access test: PASSED"
  else
    log_error "DRI device access test: FAILED"
    return 1
  fi

  # Encode a synthetic 1080p clip through VA-API, logging output.
  local test_output="${CONFIG_BACKUP_DIR}/vaapi_test.log"
  # Ensure the log directory exists: the redirect below fails (and aborts
  # under set -e) if CONFIG_BACKUP_DIR was never created on this path.
  mkdir -p "$(dirname "$test_output")"
  if docker run --rm \
      --device=/dev/dri \
      jrottenberg/ffmpeg:4.4-vaapi \
      -vaapi_device /dev/dri/renderD128 \
      -f lavfi -i testsrc2=duration=10:size=1920x1080:rate=30 \
      -vf 'format=nv12,hwupload' \
      -c:v h264_vaapi -f null - &> "$test_output"; then
    log_info "VA-API acceleration test: PASSED"
  else
    log_warn "VA-API acceleration test: FAILED - check $test_output"
  fi
}
#######################################
# Install a small Prometheus GPU exporter and wire it into systemd/Prometheus.
# Globals: SCRIPT_DIR (read)
#######################################
setup_gpu_monitoring() {
  log_info "Setting up GPU monitoring"

  # Drop the exporter next to this script. Quoted delimiter: the Python
  # must be written verbatim.
  cat > "${SCRIPT_DIR}/gpu_monitor.py" << 'EOF'
#!/usr/bin/env python3
"""
GPU Monitoring Script
Provides Prometheus metrics for GPU utilization
"""
import time
import subprocess
from http.server import HTTPServer, BaseHTTPRequestHandler
import threading
import logging


class GPUMonitor:
    def __init__(self):
        self.gpu_type = self.detect_gpu_type()
        self.metrics = []  # list of per-GPU dicts, refreshed by update_metrics()

    def detect_gpu_type(self):
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # are not swallowed during startup.
        try:
            subprocess.run(['nvidia-smi'], capture_output=True, check=True)
            return 'nvidia'
        except (OSError, subprocess.CalledProcessError):
            pass
        # Check for AMD / Intel GPUs via lspci; fall back to software when
        # lspci itself is unavailable instead of crashing at import time.
        try:
            result = subprocess.run(['lspci'], capture_output=True, text=True)
        except OSError:
            return 'software'
        if 'AMD' in result.stdout:
            return 'amd'
        if 'Intel' in result.stdout and 'VGA' in result.stdout:
            return 'intel'
        return 'software'

    def get_nvidia_metrics(self):
        try:
            result = subprocess.run([
                'nvidia-smi',
                '--query-gpu=utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)
            gpu_metrics = []
            for i, line in enumerate(result.stdout.strip().split('\n')):
                values = [x.strip() for x in line.split(',')]
                gpu_metrics.append({
                    'gpu_id': i,
                    'utilization_gpu': float(values[0]),
                    'utilization_memory': float(values[1]),
                    'memory_used': float(values[2]),
                    'memory_total': float(values[3]),
                    'temperature': float(values[4])
                })
            return gpu_metrics
        except Exception as e:
            logging.error(f"Error getting NVIDIA metrics: {e}")
            return []

    def get_system_metrics(self):
        # Zeroed placeholder so /metrics stays well-formed on hosts
        # without a supported GPU.
        return [{
            'gpu_id': 0,
            'utilization_gpu': 0,
            'utilization_memory': 0,
            'memory_used': 0,
            'memory_total': 0,
            'temperature': 0
        }]

    def collect_metrics(self):
        if self.gpu_type == 'nvidia':
            return self.get_nvidia_metrics()
        return self.get_system_metrics()

    def update_metrics(self):
        # Background refresh loop; runs in a daemon thread.
        while True:
            try:
                self.metrics = self.collect_metrics()
            except Exception as e:
                logging.error(f"Error updating metrics: {e}")
            time.sleep(10)  # Update every 10 seconds


class MetricsHandler(BaseHTTPRequestHandler):
    def __init__(self, gpu_monitor, *args, **kwargs):
        self.gpu_monitor = gpu_monitor
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path == '/metrics':
            self.send_response(200)
            self.send_header('Content-type', 'text/plain')
            self.end_headers()
            self.wfile.write(self.generate_prometheus_metrics().encode())
        else:
            self.send_response(404)
            self.end_headers()

    def generate_prometheus_metrics(self):
        metrics = []
        for gpu in self.gpu_monitor.metrics:
            gpu_id = gpu['gpu_id']
            metrics.extend([
                '# HELP gpu_utilization_percent GPU utilization percentage',
                '# TYPE gpu_utilization_percent gauge',
                f'gpu_utilization_percent{{gpu_id="{gpu_id}"}} {gpu["utilization_gpu"]}',
                '# HELP gpu_memory_utilization_percent GPU memory utilization percentage',
                '# TYPE gpu_memory_utilization_percent gauge',
                f'gpu_memory_utilization_percent{{gpu_id="{gpu_id}"}} {gpu["utilization_memory"]}',
                '# HELP gpu_memory_used_mb GPU memory used in MB',
                '# TYPE gpu_memory_used_mb gauge',
                f'gpu_memory_used_mb{{gpu_id="{gpu_id}"}} {gpu["memory_used"]}',
                '# HELP gpu_memory_total_mb GPU total memory in MB',
                '# TYPE gpu_memory_total_mb gauge',
                f'gpu_memory_total_mb{{gpu_id="{gpu_id}"}} {gpu["memory_total"]}',
                '# HELP gpu_temperature_celsius GPU temperature in Celsius',
                '# TYPE gpu_temperature_celsius gauge',
                f'gpu_temperature_celsius{{gpu_id="{gpu_id}"}} {gpu["temperature"]}',
            ])
        return '\n'.join(metrics)


def main():
    logging.basicConfig(level=logging.INFO)
    gpu_monitor = GPUMonitor()
    # Collect in the background so HTTP responses never block on nvidia-smi.
    metrics_thread = threading.Thread(target=gpu_monitor.update_metrics, daemon=True)
    metrics_thread.start()

    # Bind the monitor into the handler class via a closure.
    def handler(*args, **kwargs):
        return MetricsHandler(gpu_monitor, *args, **kwargs)

    server = HTTPServer(('0.0.0.0', 9101), handler)
    print("GPU metrics server started on port 9101")
    server.serve_forever()


if __name__ == '__main__':
    main()
EOF
  chmod +x "${SCRIPT_DIR}/gpu_monitor.py"

  # systemd unit; delimiter unquoted on purpose so ${SCRIPT_DIR} expands now.
  cat > "/etc/systemd/system/gpu-monitor.service" << EOF
[Unit]
Description=GPU Monitoring Service
After=network.target

[Service]
Type=simple
User=root
WorkingDirectory=${SCRIPT_DIR}
ExecStart=/usr/bin/python3 ${SCRIPT_DIR}/gpu_monitor.py
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

  # Enable and start the exporter.
  systemctl daemon-reload
  systemctl enable gpu-monitor.service
  systemctl start gpu-monitor.service

  # Register the exporter with Prometheus if a config is present.
  local prom_config="${SCRIPT_DIR}/../monitoring/prometheus/prometheus.yml"
  if [[ -f "$prom_config" ]]; then
    log_info "Adding GPU monitoring to Prometheus configuration"
    # Only append once.
    if ! grep -q "gpu-monitor" "$prom_config"; then
      # Entries are indented two spaces so they nest under an existing
      # scrape_configs: key. NOTE(review): assumes the file ends inside
      # scrape_configs with standard 2-space indentation -- confirm.
      cat >> "$prom_config" << 'EOF'
  - job_name: 'gpu-monitor'
    static_configs:
      - targets: ['localhost:9101']
    scrape_interval: 15s
    metrics_path: '/metrics'
EOF
    fi
  fi
  log_info "GPU monitoring setup completed"
}
#######################################
# Exit hook (registered via register_cleanup): stop any containers left
# behind by the acceleration smoke tests.
#######################################
cleanup_on_exit() {
  log_info "Cleaning up GPU optimization resources"

  local image
  for image in \
    "nvidia/cuda:11.0-base" \
    "jrottenberg/ffmpeg:4.4-nvidia" \
    "jrottenberg/ffmpeg:4.4-vaapi"; do
    # -r: skip docker stop entirely when no matching container is running.
    docker ps -q --filter "ancestor=${image}" | xargs -r docker stop
  done

  log_info "GPU optimization cleanup completed"
}
# Execute main function, forwarding any CLI arguments
main "$@"