1250 lines
35 KiB
Bash
Executable File
1250 lines
35 KiB
Bash
Executable File
#!/bin/bash
set -euo pipefail

# GPU Passthrough Optimizer for Media Services
# Configures GPU acceleration for Jellyfin and Immich
# Part of the Migration Issues Resolution Framework
#
# Must be run as root; see validate_prerequisites for the full checks.

# Source the error handling library.
# NOTE(review): init_logging, log_info/log_warn/log_error and
# register_cleanup are all provided by this sourced file — confirm its API
# before changing any call sites below.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/error_handling.sh"

# Configuration
# LOG_FILE carries a per-run timestamp so successive runs never clobber
# each other's logs.
readonly LOG_FILE="${SCRIPT_DIR}/../logs/gpu_optimization_$(date +%Y%m%d_%H%M%S).log"
readonly CONFIG_BACKUP_DIR="${SCRIPT_DIR}/../backups/gpu_configs"
# Compose project root: two levels above this script's directory.
readonly DOCKER_COMPOSE_DIR="${SCRIPT_DIR}/../../"

# Initialize logging
init_logging "$LOG_FILE"
|
|
|
|
main() {
    # Orchestrates the whole optimization pass in a fixed order:
    # validate -> detect -> configure Docker -> configure services ->
    # wire compose files -> smoke-test -> set up monitoring.
    log_info "Starting GPU passthrough optimization for media services"

    # Register cleanup function (runs on exit via the error-handling lib)
    register_cleanup cleanup_on_exit

    # Validate prerequisites (root, docker, required tools)
    validate_prerequisites

    # Detect GPU hardware; exports GPU_TYPE / GPU_VENDOR for later steps
    detect_gpu_hardware

    # Configure Docker for GPU access (vendor-specific runtime / devices)
    configure_docker_gpu

    # Configure Jellyfin GPU acceleration (writes compose + encoding files)
    configure_jellyfin_gpu

    # Configure Immich GPU acceleration (writes compose files)
    configure_immich_gpu

    # Update Docker Compose configurations (override file + deploy script)
    update_docker_compose_configs

    # Test GPU acceleration with throwaway containers
    test_gpu_acceleration

    # Configure GPU monitoring (Prometheus exporter + systemd unit)
    setup_gpu_monitoring

    log_info "GPU passthrough optimization completed successfully"
}
|
|
|
|
validate_prerequisites() {
    # Verify tooling, privileges and daemon state before any configuration
    # is attempted. Exits non-zero on a hard failure.
    log_info "Validating GPU optimization prerequisites"

    # FIX: nvidia-smi is intentionally NOT in this list. It only exists on
    # NVIDIA systems, while this script explicitly supports AMD, Intel and
    # software-only hosts — requiring it made the script abort on every
    # non-NVIDIA machine before detection even ran.
    local required_commands=(
        "docker" "lspci" "modinfo"
    )

    local cmd
    for cmd in "${required_commands[@]}"; do
        if ! command -v "$cmd" &>/dev/null; then
            log_error "Required command not found: $cmd"
            exit 1
        fi
    done

    # nvidia-smi is optional: its absence simply rules out NVIDIA paths.
    if ! command -v nvidia-smi &>/dev/null; then
        log_warn "nvidia-smi not found - NVIDIA acceleration paths will be unavailable"
    fi

    # Root is needed for /etc/docker, systemd units and device permissions.
    if [[ $EUID -ne 0 ]]; then
        log_error "This script must be run as root or with sudo"
        exit 1
    fi

    # The Docker daemon must be reachable for the container smoke tests.
    if ! docker info &>/dev/null; then
        log_error "Docker is not running or accessible"
        exit 1
    fi

    log_info "Prerequisites validation completed"
}
|
|
|
|
detect_gpu_hardware() {
|
|
log_info "Detecting GPU hardware configuration"
|
|
|
|
# Create GPU detection report
|
|
local gpu_report="${CONFIG_BACKUP_DIR}/gpu_detection_$(date +%Y%m%d_%H%M%S).txt"
|
|
mkdir -p "$(dirname "$gpu_report")"
|
|
|
|
{
|
|
echo "GPU Hardware Detection Report"
|
|
echo "Generated: $(date)"
|
|
echo "==============================="
|
|
echo
|
|
|
|
echo "PCI GPU Devices:"
|
|
lspci | grep -i vga || echo "No VGA devices found"
|
|
lspci | grep -i nvidia || echo "No NVIDIA devices found"
|
|
lspci | grep -i amd || echo "No AMD devices found"
|
|
lspci | grep -i intel || echo "No Intel GPU devices found"
|
|
echo
|
|
|
|
if command -v nvidia-smi &>/dev/null; then
|
|
echo "NVIDIA GPU Status:"
|
|
nvidia-smi || echo "NVIDIA SMI not available"
|
|
echo
|
|
fi
|
|
|
|
echo "GPU-related kernel modules:"
|
|
lsmod | grep -E "(nvidia|nouveau|amdgpu|radeon|i915)" || echo "No GPU modules loaded"
|
|
echo
|
|
|
|
if [[ -d /dev/dri ]]; then
|
|
echo "DRI devices:"
|
|
ls -la /dev/dri/
|
|
echo
|
|
fi
|
|
|
|
if [[ -e /dev/nvidia0 ]]; then
|
|
echo "NVIDIA devices:"
|
|
ls -la /dev/nvidia*
|
|
echo
|
|
fi
|
|
|
|
} > "$gpu_report"
|
|
|
|
log_info "GPU detection report saved to: $gpu_report"
|
|
|
|
# Determine GPU type and capabilities
|
|
if nvidia-smi &>/dev/null; then
|
|
GPU_TYPE="nvidia"
|
|
GPU_VENDOR="NVIDIA"
|
|
log_info "NVIDIA GPU detected"
|
|
elif lspci | grep -qi amd; then
|
|
GPU_TYPE="amd"
|
|
GPU_VENDOR="AMD"
|
|
log_info "AMD GPU detected"
|
|
elif lspci | grep -qi intel; then
|
|
GPU_TYPE="intel"
|
|
GPU_VENDOR="Intel"
|
|
log_info "Intel GPU detected"
|
|
else
|
|
log_warn "No supported GPU detected - using software encoding"
|
|
GPU_TYPE="software"
|
|
GPU_VENDOR="Software"
|
|
fi
|
|
|
|
export GPU_TYPE GPU_VENDOR
|
|
}
|
|
|
|
configure_docker_gpu() {
    # Route to the vendor-specific Docker setup based on the GPU detected
    # earlier (GPU_TYPE is exported by detect_gpu_hardware).
    log_info "Configuring Docker for GPU access"

    if [[ "$GPU_TYPE" == "nvidia" ]]; then
        configure_nvidia_docker
    elif [[ "$GPU_TYPE" == "amd" ]]; then
        configure_amd_docker
    elif [[ "$GPU_TYPE" == "intel" ]]; then
        configure_intel_docker
    else
        log_warn "No GPU-specific Docker configuration needed"
    fi
}
|
|
|
|
configure_nvidia_docker() {
    # Install the NVIDIA container runtime (if missing), point the Docker
    # daemon at it, and smoke-test GPU visibility from a container.
    log_info "Configuring NVIDIA Docker support"

    # Check if nvidia-docker2 is installed (Debian/Ubuntu only — dpkg)
    if ! dpkg -l | grep -q nvidia-docker2; then
        log_info "Installing NVIDIA Docker support"

        # FIX: register the NVIDIA repository with a dedicated keyring.
        # apt-key is deprecated and removed on current Debian/Ubuntu;
        # signed-by keyrings are the supported mechanism.
        local distribution
        distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
        curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \
            gpg --dearmor -o /usr/share/keyrings/nvidia-docker.gpg
        curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | \
            sed 's#^deb #deb [signed-by=/usr/share/keyrings/nvidia-docker.gpg] #' | \
            tee /etc/apt/sources.list.d/nvidia-docker.list

        apt-get update
        apt-get install -y nvidia-docker2

        # Restart Docker so the new runtime is registered
        systemctl restart docker

        log_info "NVIDIA Docker support installed"
    fi

    # Configure Docker daemon for NVIDIA
    local docker_daemon_config="/etc/docker/daemon.json"
    local backup_file="${CONFIG_BACKUP_DIR}/daemon.json.backup.$(date +%Y%m%d_%H%M%S)"

    mkdir -p "$(dirname "$backup_file")"

    if [[ -f "$docker_daemon_config" ]]; then
        cp "$docker_daemon_config" "$backup_file"
        log_info "Docker daemon config backed up to: $backup_file"
        # The write below REPLACES daemon.json wholesale; any pre-existing
        # custom settings survive only in the backup taken above.
        log_warn "Existing daemon.json will be overwritten (backup kept)"
    fi

    # Create or update daemon.json (quoted EOF: body is literal JSON)
    cat > "$docker_daemon_config" << 'EOF'
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
    "log-driver": "json-file",
    "log-opts": {
        "max-size": "10m",
        "max-file": "3"
    }
}
EOF

    # Restart Docker to apply changes
    systemctl restart docker

    # Verify NVIDIA Docker works.
    # FIX: the historic nvidia/cuda:11.0-base tag was deleted from Docker
    # Hub; use a tag that still exists.
    if docker run --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu22.04 nvidia-smi; then
        log_info "NVIDIA Docker configuration verified successfully"
    else
        log_error "NVIDIA Docker test failed"
        exit 1
    fi
}
|
|
|
|
configure_amd_docker() {
    # Make /dev/dri render nodes reachable from containers and smoke-test
    # device passthrough with a throwaway container.
    log_info "Configuring AMD GPU Docker support"

    # Ensure proper device access
    if [[ -d /dev/dri ]]; then
        # FIX: guard the glob — with an empty /dev/dri the unexpanded
        # pattern would make chmod fail and abort the script under set -e.
        # NOTE(review): 666 leaves the nodes world-writable; group-based
        # access (render/video groups) would be tighter — kept for
        # backward compatibility.
        if compgen -G "/dev/dri/*" > /dev/null; then
            chmod 666 /dev/dri/*
            log_info "DRI device permissions configured"
        else
            log_warn "/dev/dri exists but contains no device nodes"
        fi
    fi

    # Test AMD GPU access from a container
    docker run --rm --device=/dev/dri ubuntu:20.04 ls -la /dev/dri/ || {
        log_error "AMD GPU device access test failed"
        exit 1
    }

    log_info "AMD GPU Docker configuration completed"
}
|
|
|
|
configure_intel_docker() {
    # Prepare Intel Quick Sync passthrough: device permissions plus the
    # i915 kernel module.
    log_info "Configuring Intel GPU Docker support"

    # Ensure proper device access for Intel Quick Sync
    if [[ -d /dev/dri ]]; then
        # FIX: guard the glob — with an empty /dev/dri the unexpanded
        # pattern would make chmod fail and abort the script under set -e.
        if compgen -G "/dev/dri/*" > /dev/null; then
            chmod 666 /dev/dri/*
            log_info "Intel GPU device permissions configured"
        else
            log_warn "/dev/dri exists but contains no device nodes"
        fi
    fi

    # Load Intel GPU drivers if needed (best-effort: may already be built
    # into the kernel)
    modprobe i915 || log_warn "Failed to load i915 module"

    log_info "Intel GPU Docker configuration completed"
}
|
|
|
|
configure_jellyfin_gpu() {
    # Pick and write the Jellyfin compose variant that matches the
    # detected GPU vendor.
    log_info "Configuring Jellyfin GPU acceleration"

    local target_dir="${DOCKER_COMPOSE_DIR}/jellyfin"
    mkdir -p "$target_dir"

    if [[ "$GPU_TYPE" == "nvidia" ]]; then
        create_jellyfin_nvidia_config
    elif [[ "$GPU_TYPE" == "amd" ]]; then
        create_jellyfin_amd_config
    elif [[ "$GPU_TYPE" == "intel" ]]; then
        create_jellyfin_intel_config
    else
        create_jellyfin_software_config
    fi
}
|
|
|
|
create_jellyfin_nvidia_config() {
    # Write the NVENC-enabled Jellyfin compose file plus a matching
    # encoding.xml. The heredoc delimiter is quoted so ${DOMAIN} reaches
    # the YAML unexpanded for docker-compose env substitution.
    log_info "Creating Jellyfin NVIDIA configuration"

    # FIX: inside a quoted heredoc, \` is emitted literally, which put an
    # invalid "\`" escape into the YAML double-quoted Traefik labels.
    # Plain backticks are what Traefik's Host() rule expects.
    cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'
services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    ports:
      - "8096:8096"
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

    # Create Jellyfin encoding configuration (NVENC hardware encoding)
    cat > "${DOCKER_COMPOSE_DIR}/jellyfin/encoding.xml" << 'EOF'
<?xml version="1.0" encoding="utf-8"?>
<EncodingOptions xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <TranscodingTempPath>/config/transcoding-temp</TranscodingTempPath>
  <FallbackFontPath>/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf</FallbackFontPath>
  <EnableFallbackFont>false</EnableFallbackFont>
  <EnableAudioVbr>false</EnableAudioVbr>
  <DownMixAudioBoost>2</DownMixAudioBoost>
  <MaxMuxingQueueSize>2048</MaxMuxingQueueSize>
  <EnableThrottling>false</EnableThrottling>
  <ThrottleDelaySeconds>180</ThrottleDelaySeconds>
  <HardwareAccelerationType>nvenc</HardwareAccelerationType>
  <EncodeMediaSourceId />
  <VaapiDevice>/dev/dri/renderD128</VaapiDevice>
  <EnableTonemapping>false</EnableTonemapping>
  <EnableVppTonemapping>false</EnableVppTonemapping>
  <TonemappingAlgorithm>hable</TonemappingAlgorithm>
  <TonemappingRange>auto</TonemappingRange>
  <TonemappingDesat>0</TonemappingDesat>
  <TonemappingThreshold>0.8</TonemappingThreshold>
  <TonemappingPeak>100</TonemappingPeak>
  <TonemappingParam>0</TonemappingParam>
  <H264Crf>23</H264Crf>
  <H265Crf>28</H265Crf>
  <DeinterlaceDoubleRate>false</DeinterlaceDoubleRate>
  <DeinterlaceMethod>yadif</DeinterlaceMethod>
  <EnableDecodingColorDepth10Hevc>true</EnableDecodingColorDepth10Hevc>
  <EnableDecodingColorDepth10Vp9>true</EnableDecodingColorDepth10Vp9>
  <EnableEnhancedNvdecDecoder>true</EnableEnhancedNvdecDecoder>
  <PreferSystemNativeHwDecoder>false</PreferSystemNativeHwDecoder>
  <EnableIntelLowPowerH264HwEncoder>false</EnableIntelLowPowerH264HwEncoder>
  <EnableIntelLowPowerHevcHwEncoder>false</EnableIntelLowPowerHevcHwEncoder>
  <EnableHardwareEncoding>true</EnableHardwareEncoding>
  <AllowHevcEncoding>true</AllowHevcEncoding>
  <EnableSubtitleExtraction>true</EnableSubtitleExtraction>
</EncodingOptions>
EOF

    log_info "Jellyfin NVIDIA configuration created"
}
|
|
|
|
create_jellyfin_amd_config() {
    # Write the AMD (VA-API via /dev/dri) Jellyfin compose file. Quoted
    # heredoc keeps ${DOMAIN} literal for docker-compose env substitution.
    log_info "Creating Jellyfin AMD configuration"

    # FIX: removed the \` escapes — in a quoted heredoc they were emitted
    # literally, producing invalid YAML escapes in the Traefik labels.
    cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'
services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "8096:8096"
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

    log_info "Jellyfin AMD configuration created"
}
|
|
|
|
create_jellyfin_intel_config() {
    # Write the Intel (Quick Sync via /dev/dri) Jellyfin compose file.
    # NOTE(review): currently identical to the AMD variant — both rely on
    # VA-API device passthrough; kept separate for per-vendor tuning.
    log_info "Creating Jellyfin Intel configuration"

    # FIX: removed the \` escapes — in a quoted heredoc they were emitted
    # literally, producing invalid YAML escapes in the Traefik labels.
    cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'
services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "8096:8096"
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

    log_info "Jellyfin Intel configuration created"
}
|
|
|
|
create_jellyfin_software_config() {
    # Write the CPU-only Jellyfin compose file. Without a GPU, CPU/memory
    # limits are set so transcoding cannot starve the host.
    log_info "Creating Jellyfin software encoding configuration"

    # FIX: removed the \` escapes — in a quoted heredoc they were emitted
    # literally, producing invalid YAML escapes in the Traefik labels.
    cat > "${DOCKER_COMPOSE_DIR}/jellyfin/docker-compose.gpu.yml" << 'EOF'
version: '3.8'
services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    restart: unless-stopped
    environment:
      - JELLYFIN_PublishedServerUrl=https://jellyfin.${DOMAIN}
    volumes:
      - ./config:/config
      - ./cache:/cache
      - /media:/media:ro
      - /dev/shm:/dev/shm
    ports:
      - "8096:8096"
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G
        reservations:
          cpus: '2'
          memory: 2G
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.jellyfin.rule=Host(`jellyfin.${DOMAIN}`)"
      - "traefik.http.routers.jellyfin.tls=true"
      - "traefik.http.routers.jellyfin.tls.certresolver=letsencrypt"
      - "traefik.http.services.jellyfin.loadbalancer.server.port=8096"

networks:
  media_network:
    external: true
EOF

    log_info "Jellyfin software encoding configuration created"
}
|
|
|
|
configure_immich_gpu() {
    # Pick and write the Immich compose variant matching the detected GPU.
    # AMD and Intel share the VA-API path via /dev/dri.
    log_info "Configuring Immich GPU acceleration"

    local target_dir="${DOCKER_COMPOSE_DIR}/immich"
    mkdir -p "$target_dir"

    if [[ "$GPU_TYPE" == "nvidia" ]]; then
        create_immich_nvidia_config
    elif [[ "$GPU_TYPE" == "amd" || "$GPU_TYPE" == "intel" ]]; then
        create_immich_vaapi_config
    else
        create_immich_software_config
    fi
}
|
|
|
|
create_immich_nvidia_config() {
    # Write the full Immich stack (server, microservices, ML, redis,
    # postgres) with the NVIDIA runtime on every GPU-capable service.
    # Quoted heredoc keeps ${DOMAIN}/${UPLOAD_LOCATION} literal for
    # docker-compose env substitution.
    log_info "Creating Immich NVIDIA configuration"

    # FIX: removed the \` escapes — in a quoted heredoc they were emitted
    # literally, producing invalid YAML escapes in the Traefik labels.
    cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'
services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    ports:
      - "2283:3001"
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,video,utility
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF

    log_info "Immich NVIDIA configuration created"
}
|
|
|
|
create_immich_vaapi_config() {
    # Write the Immich stack with VA-API passthrough (/dev/dri) for
    # AMD/Intel GPUs. The ML container stays CPU-only here: only
    # server/microservices get the render device.
    log_info "Creating Immich VA-API configuration"

    # FIX: removed the \` escapes — in a quoted heredoc they were emitted
    # literally, producing invalid YAML escapes in the Traefik labels.
    cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'
services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    ports:
      - "2283:3001"
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    devices:
      - /dev/dri:/dev/dri
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF

    log_info "Immich VA-API configuration created"
}
|
|
|
|
create_immich_software_config() {
    # Write the CPU-only Immich stack with per-service CPU/memory limits
    # so media processing cannot starve the host.
    log_info "Creating Immich software processing configuration"

    # FIX: removed the \` escapes — in a quoted heredoc they were emitted
    # literally, producing invalid YAML escapes in the Traefik labels.
    cat > "${DOCKER_COMPOSE_DIR}/immich/docker-compose.gpu.yml" << 'EOF'
version: '3.8'
services:
  immich-server:
    container_name: immich_server
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    ports:
      - "2283:3001"
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '1'
          memory: 1G
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.immich.rule=Host(`photos.${DOMAIN}`)"
      - "traefik.http.routers.immich.tls=true"
      - "traefik.http.routers.immich.tls.certresolver=letsencrypt"
      - "traefik.http.services.immich.loadbalancer.server.port=3001"

  immich-microservices:
    container_name: immich_microservices
    image: ghcr.io/immich-app/immich-server:release
    restart: unless-stopped
    environment:
      - DB_HOSTNAME=immich_postgres
      - DB_USERNAME=postgres
      - DB_PASSWORD_FILE=/run/secrets/immich_db_password
      - DB_DATABASE_NAME=immich
      - REDIS_HOSTNAME=immich_redis
    volumes:
      - ${UPLOAD_LOCATION}:/usr/src/app/upload
      - /etc/localtime:/etc/localtime:ro
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G
        reservations:
          cpus: '2'
          memory: 2G
    command: ['start.sh', 'microservices']
    depends_on:
      - redis
      - database
    secrets:
      - immich_db_password
    networks:
      - media_network

  immich-machine-learning:
    container_name: immich_machine_learning
    image: ghcr.io/immich-app/immich-machine-learning:release
    restart: unless-stopped
    volumes:
      - model-cache:/cache
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 4G
        reservations:
          cpus: '1'
          memory: 2G
    networks:
      - media_network

  redis:
    container_name: immich_redis
    image: redis:6.2-alpine
    restart: unless-stopped
    networks:
      - media_network

  database:
    container_name: immich_postgres
    image: tensorchord/pgvecto-rs:pg14-v0.2.0
    restart: unless-stopped
    environment:
      - POSTGRES_PASSWORD_FILE=/run/secrets/immich_db_password
      - POSTGRES_USER=postgres
      - POSTGRES_DB=immich
    volumes:
      - pgdata:/var/lib/postgresql/data
    secrets:
      - immich_db_password
    networks:
      - media_network

volumes:
  pgdata:
  model-cache:

secrets:
  immich_db_password:
    external: true

networks:
  media_network:
    external: true
EOF

    log_info "Immich software processing configuration created"
}
|
|
|
|
update_docker_compose_configs() {
    # Wire the per-service GPU compose files into the main deployment:
    # back up the existing docker-compose.yml, generate an override file
    # that `extends` the GPU variants, and emit a helper deploy script.
    log_info "Updating main Docker Compose configurations"

    # Update main docker-compose.yml to include GPU configurations
    local main_compose="${DOCKER_COMPOSE_DIR}/docker-compose.yml"
    local backup_file="${CONFIG_BACKUP_DIR}/docker-compose.yml.backup.$(date +%Y%m%d_%H%M%S)"

    if [[ -f "$main_compose" ]]; then
        mkdir -p "$(dirname "$backup_file")"
        cp "$main_compose" "$backup_file"
        log_info "Main Docker Compose backed up to: $backup_file"
    fi

    # Create GPU-enabled Docker Compose override.
    # (Unquoted EOF is harmless here: the body contains no $ expansions.)
    cat > "${DOCKER_COMPOSE_DIR}/docker-compose.gpu-override.yml" << EOF
version: '3.8'

# GPU Override Configuration
# This file extends the main docker-compose.yml with GPU acceleration

services:
  jellyfin:
    extends:
      file: ./jellyfin/docker-compose.gpu.yml
      service: jellyfin

  immich-server:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-server

  immich-microservices:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-microservices

  immich-machine-learning:
    extends:
      file: ./immich/docker-compose.gpu.yml
      service: immich-machine-learning
EOF

    # Create deployment script with GPU support.
    # (Quoted 'EOF': ${BASH_SOURCE[0]} and friends must reach the
    # generated script unexpanded.)
    cat > "${DOCKER_COMPOSE_DIR}/deploy-with-gpu.sh" << 'EOF'
#!/bin/bash
# Deployment script with GPU acceleration support

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Source environment variables
if [[ -f .env ]]; then
    source .env
fi

# Deploy with GPU override
echo "Deploying services with GPU acceleration..."
docker-compose -f docker-compose.yml -f docker-compose.gpu-override.yml up -d

# Verify deployment
echo "Verifying GPU-accelerated services..."
docker-compose ps

echo "GPU-enabled deployment completed successfully!"
EOF

    chmod +x "${DOCKER_COMPOSE_DIR}/deploy-with-gpu.sh"

    log_info "Docker Compose GPU configurations updated"
}
|
|
|
|
test_gpu_acceleration() {
    # Run the smoke test matching the detected GPU type. AMD and Intel
    # share the VA-API test; pure software setups need no GPU test.
    log_info "Testing GPU acceleration functionality"

    if [[ "$GPU_TYPE" == "nvidia" ]]; then
        test_nvidia_acceleration
    elif [[ "$GPU_TYPE" == "amd" || "$GPU_TYPE" == "intel" ]]; then
        test_vaapi_acceleration
    else
        log_info "Software encoding - no GPU tests needed"
    fi
}
|
|
|
|
test_nvidia_acceleration() {
    # Verify the NVIDIA runtime end-to-end: container GPU visibility,
    # then an NVENC encode of a synthetic test clip.
    # Returns 1 if the runtime itself is broken; the ffmpeg check is
    # best-effort (warns only).
    log_info "Testing NVIDIA GPU acceleration"

    # Test NVIDIA Docker runtime.
    # FIX: the historic nvidia/cuda:11.0-base tag was deleted from Docker
    # Hub; use a tag that still exists.
    if docker run --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu22.04 nvidia-smi; then
        log_info "NVIDIA Docker runtime test: PASSED"
    else
        log_error "NVIDIA Docker runtime test: FAILED"
        return 1
    fi

    # Test FFMPEG with NVIDIA acceleration
    local test_output="${CONFIG_BACKUP_DIR}/nvidia_ffmpeg_test.log"
    # FIX: ensure the log directory exists — the redirection below would
    # otherwise fail and abort the whole script under `set -e` when this
    # runs before any step has created CONFIG_BACKUP_DIR.
    mkdir -p "$(dirname "$test_output")"

    if docker run --rm --gpus all \
        -v /tmp:/tmp \
        jrottenberg/ffmpeg:4.4-nvidia \
        -f lavfi -i testsrc2=duration=10:size=1920x1080:rate=30 \
        -c:v h264_nvenc -preset fast -f null - &> "$test_output"; then
        log_info "NVIDIA FFMPEG acceleration test: PASSED"
    else
        log_warn "NVIDIA FFMPEG acceleration test: FAILED - check $test_output"
    fi
}
|
|
|
|
test_vaapi_acceleration() {
    # Verify VA-API passthrough: render-node visibility in a container,
    # then a hardware H.264 encode of a synthetic clip.
    # Returns 1 if device passthrough fails; the encode check is
    # best-effort (warns only).
    log_info "Testing VA-API acceleration"

    # Test DRI device access
    if docker run --rm --device=/dev/dri ubuntu:20.04 ls -la /dev/dri/; then
        log_info "DRI device access test: PASSED"
    else
        log_error "DRI device access test: FAILED"
        return 1
    fi

    # Test VAAPI functionality
    local test_output="${CONFIG_BACKUP_DIR}/vaapi_test.log"
    # FIX: ensure the log directory exists — the redirection below would
    # otherwise fail and abort the whole script under `set -e` when this
    # runs before any step has created CONFIG_BACKUP_DIR.
    mkdir -p "$(dirname "$test_output")"

    if docker run --rm \
        --device=/dev/dri \
        jrottenberg/ffmpeg:4.4-vaapi \
        -vaapi_device /dev/dri/renderD128 \
        -f lavfi -i testsrc2=duration=10:size=1920x1080:rate=30 \
        -vf 'format=nv12,hwupload' \
        -c:v h264_vaapi -f null - &> "$test_output"; then
        log_info "VA-API acceleration test: PASSED"
    else
        log_warn "VA-API acceleration test: FAILED - check $test_output"
    fi
}
|
|
|
|
setup_gpu_monitoring() {
    # Install a Prometheus exporter for GPU metrics: writes the Python
    # exporter, registers + starts a systemd unit for it, and appends a
    # scrape target to the Prometheus config when one is present.
    log_info "Setting up GPU monitoring"

    # Create GPU monitoring script.
    # (Quoted 'EOF': the Python below must reach the file verbatim.)
    cat > "${SCRIPT_DIR}/gpu_monitor.py" << 'EOF'
#!/usr/bin/env python3
"""
GPU Monitoring Script
Provides Prometheus metrics for GPU utilization
"""

import time
import subprocess
import json
from http.server import HTTPServer, BaseHTTPRequestHandler
import threading
import logging

class GPUMonitor:
    def __init__(self):
        self.gpu_type = self.detect_gpu_type()
        self.metrics = {}

    def detect_gpu_type(self):
        try:
            subprocess.run(['nvidia-smi'], capture_output=True, check=True)
            return 'nvidia'
        except:
            pass

        # Check for AMD GPU
        result = subprocess.run(['lspci'], capture_output=True, text=True)
        if 'AMD' in result.stdout:
            return 'amd'
        elif 'Intel' in result.stdout and 'VGA' in result.stdout:
            return 'intel'

        return 'software'

    def get_nvidia_metrics(self):
        try:
            result = subprocess.run([
                'nvidia-smi', '--query-gpu=utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)

            lines = result.stdout.strip().split('\n')
            gpu_metrics = []

            for i, line in enumerate(lines):
                values = [x.strip() for x in line.split(',')]
                gpu_metrics.append({
                    'gpu_id': i,
                    'utilization_gpu': float(values[0]),
                    'utilization_memory': float(values[1]),
                    'memory_used': float(values[2]),
                    'memory_total': float(values[3]),
                    'temperature': float(values[4])
                })

            return gpu_metrics
        except Exception as e:
            logging.error(f"Error getting NVIDIA metrics: {e}")
            return []

    def get_system_metrics(self):
        # Fallback system metrics
        return [{
            'gpu_id': 0,
            'utilization_gpu': 0,
            'utilization_memory': 0,
            'memory_used': 0,
            'memory_total': 0,
            'temperature': 0
        }]

    def collect_metrics(self):
        if self.gpu_type == 'nvidia':
            return self.get_nvidia_metrics()
        else:
            return self.get_system_metrics()

    def update_metrics(self):
        while True:
            try:
                self.metrics = self.collect_metrics()
            except Exception as e:
                logging.error(f"Error updating metrics: {e}")
            time.sleep(10)  # Update every 10 seconds

class MetricsHandler(BaseHTTPRequestHandler):
    def __init__(self, gpu_monitor, *args, **kwargs):
        self.gpu_monitor = gpu_monitor
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path == '/metrics':
            self.send_response(200)
            self.send_header('Content-type', 'text/plain')
            self.end_headers()

            metrics_text = self.generate_prometheus_metrics()
            self.wfile.write(metrics_text.encode())
        else:
            self.send_response(404)
            self.end_headers()

    def generate_prometheus_metrics(self):
        metrics = []

        for gpu in self.gpu_monitor.metrics:
            gpu_id = gpu['gpu_id']

            metrics.extend([
                f'# HELP gpu_utilization_percent GPU utilization percentage',
                f'# TYPE gpu_utilization_percent gauge',
                f'gpu_utilization_percent{{gpu_id="{gpu_id}"}} {gpu["utilization_gpu"]}',

                f'# HELP gpu_memory_utilization_percent GPU memory utilization percentage',
                f'# TYPE gpu_memory_utilization_percent gauge',
                f'gpu_memory_utilization_percent{{gpu_id="{gpu_id}"}} {gpu["utilization_memory"]}',

                f'# HELP gpu_memory_used_mb GPU memory used in MB',
                f'# TYPE gpu_memory_used_mb gauge',
                f'gpu_memory_used_mb{{gpu_id="{gpu_id}"}} {gpu["memory_used"]}',

                f'# HELP gpu_memory_total_mb GPU total memory in MB',
                f'# TYPE gpu_memory_total_mb gauge',
                f'gpu_memory_total_mb{{gpu_id="{gpu_id}"}} {gpu["memory_total"]}',

                f'# HELP gpu_temperature_celsius GPU temperature in Celsius',
                f'# TYPE gpu_temperature_celsius gauge',
                f'gpu_temperature_celsius{{gpu_id="{gpu_id}"}} {gpu["temperature"]}',
            ])

        return '\n'.join(metrics)

def main():
    logging.basicConfig(level=logging.INFO)

    gpu_monitor = GPUMonitor()

    # Start metrics collection in background
    metrics_thread = threading.Thread(target=gpu_monitor.update_metrics, daemon=True)
    metrics_thread.start()

    # Create handler with gpu_monitor
    def handler(*args, **kwargs):
        return MetricsHandler(gpu_monitor, *args, **kwargs)

    # Start HTTP server
    server = HTTPServer(('0.0.0.0', 9101), handler)
    print("GPU metrics server started on port 9101")
    server.serve_forever()

if __name__ == '__main__':
    main()
EOF

    chmod +x "${SCRIPT_DIR}/gpu_monitor.py"

    # Create systemd service for GPU monitoring.
    # (Unquoted EOF: ${SCRIPT_DIR} is deliberately expanded into the unit.)
    cat > "/etc/systemd/system/gpu-monitor.service" << EOF
[Unit]
Description=GPU Monitoring Service
After=network.target

[Service]
Type=simple
User=root
WorkingDirectory=${SCRIPT_DIR}
ExecStart=/usr/bin/python3 ${SCRIPT_DIR}/gpu_monitor.py
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

    # Enable and start the service
    systemctl daemon-reload
    systemctl enable gpu-monitor.service
    systemctl start gpu-monitor.service

    # Add GPU monitoring to Prometheus configuration (append-only, and
    # only if the scrape job is not already present — keeps reruns
    # idempotent)
    if [[ -f "${SCRIPT_DIR}/../monitoring/prometheus/prometheus.yml" ]]; then
        log_info "Adding GPU monitoring to Prometheus configuration"

        # Check if GPU target already exists
        if ! grep -q "gpu-monitor" "${SCRIPT_DIR}/../monitoring/prometheus/prometheus.yml"; then
            cat >> "${SCRIPT_DIR}/../monitoring/prometheus/prometheus.yml" << 'EOF'

  - job_name: 'gpu-monitor'
    static_configs:
      - targets: ['localhost:9101']
    scrape_interval: 15s
    metrics_path: '/metrics'
EOF
        fi
    fi

    log_info "GPU monitoring setup completed"
}
|
|
|
|
cleanup_on_exit() {
    # Best-effort teardown registered via register_cleanup: stop any test
    # containers left behind by the acceleration smoke tests.
    log_info "Cleaning up GPU optimization resources"

    local image
    for image in \
        "nvidia/cuda:11.0-base" \
        "jrottenberg/ffmpeg:4.4-nvidia" \
        "jrottenberg/ffmpeg:4.4-vaapi"; do
        # xargs -r: skip the stop entirely when no container matches
        docker ps -q --filter "ancestor=${image}" | xargs -r docker stop
    done

    log_info "GPU optimization cleanup completed"
}
|
|
|
|
# Execute main function
# Entry point: forward all CLI arguments to main (none are consumed today,
# but "$@" keeps future options intact).
main "$@"