Provision a 5090-tuned container and bake in cuDNN 9.17 fix

- config: batch_size_per_card 14 -> 32 (5090 32GB headroom)
- setup_server.sh: pin nvidia-cudnn-cu13>=9.17,<9.18 to match the sm_120 wheel
  (without it, conv2d aborts with "Cannot load symbol cublasLtCreate")
- new scripts/recreate_container.sh: one-shot rebuild with --shm-size 8g,
  preserves /root/.netrc so wandb auth survives, runs setup_server.sh

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
main
songhyeonsu 1 month ago
parent f5f8939a5c
commit c7c13e48cd

@ -95,7 +95,7 @@ Train:
loader:
shuffle: True
drop_last: True
batch_size_per_card: 14
batch_size_per_card: 32
num_workers: 8
Eval:

@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Recreate the kr_lp_pgnet container (applies --shm-size 8g, etc.).
#
# Run on the host:
#   bash scripts/recreate_container.sh
#
# Optional environment override:
#   KR_LP_WORKSPACE  host directory bind-mounted at /workspace
#                    (default: /home/cuuva/workspace)
#
# Steps:
#   1. Back up /root/.netrc (keeps wandb auth)
#   2. Force-remove the existing container
#   3. Start a new container (--shm-size 8g, --gpus all, bind mount)
#   4. Restore .netrc
#   5. Run setup_server.sh (reinstalls paddle etc.)
set -euo pipefail

NAME=kr_lp_pgnet
IMAGE=ubuntu:24.04
SHM=8g
WORKSPACE=${KR_LP_WORKSPACE:-/home/cuuva/workspace}
# Host-side path of the script that step 5 runs inside the container
# (/workspace in the container is the bind mount of $WORKSPACE).
SETUP_SCRIPT_HOST="$WORKSPACE/kr_lp_pgnet/scripts/setup_server.sh"

# Preflight: validate everything BEFORE the old container is destroyed.
# Docker silently creates a missing bind-mount source directory, so without
# these checks a wrong path is only noticed at step 5, after the old container
# is already gone.
if [[ ! -d "$WORKSPACE" ]]; then
  echo "error: workspace directory not found: $WORKSPACE" >&2
  exit 1
fi
if [[ ! -f "$SETUP_SCRIPT_HOST" ]]; then
  echo "error: setup script not found: $SETUP_SCRIPT_HOST" >&2
  exit 1
fi

# 1. Back up .netrc (preserves the wandb login).
#    `docker cp` works on stopped containers as well as running ones, so the
#    credentials are saved even when the old container is not running.
#    Failure (no such container / no such file) is expected and simply means
#    there is nothing to back up, hence the silenced stderr.
#    The path is deliberately stable: if a previous run died between steps 2
#    and 4, the leftover backup is picked up and restored by this run.
NETRC_BAK=/tmp/${NAME}_netrc.bak
if docker cp "$NAME":/root/.netrc "$NETRC_BAK" 2>/dev/null; then
  # The file holds credentials; keep it private while it sits in /tmp.
  chmod 600 "$NETRC_BAK"
  echo " ✓ /root/.netrc backed up → $NETRC_BAK"
fi

# 2. Force-remove the existing container (absence is not an error).
docker rm -f "$NAME" 2>/dev/null || true

# 3. New container (--shm-size is the key setting).
docker run -d --name "$NAME" --gpus all \
  --shm-size="$SHM" \
  -v "$WORKSPACE:/workspace" \
  -w /workspace \
  --restart unless-stopped \
  "$IMAGE" sleep infinity
echo " ✓ 새 컨테이너 시작 (shm-size=$SHM)"

# 4. Restore .netrc; the backup is deleted only after a successful copy.
if [[ -f "$NETRC_BAK" ]]; then
  docker cp "$NETRC_BAK" "$NAME":/root/.netrc
  rm -f -- "$NETRC_BAK"
  echo " ✓ /root/.netrc 복원 (wandb login 유지)"
fi

# 5. Run setup_server.sh (reinstalls paddle / PaddleOCR / cuDNN etc.).
echo
echo " → setup_server.sh 실행..."
docker exec "$NAME" bash /workspace/kr_lp_pgnet/scripts/setup_server.sh

@ -54,8 +54,11 @@ while IFS= read -r line; do
$PIP install $PIP_OPTS --ignore-installed "$line" 2>/dev/null || echo " skip: $line"
done < /tmp/kr_lp_req.txt
echo "[5/7] OpenCV (numpy2 호환) + numpy<2 (PaddleOCR release/2.7 호환성) + wandb"
echo "[5/7] OpenCV (numpy2 호환) + numpy<2 (PaddleOCR release/2.7 호환성) + wandb + cuDNN 9.17"
# Install OpenCV (>=4.10) and wandb.
# NOTE(review): $PIP_OPTS is left unquoted, presumably so that multiple options
# word-split into separate arguments — confirm where it is defined.
$PIP install $PIP_OPTS 'opencv-python>=4.10' 'opencv-contrib-python>=4.10' wandb
# The paddle sm_120 wheel is built against cuDNN 9.17, so the 9.13.0.50 pulled in
# by paddle's dependencies must be upgraded to 9.17.
# (Otherwise conv2d fails to load the cublasLtCreate symbol and the process aborts.)
$PIP install $PIP_OPTS --upgrade 'nvidia-cudnn-cu13>=9.17,<9.18'
# imgaug and others depend on the numpy 1.x API (np.sctypes), so pin numpy to 1.x.
# paddle 3.3.0.dev is also compatible with numpy 1.x.
$PIP install $PIP_OPTS 'numpy<2' --force-reinstall

Loading…
Cancel
Save