diff --git a/configs/kr_lp_pgnet.yml b/configs/kr_lp_pgnet.yml
index 54d7202..4f351c0 100644
--- a/configs/kr_lp_pgnet.yml
+++ b/configs/kr_lp_pgnet.yml
@@ -95,7 +95,7 @@ Train:
   loader:
     shuffle: True
     drop_last: True
-    batch_size_per_card: 14
+    batch_size_per_card: 32
     num_workers: 8
 
 Eval:
diff --git a/scripts/recreate_container.sh b/scripts/recreate_container.sh
new file mode 100755
index 0000000..58f159e
--- /dev/null
+++ b/scripts/recreate_container.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Recreate the kr_lp_pgnet container (applies --shm-size 8g, etc.)
+#
+# Run on the host:
+#   bash scripts/recreate_container.sh
+#
+# Steps:
+#   1. Back up /root/.netrc (keeps the wandb credentials)
+#   2. Force-remove the existing container
+#   3. Start a new container (--shm-size 8g, --gpus all, bind mount)
+#   4. Restore .netrc
+#   5. Run setup_server.sh automatically (reinstalls paddle etc.)
+
+set -euo pipefail
+
+readonly NAME=kr_lp_pgnet
+readonly IMAGE=ubuntu:24.04
+readonly SHM=8g
+readonly WORKSPACE=/home/cuuva/workspace
+# A backup left at this path by an earlier run that failed after step 2 is
+# picked up again by step 4, so the credentials survive a retry.
+readonly NETRC_BAK=/tmp/${NAME}_netrc.bak
+
+if docker container inspect "$NAME" >/dev/null 2>&1; then
+  # 1. Back up .netrc (keeps the wandb login).
+  # `docker cp` also works on a stopped container, unlike `docker exec`, so
+  # the credentials are saved even when the old container is not running.
+  # A failure here is treated as "there is no .netrc to save".
+  if docker cp "$NAME":/root/.netrc "$NETRC_BAK" 2>/dev/null; then
+    chmod 600 "$NETRC_BAK"
+    echo " ✓ /root/.netrc backed up → $NETRC_BAK"
+  fi
+
+  # 2. Force-remove the existing container. It is known to exist at this
+  # point, so a failure is a real error and aborts the script (set -e).
+  docker rm -f "$NAME"
+fi
+
+# 3. Start the new container (--shm-size is the key setting).
+docker run -d --name "$NAME" --gpus all \
+  --shm-size="$SHM" \
+  -v "$WORKSPACE:/workspace" \
+  -w /workspace \
+  --restart unless-stopped \
+  "$IMAGE" sleep infinity
+echo " ✓ 새 컨테이너 시작 (shm-size=$SHM)"
+
+# 4. Restore .netrc into the new container, then drop the host-side copy.
+if [[ -f "$NETRC_BAK" ]]; then
+  docker cp "$NETRC_BAK" "$NAME":/root/.netrc
+  rm -f -- "$NETRC_BAK"
+  echo " ✓ /root/.netrc 복원 (wandb login 유지)"
+fi
+
+# 5. Run setup_server.sh automatically (reinstalls paddle/PaddleOCR/cuDNN etc.)
+echo
+echo " → setup_server.sh 실행..."
+docker exec "$NAME" bash /workspace/kr_lp_pgnet/scripts/setup_server.sh
diff --git a/scripts/setup_server.sh b/scripts/setup_server.sh
index 5c00423..78878aa 100755
--- a/scripts/setup_server.sh
+++ b/scripts/setup_server.sh
@@ -54,8 +54,11 @@ while IFS= read -r line; do
     $PIP install $PIP_OPTS --ignore-installed "$line" 2>/dev/null || echo " skip: $line"
 done < /tmp/kr_lp_req.txt
 
-echo "[5/7] OpenCV (numpy2 호환) + numpy<2 (PaddleOCR release/2.7 호환성) + wandb"
+echo "[5/7] OpenCV (numpy2 호환) + numpy<2 (PaddleOCR release/2.7 호환성) + wandb + cuDNN 9.17"
 $PIP install $PIP_OPTS 'opencv-python>=4.10' 'opencv-contrib-python>=4.10' wandb
+# The paddle sm_120 wheel is built against cuDNN 9.17, so the 9.13.0.50 pulled in by paddle's deps must be upgraded to 9.17.
+# (Otherwise conv2d fails to load the cublasLtCreate symbol → process abort.)
+$PIP install $PIP_OPTS --upgrade 'nvidia-cudnn-cu13>=9.17,<9.18'
 # imgaug 등이 numpy 1.x API(np.sctypes)에 의존하므로 numpy 1.x로 핀.
 # paddle 3.3.0.dev는 numpy 1.x도 호환.
 $PIP install $PIP_OPTS 'numpy<2' --force-reinstall