generate_synthetic: stratified Korean/region sampling + boost 2-line weight

- StratifiedSampler draws from a shuffled pool, so each Korean char (40) and region (16) appears exactly once per cycle instead of being picked with random.choice. Expected per-class exposure is still ~1/40 of plates, but it is now evenly distributed rather than even only on average (no rare-class starvation).
- Type weights: type1 80→50, type3 10→30, type4 5→15 (type2 stays at 5). This boosts 2-line plates to 45% so the detection head sees enough 2-line samples to learn bottom-line detection (currently 3/3 test samples miss the bottom line).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
4 weeks ago · db2ef9b4ec
parent 518aabab08
commit db2ef9b4ec
1 changed files with 28 additions and 10 deletions
--- a/data_gen/generate_synthetic.py
+++ b/data_gen/generate_synthetic.py
@ -58,6 +58,19 @@ def random_bright(img: np.ndarray, scale_range=(0.55, 1.45)) -> np.ndarray:
    return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)


+class StratifiedSampler:
+    """완전 균등 샘플링: 모든 키를 셔플해서 한 사이클 동안 정확히 1번씩 등장."""
+    def __init__(self, keys):
+        self.keys = list(keys)
+        self._pool = []
+
+    def next(self):
+        if not self._pool:
+            self._pool = self.keys.copy()
+            random.shuffle(self._pool)
+        return self._pool.pop()
+
+
 class LPGenerator:
    def __init__(self, asset_dir: Path):
        self.asset = Path(asset_dir)
@ -77,6 +90,11 @@ class LPGenerator:
        self.region_y_imgs = self._load("region_y")
        self.region_g_imgs = self._load("region_g")

+        # 한글/지역명은 클래스 수가 적고 plate당 1개만 등장 → 균등 샘플링 필수
+        self.hangul_sampler = StratifiedSampler(HANGUL_CHAR_MAP)
+        self.region_y_sampler = StratifiedSampler(self.region_y_imgs)
+        self.region_g_sampler = StratifiedSampler(self.region_g_imgs)
+
    def _load(self, sub: str) -> dict:
        out = {}
        for fp in sorted((self.asset / sub).iterdir()):
@ -95,7 +113,7 @@ class LPGenerator:
        char = self._resize_dict(self.char_w, 60, 83)

        d = [random.choice('0123456789') for _ in range(2)]
-        ch = random.choice(list(HANGUL_CHAR_MAP))
+        ch = self.hangul_sampler.next()
        e = [random.choice('0123456789') for _ in range(4)]

        row, col = 13, 35
@ -118,7 +136,7 @@ class LPGenerator:
        char = self._resize_dict(self.char_w, 49, 70)

        d = [random.choice('0123456789') for _ in range(2)]
-        ch = random.choice(list(HANGUL_CHAR_MAP))
+        ch = self.hangul_sampler.next()
        e = [random.choice('0123456789') for _ in range(4)]

        row, col = 46, 10
@ -135,7 +153,7 @@ class LPGenerator:
        poly = [[x0, row], [x1, row], [x1, row + 83], [x0, row + 83]]
        return plate, [{"transcription": text, "points": poly}]

-    def _gen_two_line(self, plate_bg, num_src, char_src, region_src):
+    def _gen_two_line(self, plate_bg, num_src, char_src, region_src, region_sampler):
        """두 줄 LP (336x170). 위·아래 줄 각각 tight polygon 생성."""
        plate = cv2.resize(plate_bg, (336, 170))
        num1 = self._resize_dict(num_src, 44, 60)
@ -143,9 +161,9 @@ class LPGenerator:
        region = self._resize_dict(region_src, 88, 60)
        char = self._resize_dict(char_src, 64, 62)

-        rkey = random.choice(list(region))
+        rkey = region_sampler.next()
        d = [random.choice('0123456789') for _ in range(2)]
-        ch = random.choice(list(HANGUL_CHAR_MAP))
+        ch = self.hangul_sampler.next()
        e = [random.choice('0123456789') for _ in range(4)]

        # 위 줄: region + 숫자2
@ -174,15 +192,15 @@ class LPGenerator:
        ]

    def gen_type3(self):
-        return self._gen_two_line(self.plate_y, self.num_y, self.char_y, self.region_y_imgs)
+        return self._gen_two_line(self.plate_y, self.num_y, self.char_y, self.region_y_imgs, self.region_y_sampler)

    def gen_type4(self):
-        return self._gen_two_line(self.plate_g, self.num_g, self.char_g, self.region_g_imgs)
+        return self._gen_two_line(self.plate_g, self.num_g, self.char_g, self.region_g_imgs, self.region_g_sampler)


-# 한국 도로 LP 분포 추정 (자가용 92% + 영업용 7.5%, 신형 가로 ~98% 등)
-# 자산 한계 고려한 합성 가중치 — generate_synthetic 호출 시 --type_weights 로 덮어쓰기 가능.
-TYPE_DEFAULT_WEIGHTS = {'1': 0.80, '2': 0.05, '3': 0.10, '4': 0.05}
+# 학습용 가중치 — 검증 단계에서는 두줄(type3+4)을 도로 분포보다 의도적으로 늘려
+# 모델이 윗줄/아랫줄 동시 검출을 충분히 학습하게 함. 추론 시 도로 분포로 평가됨.
+TYPE_DEFAULT_WEIGHTS = {'1': 0.50, '2': 0.05, '3': 0.30, '4': 0.15}


 def main():