Review note: free text placed ahead of the first "diff --git" header, where
patch tools skip it. Line breaks, indentation and blank lines below were
restored from a whitespace-collapsed copy of this patch. Hunk line counts were
checked against every "@@" header, but the leading whitespace of context lines
is a best-effort reconstruction; confirm it against
data_gen/generate_synthetic.py before applying.

What the patch changes:
  1. Adds class StratifiedSampler. Its docstring (Korean) says: fully uniform
     sampling, all keys are shuffled so each appears exactly once per cycle.
     next() refills and shuffles the pool when it is empty, then pops one key.
     With an empty key collection next() would raise IndexError from
     list.pop().
  2. LPGenerator.__init__ builds three samplers: one over HANGUL_CHAR_MAP and
     one each over the keys of region_y_imgs and region_g_imgs. The Korean
     comment says: Hangul characters / region names have few classes and
     appear once per plate, so uniform sampling is required.
  3. Every random.choice(list(HANGUL_CHAR_MAP)) visible in the hunks is
     replaced by self.hangul_sampler.next(); the one sampler instance is
     shared by all of those call sites.
  4. _gen_two_line gains a required region_sampler parameter and uses it for
     rkey; gen_type3 / gen_type4 pass the matching region sampler.
  5. TYPE_DEFAULT_WEIGHTS changes from 0.80/0.05/0.10/0.05 to
     0.50/0.05/0.30/0.15 for types 1-4. The new Korean comment says: training
     weights; two-line plates (type3+4) are deliberately over-represented
     relative to the road distribution so the model learns to detect upper
     and lower rows together; evaluation uses the road distribution.

diff --git a/data_gen/generate_synthetic.py b/data_gen/generate_synthetic.py
index 7e8e2bd..469dd3f 100644
--- a/data_gen/generate_synthetic.py
+++ b/data_gen/generate_synthetic.py
@@ -58,6 +58,19 @@ def random_bright(img: np.ndarray, scale_range=(0.55, 1.45)) -> np.ndarray:
     return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
 
 
+class StratifiedSampler:
+    """완전 균등 샘플링: 모든 키를 셔플해서 한 사이클 동안 정확히 1번씩 등장."""
+    def __init__(self, keys):
+        self.keys = list(keys)
+        self._pool = []
+
+    def next(self):
+        if not self._pool:
+            self._pool = self.keys.copy()
+            random.shuffle(self._pool)
+        return self._pool.pop()
+
+
 class LPGenerator:
     def __init__(self, asset_dir: Path):
         self.asset = Path(asset_dir)
@@ -77,6 +90,11 @@ class LPGenerator:
         self.region_y_imgs = self._load("region_y")
         self.region_g_imgs = self._load("region_g")
 
+        # 한글/지역명은 클래스 수가 적고 plate당 1개만 등장 → 균등 샘플링 필수
+        self.hangul_sampler = StratifiedSampler(HANGUL_CHAR_MAP)
+        self.region_y_sampler = StratifiedSampler(self.region_y_imgs)
+        self.region_g_sampler = StratifiedSampler(self.region_g_imgs)
+
     def _load(self, sub: str) -> dict:
         out = {}
         for fp in sorted((self.asset / sub).iterdir()):
@@ -95,7 +113,7 @@ class LPGenerator:
         char = self._resize_dict(self.char_w, 60, 83)
 
         d = [random.choice('0123456789') for _ in range(2)]
-        ch = random.choice(list(HANGUL_CHAR_MAP))
+        ch = self.hangul_sampler.next()
         e = [random.choice('0123456789') for _ in range(4)]
 
         row, col = 13, 35
@@ -118,7 +136,7 @@ class LPGenerator:
         char = self._resize_dict(self.char_w, 49, 70)
 
         d = [random.choice('0123456789') for _ in range(2)]
-        ch = random.choice(list(HANGUL_CHAR_MAP))
+        ch = self.hangul_sampler.next()
         e = [random.choice('0123456789') for _ in range(4)]
 
         row, col = 46, 10
@@ -135,7 +153,7 @@ class LPGenerator:
         poly = [[x0, row], [x1, row], [x1, row + 83], [x0, row + 83]]
         return plate, [{"transcription": text, "points": poly}]
 
-    def _gen_two_line(self, plate_bg, num_src, char_src, region_src):
+    def _gen_two_line(self, plate_bg, num_src, char_src, region_src, region_sampler):
         """두 줄 LP (336x170). 위·아래 줄 각각 tight polygon 생성."""
         plate = cv2.resize(plate_bg, (336, 170))
         num1 = self._resize_dict(num_src, 44, 60)
@@ -143,9 +161,9 @@ class LPGenerator:
         region = self._resize_dict(region_src, 88, 60)
         char = self._resize_dict(char_src, 64, 62)
 
-        rkey = random.choice(list(region))
+        rkey = region_sampler.next()
         d = [random.choice('0123456789') for _ in range(2)]
-        ch = random.choice(list(HANGUL_CHAR_MAP))
+        ch = self.hangul_sampler.next()
         e = [random.choice('0123456789') for _ in range(4)]
 
         # 위 줄: region + 숫자2
@@ -174,15 +192,15 @@ class LPGenerator:
         ]
 
     def gen_type3(self):
-        return self._gen_two_line(self.plate_y, self.num_y, self.char_y, self.region_y_imgs)
+        return self._gen_two_line(self.plate_y, self.num_y, self.char_y, self.region_y_imgs, self.region_y_sampler)
 
     def gen_type4(self):
-        return self._gen_two_line(self.plate_g, self.num_g, self.char_g, self.region_g_imgs)
+        return self._gen_two_line(self.plate_g, self.num_g, self.char_g, self.region_g_imgs, self.region_g_sampler)
 
 
-# 한국 도로 LP 분포 추정 (자가용 92% + 영업용 7.5%, 신형 가로 ~98% 등)
-# 자산 한계 고려한 합성 가중치 — generate_synthetic 호출 시 --type_weights 로 덮어쓰기 가능.
-TYPE_DEFAULT_WEIGHTS = {'1': 0.80, '2': 0.05, '3': 0.10, '4': 0.05}
+# 학습용 가중치 — 검증 단계에서는 두줄(type3+4)을 도로 분포보다 의도적으로 늘려
+# 모델이 윗줄/아랫줄 동시 검출을 충분히 학습하게 함. 추론 시 도로 분포로 평가됨.
+TYPE_DEFAULT_WEIGHTS = {'1': 0.50, '2': 0.05, '3': 0.30, '4': 0.15}
 
 
 def main():