generate_synthetic: stratified Korean/region sampling + boost 2-line weight

- StratifiedSampler ensures each Korean char (40) and region (16) appears exactly
  once per shuffled cycle instead of being drawn with random.choice. Average
  per-class exposure stays at ~1/40th of plates, but it is now evenly distributed
  (no rare class starvation caused by sampling variance).
- Type weights: type1 80→50, type3 10→30, type4 5→15. Boost 2-line to 45% so
  the detection head sees enough 2-line samples to learn bottom-line detection
  (currently 3/3 test samples miss bottom line).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
main
songhyeonsoo 4 weeks ago
parent 518aabab08
commit db2ef9b4ec

@ -58,6 +58,19 @@ def random_bright(img: np.ndarray, scale_range=(0.55, 1.45)) -> np.ndarray:
return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
class StratifiedSampler:
    """Perfectly uniform sampler: every key appears exactly once per cycle.

    The keys are copied into a pool and shuffled; ``next()`` pops one key at
    a time. When the pool runs dry it is refilled from the full key list and
    reshuffled, so over any complete cycle each key is returned exactly once.
    """

    def __init__(self, keys):
        """Store a private copy of *keys* (any iterable; a dict yields its keys).

        An empty key collection is accepted here so that constructing a
        sampler that is never used does not fail; the error is raised by
        ``next()`` instead.
        """
        self.keys = list(keys)
        self._pool = []

    def next(self):
        """Return the next key of the current cycle, starting a new cycle if needed.

        Raises:
            IndexError: if the sampler was built from an empty key collection.
        """
        if not self._pool:
            if not self.keys:
                # Without this guard the failure surfaces as the opaque
                # "pop from empty list"; keep the exception type but say why.
                raise IndexError("StratifiedSampler has no keys to sample from")
            self._pool = self.keys.copy()
            random.shuffle(self._pool)
        return self._pool.pop()
class LPGenerator:
def __init__(self, asset_dir: Path):
self.asset = Path(asset_dir)
@ -77,6 +90,11 @@ class LPGenerator:
self.region_y_imgs = self._load("region_y")
self.region_g_imgs = self._load("region_g")
# 한글/지역명은 클래스 수가 적고 plate당 1개만 등장 → 균등 샘플링 필수
self.hangul_sampler = StratifiedSampler(HANGUL_CHAR_MAP)
self.region_y_sampler = StratifiedSampler(self.region_y_imgs)
self.region_g_sampler = StratifiedSampler(self.region_g_imgs)
def _load(self, sub: str) -> dict:
out = {}
for fp in sorted((self.asset / sub).iterdir()):
@ -95,7 +113,7 @@ class LPGenerator:
char = self._resize_dict(self.char_w, 60, 83)
d = [random.choice('0123456789') for _ in range(2)]
ch = random.choice(list(HANGUL_CHAR_MAP))
ch = self.hangul_sampler.next()
e = [random.choice('0123456789') for _ in range(4)]
row, col = 13, 35
@ -118,7 +136,7 @@ class LPGenerator:
char = self._resize_dict(self.char_w, 49, 70)
d = [random.choice('0123456789') for _ in range(2)]
ch = random.choice(list(HANGUL_CHAR_MAP))
ch = self.hangul_sampler.next()
e = [random.choice('0123456789') for _ in range(4)]
row, col = 46, 10
@ -135,7 +153,7 @@ class LPGenerator:
poly = [[x0, row], [x1, row], [x1, row + 83], [x0, row + 83]]
return plate, [{"transcription": text, "points": poly}]
def _gen_two_line(self, plate_bg, num_src, char_src, region_src):
def _gen_two_line(self, plate_bg, num_src, char_src, region_src, region_sampler):
"""두 줄 LP (336x170). 위·아래 줄 각각 tight polygon 생성."""
plate = cv2.resize(plate_bg, (336, 170))
num1 = self._resize_dict(num_src, 44, 60)
@ -143,9 +161,9 @@ class LPGenerator:
region = self._resize_dict(region_src, 88, 60)
char = self._resize_dict(char_src, 64, 62)
rkey = random.choice(list(region))
rkey = region_sampler.next()
d = [random.choice('0123456789') for _ in range(2)]
ch = random.choice(list(HANGUL_CHAR_MAP))
ch = self.hangul_sampler.next()
e = [random.choice('0123456789') for _ in range(4)]
# 위 줄: region + 숫자2
@ -174,15 +192,15 @@ class LPGenerator:
]
def gen_type3(self):
return self._gen_two_line(self.plate_y, self.num_y, self.char_y, self.region_y_imgs)
return self._gen_two_line(self.plate_y, self.num_y, self.char_y, self.region_y_imgs, self.region_y_sampler)
def gen_type4(self):
return self._gen_two_line(self.plate_g, self.num_g, self.char_g, self.region_g_imgs)
return self._gen_two_line(self.plate_g, self.num_g, self.char_g, self.region_g_imgs, self.region_g_sampler)
# 한국 도로 LP 분포 추정 (자가용 92% + 영업용 7.5%, 신형 가로 ~98% 등)
# 자산 한계 고려한 합성 가중치 — generate_synthetic 호출 시 --type_weights 로 덮어쓰기 가능.
TYPE_DEFAULT_WEIGHTS = {'1': 0.80, '2': 0.05, '3': 0.10, '4': 0.05}
# 학습용 가중치 — 검증 단계에서는 두줄(type3+4)을 도로 분포보다 의도적으로 늘려
# 모델이 윗줄/아랫줄 동시 검출을 충분히 학습하게 함. 추론 시 도로 분포로 평가됨.
TYPE_DEFAULT_WEIGHTS = {'1': 0.50, '2': 0.05, '3': 0.30, '4': 0.15}
def main():

Loading…
Cancel
Save