{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2c786740",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/cuuva/anaconda3/envs/mfn/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 56341.73 examples/s]\n",
      "Generating test split: 100%|██████████| 2200/2200 [00:00<00:00, 96950.62 examples/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "ds = load_dataset(\"logasja/lfw\", \"pairs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cf343f72",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 100%|██████████| 13233/13233 [00:00<00:00, 29211.85 examples/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "ds = load_dataset(\"logasja/lfw\", \"default\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "14ee413a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving train images...\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "'image1'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[5], line 29\u001b[0m\n\u001b[1;32m     27\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSaving train images...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     28\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(train_data):\n\u001b[0;32m---> 29\u001b[0m     img1 \u001b[38;5;241m=\u001b[39m Image\u001b[38;5;241m.\u001b[39mfromarray(\u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mimage1\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m)\u001b[38;5;241m.\u001b[39mconvert(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRGB\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     30\u001b[0m     img2 \u001b[38;5;241m=\u001b[39m Image\u001b[38;5;241m.\u001b[39mfromarray(item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mimage2\u001b[39m\u001b[38;5;124m'\u001b[39m])\u001b[38;5;241m.\u001b[39mconvert(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRGB\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     31\u001b[0m     label \u001b[38;5;241m=\u001b[39m item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlabel\u001b[39m\u001b[38;5;124m'\u001b[39m]  \u001b[38;5;66;03m# 0: same, 1: different\u001b[39;00m\n",
      "\u001b[0;31mKeyError\u001b[0m: 'image1'"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "import os\n",
    "from PIL import Image\n",
    "import numpy as np\n",
    "\n",
    "# ----------------------------\n",
    "# 경로 설정\n",
    "# ----------------------------\n",
    "LOCAL_DATA_DIR = \"/home/cuuva/lfw_images\"  # 저장할 최상위 폴더\n",
    "TRAIN_DIR = os.path.join(LOCAL_DATA_DIR, \"train\")\n",
    "TEST_DIR = os.path.join(LOCAL_DATA_DIR, \"test\")\n",
    "\n",
    "os.makedirs(TRAIN_DIR, exist_ok=True)\n",
    "os.makedirs(TEST_DIR, exist_ok=True)\n",
    "\n",
    "# ----------------------------\n",
    "# Hugging Face LFW 불러오기\n",
    "# ----------------------------\n",
    "dataset = load_dataset(\"logasja/lfw\", \"pairs\")\n",
    "\n",
    "train_data = dataset[\"train\"]\n",
    "test_data = dataset[\"test\"]\n",
    "\n",
    "# ----------------------------\n",
    "# train 데이터 저장\n",
    "# ----------------------------\n",
    "print(\"Saving train images...\")\n",
    "for i, item in enumerate(train_data):\n",
    "    img1 = Image.fromarray(item['image1']).convert(\"RGB\")\n",
    "    img2 = Image.fromarray(item['image2']).convert(\"RGB\")\n",
    "    label = item['label']  # 0: same, 1: different\n",
    "\n",
    "    # 파일 이름 예: train_00001_1.jpg, train_00001_2.jpg\n",
    "    img1.save(os.path.join(TRAIN_DIR, f\"train_{i}_1.jpg\"))\n",
    "    img2.save(os.path.join(TRAIN_DIR, f\"train_{i}_2.jpg\"))\n",
    "\n",
    "print(f\"Train images saved: {len(train_data)*2}\")\n",
    "\n",
    "# ----------------------------\n",
    "# test 데이터 저장\n",
    "# ----------------------------\n",
    "print(\"Saving test images...\")\n",
    "for i, item in enumerate(test_data):\n",
    "    img1 = Image.fromarray(item['image1']).convert(\"RGB\")\n",
    "    img2 = Image.fromarray(item['image2']).convert(\"RGB\")\n",
    "    label = item['label']\n",
    "\n",
    "    # 파일 이름 예: test_00001_1.jpg, test_00001_2.jpg\n",
    "    img1.save(os.path.join(TEST_DIR, f\"test_{i}_1.jpg\"))\n",
    "    img2.save(os.path.join(TEST_DIR, f\"test_{i}_2.jpg\"))\n",
    "\n",
    "print(f\"Test images saved: {len(test_data)*2}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b5830c3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['pair', 'img_0', 'img_1']\n",
      "{'pair': 1, 'img_0': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=250x250 at 0x7C0069CF23A0>, 'img_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=250x250 at 0x7C0069CF27F0>}\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "ds = load_dataset(\"logasja/lfw\", \"pairs\", split=\"test\")\n",
    "print(ds.column_names)  # 현재 컬럼 이름 확인\n",
    "print(ds[0])            # 첫 번째 데이터 샘플 확인\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf133128",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mfn",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}