{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7a1c0e52",
   "metadata": {},
   "source": [
    "# Export LFW image pairs to local JPEG files\n",
    "\n",
    "Loads the `pairs` configuration of the `logasja/lfw` dataset from the Hugging Face Hub and writes both images of every pair to disk as JPEGs, one sub-folder per split.\n",
    "\n",
    "- Images are written below `LOCAL_DATA_DIR` (the `LFW_IMAGE_DIR` environment variable, or `~/lfw_images` when it is unset).\n",
    "- The export reads the `img_0` and `img_1` columns; the `pair` column is not written to disk."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b9d41f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "from datasets import load_dataset\n",
    "from PIL import Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e2f8a64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hugging Face dataset repository and the configuration that holds the image pairs.\n",
    "DATASET_NAME = \"logasja/lfw\"\n",
    "PAIRS_CONFIG = \"pairs\"\n",
    "\n",
    "# Image columns of the \"pairs\" configuration, in the order they are written (_1, _2).\n",
    "IMAGE_COLUMNS = (\"img_0\", \"img_1\")\n",
    "\n",
    "# Top-level output folder; set LFW_IMAGE_DIR to export somewhere else.\n",
    "LOCAL_DATA_DIR = os.environ.get(\"LFW_IMAGE_DIR\", os.path.expanduser(\"~/lfw_images\"))\n",
    "TRAIN_DIR = os.path.join(LOCAL_DATA_DIR, \"train\")\n",
    "TEST_DIR = os.path.join(LOCAL_DATA_DIR, \"test\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c4a7d13",
   "metadata": {},
   "source": [
    "## Load the pairs configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c786740",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(DATASET_NAME, PAIRS_CONFIG)\n",
    "\n",
    "train_data = dataset[\"train\"]\n",
    "test_data = dataset[\"test\"]\n",
    "\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5830c3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail early if a split lacks the image columns the export below reads.\n",
    "for split_name, split_data in dataset.items():\n",
    "    missing = set(IMAGE_COLUMNS) - set(split_data.column_names)\n",
    "    assert not missing, f\"{split_name} split is missing columns: {sorted(missing)}\"\n",
    "\n",
    "print(test_data.column_names)  # current column names\n",
    "print(test_data[0])  # first sample"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d90b36ae",
   "metadata": {},
   "source": [
    "## Save the images of each split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14ee413a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_rgb(image):\n",
    "    \"\"\"Return `image` as an RGB PIL image.\n",
    "\n",
    "    A PIL image is converted directly; any other value is first passed\n",
    "    through `numpy.asarray` and `Image.fromarray`.\n",
    "    \"\"\"\n",
    "    if not isinstance(image, Image.Image):\n",
    "        image = Image.fromarray(np.asarray(image))\n",
    "    return image.convert(\"RGB\")\n",
    "\n",
    "\n",
    "def save_pair_images(split_data, out_dir, prefix):\n",
    "    \"\"\"Write both images of every pair in `split_data` to `out_dir` as JPEGs.\n",
    "\n",
    "    Files are named `{prefix}_{i}_1.jpg` and `{prefix}_{i}_2.jpg`, where `i` is\n",
    "    the zero-based position of the pair in `split_data` (not zero-padded).\n",
    "    `out_dir` is created if it does not exist.\n",
    "\n",
    "    Returns the number of image files written.\n",
    "    \"\"\"\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "    print(f\"Saving {prefix} images...\")\n",
    "    saved = 0\n",
    "    for i, item in enumerate(split_data):\n",
    "        # slot is 1 for img_0 and 2 for img_1, giving the _1 / _2 file suffixes.\n",
    "        for slot, column in enumerate(IMAGE_COLUMNS, start=1):\n",
    "            to_rgb(item[column]).save(os.path.join(out_dir, f\"{prefix}_{i}_{slot}.jpg\"))\n",
    "            saved += 1\n",
    "    print(f\"{prefix.capitalize()} images saved: {saved}\")\n",
    "    return saved"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2061b7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_saved = save_pair_images(train_data, TRAIN_DIR, \"train\")\n",
    "test_saved = save_pair_images(test_data, TEST_DIR, \"test\")\n",
    "\n",
    "# Two files are expected per pair.\n",
    "assert train_saved == 2 * len(train_data)\n",
    "assert test_saved == 2 * len(test_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41be9f08",
   "metadata": {},
   "source": [
    "## Default configuration\n",
    "\n",
    "The `default` configuration is loaded under its own name so that it does not replace the pairs data used above. Nothing else in this notebook reads it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf343f72",
   "metadata": {},
   "outputs": [],
   "source": [
    "lfw_default = load_dataset(DATASET_NAME, \"default\")\n",
    "\n",
    "lfw_default"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mfn",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}