You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

180 lines
6.6 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "2c786740",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cuuva/anaconda3/envs/mfn/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 56341.73 examples/s]\n",
"Generating test split: 100%|██████████| 2200/2200 [00:00<00:00, 96950.62 examples/s]\n"
]
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Load the \"pairs\" configuration of logasja/lfw; the recorded output shows\n",
"# a train split and a test split being generated.\n",
"ds = load_dataset(path=\"logasja/lfw\", name=\"pairs\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cf343f72",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Generating train split: 100%|██████████| 13233/13233 [00:00<00:00, 29211.85 examples/s]\n"
]
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Rebind `ds` to the \"default\" configuration of logasja/lfw; the recorded\n",
"# output shows a single train split being generated.\n",
"ds = load_dataset(path=\"logasja/lfw\", name=\"default\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14ee413a",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"import os\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# ----------------------------\n",
"# Path configuration\n",
"# ----------------------------\n",
"# Top-level output folder. Set the LFW_IMAGE_DIR environment variable to\n",
"# override it; the default is the location this notebook originally used.\n",
"LOCAL_DATA_DIR = os.environ.get(\"LFW_IMAGE_DIR\", \"/home/cuuva/lfw_images\")\n",
"TRAIN_DIR = os.path.join(LOCAL_DATA_DIR, \"train\")\n",
"TEST_DIR = os.path.join(LOCAL_DATA_DIR, \"test\")\n",
"\n",
"os.makedirs(TRAIN_DIR, exist_ok=True)\n",
"os.makedirs(TEST_DIR, exist_ok=True)\n",
"\n",
"# ----------------------------\n",
"# Load the LFW \"pairs\" configuration from Hugging Face\n",
"# ----------------------------\n",
"dataset = load_dataset(\"logasja/lfw\", \"pairs\")\n",
"\n",
"train_data = dataset[\"train\"]\n",
"test_data = dataset[\"test\"]\n",
"\n",
"# The test split of this configuration reports the columns\n",
"# ['pair', 'img_0', 'img_1']; the train split is assumed to share that schema\n",
"# (save_pair_images verifies it). The previous keys 'image1' / 'image2' /\n",
"# 'label' do not exist and raised KeyError: 'image1'.\n",
"IMAGE_COLUMNS = (\"img_0\", \"img_1\")\n",
"\n",
"\n",
"def save_pair_images(split_data, out_dir, prefix):\n",
"    \"\"\"Save both images of every pair in ``split_data`` as JPEG files.\n",
"\n",
"    Files are written to ``out_dir`` as ``{prefix}_{i}_1.jpg`` and\n",
"    ``{prefix}_{i}_2.jpg``, where ``i`` is the 0-based row index (not\n",
"    zero-padded). The 'pair' column is not written anywhere.\n",
"\n",
"    Returns the number of image files written.\n",
"    Raises KeyError if an expected image column is missing from the split.\n",
"    \"\"\"\n",
"    missing = [col for col in IMAGE_COLUMNS if col not in split_data.column_names]\n",
"    if missing:\n",
"        raise KeyError(f\"missing image columns {missing}; available: {split_data.column_names}\")\n",
"\n",
"    saved = 0\n",
"    for i, item in enumerate(split_data):\n",
"        for slot, column in enumerate(IMAGE_COLUMNS, start=1):\n",
"            image = item[column]\n",
"            # Samples come back as PIL images already; only array-like values\n",
"            # need to be wrapped before they can be converted and saved.\n",
"            if not isinstance(image, Image.Image):\n",
"                image = Image.fromarray(np.asarray(image))\n",
"            image.convert(\"RGB\").save(os.path.join(out_dir, f\"{prefix}_{i}_{slot}.jpg\"))\n",
"            saved += 1\n",
"    return saved\n",
"\n",
"\n",
"# ----------------------------\n",
"# Save the train split\n",
"# ----------------------------\n",
"print(\"Saving train images...\")\n",
"train_saved = save_pair_images(train_data, TRAIN_DIR, \"train\")\n",
"print(f\"Train images saved: {train_saved}\")\n",
"\n",
"# ----------------------------\n",
"# Save the test split\n",
"# ----------------------------\n",
"print(\"Saving test images...\")\n",
"test_saved = save_pair_images(test_data, TEST_DIR, \"test\")\n",
"print(f\"Test images saved: {test_saved}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b5830c3f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['pair', 'img_0', 'img_1']\n",
"{'pair': 1, 'img_0': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=250x250 at 0x7C0069CF23A0>, 'img_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=250x250 at 0x7C0069CF27F0>}\n"
]
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Load only the test split of the \"pairs\" configuration.\n",
"ds = load_dataset(path=\"logasja/lfw\", name=\"pairs\", split=\"test\")\n",
"\n",
"# Inspect the schema: column names first, then the first sample.\n",
"print(ds.column_names)\n",
"print(ds[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e53a5b1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mfn",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}