You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
180 lines
6.6 KiB
180 lines
6.6 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "2c786740",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/cuuva/anaconda3/envs/mfn/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
" from .autonotebook import tqdm as notebook_tqdm\n",
|
|
"Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 56341.73 examples/s]\n",
|
|
"Generating test split: 100%|██████████| 2200/2200 [00:00<00:00, 96950.62 examples/s]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from datasets import load_dataset\n",
|
|
"\n",
|
|
"ds = load_dataset(\"logasja/lfw\", \"pairs\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "cf343f72",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Generating train split: 100%|██████████| 13233/13233 [00:00<00:00, 29211.85 examples/s]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from datasets import load_dataset\n",
|
|
"\n",
|
|
"ds = load_dataset(\"logasja/lfw\", \"default\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "14ee413a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Saving train images...\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "KeyError",
|
|
"evalue": "'image1'",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[5], line 29\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSaving train images...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(train_data):\n\u001b[0;32m---> 29\u001b[0m img1 \u001b[38;5;241m=\u001b[39m Image\u001b[38;5;241m.\u001b[39mfromarray(\u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mimage1\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m)\u001b[38;5;241m.\u001b[39mconvert(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRGB\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 30\u001b[0m img2 \u001b[38;5;241m=\u001b[39m Image\u001b[38;5;241m.\u001b[39mfromarray(item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mimage2\u001b[39m\u001b[38;5;124m'\u001b[39m])\u001b[38;5;241m.\u001b[39mconvert(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRGB\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 31\u001b[0m label \u001b[38;5;241m=\u001b[39m item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlabel\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;66;03m# 0: same, 1: different\u001b[39;00m\n",
|
|
"\u001b[0;31mKeyError\u001b[0m: 'image1'"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from datasets import load_dataset\n",
|
|
"import os\n",
|
|
"from PIL import Image\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# ----------------------------\n",
|
|
"# 경로 설정\n",
|
|
"# ----------------------------\n",
|
|
"LOCAL_DATA_DIR = \"/home/cuuva/lfw_images\" # 저장할 최상위 폴더\n",
|
|
"TRAIN_DIR = os.path.join(LOCAL_DATA_DIR, \"train\")\n",
|
|
"TEST_DIR = os.path.join(LOCAL_DATA_DIR, \"test\")\n",
|
|
"\n",
|
|
"os.makedirs(TRAIN_DIR, exist_ok=True)\n",
|
|
"os.makedirs(TEST_DIR, exist_ok=True)\n",
|
|
"\n",
|
|
"# ----------------------------\n",
|
|
"# Hugging Face LFW 불러오기\n",
|
|
"# ----------------------------\n",
|
|
"dataset = load_dataset(\"logasja/lfw\", \"pairs\")\n",
|
|
"\n",
|
|
"train_data = dataset[\"train\"]\n",
|
|
"test_data = dataset[\"test\"]\n",
|
|
"\n",
|
|
"# ----------------------------\n",
|
|
"# train 데이터 저장\n",
|
|
"# ----------------------------\n",
|
|
"print(\"Saving train images...\")\n",
|
|
"for i, item in enumerate(train_data):\n",
|
|
" img1 = Image.fromarray(item['image1']).convert(\"RGB\")\n",
|
|
" img2 = Image.fromarray(item['image2']).convert(\"RGB\")\n",
|
|
" label = item['label'] # 0: same, 1: different\n",
|
|
"\n",
|
|
" # 파일 이름 예: train_00001_1.jpg, train_00001_2.jpg\n",
|
|
" img1.save(os.path.join(TRAIN_DIR, f\"train_{i}_1.jpg\"))\n",
|
|
" img2.save(os.path.join(TRAIN_DIR, f\"train_{i}_2.jpg\"))\n",
|
|
"\n",
|
|
"print(f\"Train images saved: {len(train_data)*2}\")\n",
|
|
"\n",
|
|
"# ----------------------------\n",
|
|
"# test 데이터 저장\n",
|
|
"# ----------------------------\n",
|
|
"print(\"Saving test images...\")\n",
|
|
"for i, item in enumerate(test_data):\n",
|
|
" img1 = Image.fromarray(item['image1']).convert(\"RGB\")\n",
|
|
" img2 = Image.fromarray(item['image2']).convert(\"RGB\")\n",
|
|
" label = item['label']\n",
|
|
"\n",
|
|
" # 파일 이름 예: test_00001_1.jpg, test_00001_2.jpg\n",
|
|
" img1.save(os.path.join(TEST_DIR, f\"test_{i}_1.jpg\"))\n",
|
|
" img2.save(os.path.join(TEST_DIR, f\"test_{i}_2.jpg\"))\n",
|
|
"\n",
|
|
"print(f\"Test images saved: {len(test_data)*2}\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "b5830c3f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['pair', 'img_0', 'img_1']\n",
|
|
"{'pair': 1, 'img_0': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=250x250 at 0x7C0069CF23A0>, 'img_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=250x250 at 0x7C0069CF27F0>}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from datasets import load_dataset\n",
|
|
"\n",
|
|
"ds = load_dataset(\"logasja/lfw\", \"pairs\", split=\"test\")\n",
|
|
"print(ds.column_names) # 현재 컬럼 이름 확인\n",
|
|
"print(ds[0]) # 첫 번째 데이터 샘플 확인\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7e53a5b1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "mfn",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|