Source code for shapenet.scripts.prepare_datasets

import kaggle
import os
import zipfile
from shapedata.io import pts_exporter
import shutil
import pandas as pd
from multiprocessing import Pool
from functools import partial
from sklearn.model_selection import train_test_split
from shapedata import SingleShapeDataProcessing
import numpy as np


def _make_pca(data_dir, out_file, normalize_rot=False, rotation_idxs=()):
    """
    Creates a PCA from data in a given directory

    Parameters
    ----------
    data_dir : str
        directory containing the image and landmark files
    out_file : str
        file the pca will be saved to
    normalize_rot : bool, optional
        whether or not to normalize the data's rotation
    rotation_idxs : tuple, optional
        indices for rotation normalization, must be specified if
        ``normalize_rot=True``

    """
    data_dir = os.path.abspath(data_dir)
    out_file = os.path.abspath(out_file)

    data = SingleShapeDataProcessing.from_dir(data_dir)

    if normalize_rot:
        for idx in range(len(data)):
            data[idx] = data[idx].normalize_rotation(rotation_idxs[0],
                                                     rotation_idxs[1])

    pca = data.lmk_pca(True, True)

    # save the PCA in the format implied by the file extension
    if out_file.endswith(".npz"):
        np.savez(out_file, shapes=pca)
    elif out_file.endswith(".npy"):
        np.save(out_file, pca)
    elif out_file.endswith(".txt"):
        np.savetxt(out_file, pca)
    else:
        np.savez(out_file + ".npz", shapes=pca)
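

# A minimal usage sketch for ``_make_pca`` (the paths below are hypothetical):
#
#   >>> _make_pca("/data/Cats/train", "/data/Cats/train_pca.npz",
#   ...           normalize_rot=True, rotation_idxs=(0, 1))
#
# The saved array can then be reloaded via
# ``np.load("/data/Cats/train_pca.npz")["shapes"]``.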


def _process_single_cat_file(file, target_dir):
    """
    Processes a single file of the cat dataset

    Parameters
    ----------
    file : str
        the file to process
    target_dir : str
        the target directory

    """
    file = os.path.abspath(file)
    target_dir = os.path.abspath(target_dir)

    pd_frame = pd.read_csv(str(file) + ".cat", sep=' ', header=None)
    # drop the leading point count and the trailing empty field, then
    # reshape to (n_points, 2); ``DataFrame.as_matrix`` was removed in
    # pandas 1.0, so ``to_numpy`` is used instead
    landmarks = (pd_frame.to_numpy()[0][1:-1]).reshape((-1, 2))

    # switch xy
    landmarks[:, [0, 1]] = landmarks[:, [1, 0]]

    target_file = os.path.join(
        target_dir,
        os.path.split(os.path.split(file)[0])[-1] + "_"
        + os.path.split(file)[-1])

    # export landmarks
    pts_exporter(landmarks, str(target_file.rsplit(".", 1)[0]) + ".pts")

    # move image file
    shutil.move(file, target_file)
    os.remove(file + ".cat")
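

# Each image in the cat dataset ships with a ``<image>.cat`` sidecar file
# holding the point count followed by x/y pairs on a single line. A usage
# sketch (hypothetical paths):
#
#   >>> _process_single_cat_file(
#   ...     "/data/Cats/tmp_data/CAT_00/00000001_000.jpg",
#   ...     "/data/Cats/train")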


def _prepare_cats(out_dir, remove_zip=False, normalize_pca_rot=False,
                  **split_options):
    """
    Prepares the cat dataset (with multiprocessing)

    Parameters
    ----------
    out_dir : str
        the output directory
    remove_zip : bool, optional
        whether or not to remove the ZIP file after finishing the preparation
    normalize_pca_rot : bool, optional
        whether or not to normalize the data's rotation during PCA

    See Also
    --------
    `Cat Dataset <https://www.kaggle.com/crawford/cat-dataset>`_

    """
    out_dir = os.path.abspath(out_dir)
    data_path = os.path.join(out_dir, "Cats")
    os.makedirs(data_path, exist_ok=True)

    if not os.path.isfile(os.path.join(data_path, "cats.zip")):
        print("\tDownloading Data")
        kaggle.api.dataset_download_cli("crawford/cat-dataset",
                                        path=data_path, unzip=True)

    if not (os.path.isdir(os.path.join(data_path, "train"))
            and os.path.isdir(os.path.join(data_path, "test"))):

        if not os.path.isdir(os.path.join(data_path, "tmp_data")):
            print("\tExtracting Data")
            with zipfile.ZipFile(os.path.join(data_path,
                                              "cats.zip")) as zip_ref:
                zip_ref.extractall(os.path.join(data_path, "tmp_data"))

        # get all jpeg files
        sub_dirs = [os.path.join(data_path, "tmp_data", x)
                    for x in os.listdir(os.path.join(data_path, "tmp_data"))
                    if os.path.isdir(os.path.join(data_path, "tmp_data", x))]

        img_files = []
        for _dir in sub_dirs:
            img_files += [os.path.join(_dir, x) for x in os.listdir(_dir)
                          if x.endswith(".jpg")]

        train_files, test_files = train_test_split(img_files, **split_options)

        print("\tPreprocessing Data")
        os.makedirs(os.path.join(data_path, "train"), exist_ok=True)
        with Pool() as p:
            p.map(partial(_process_single_cat_file,
                          target_dir=os.path.join(data_path, "train")),
                  train_files)

        os.makedirs(os.path.join(data_path, "test"), exist_ok=True)
        with Pool() as p:
            p.map(partial(_process_single_cat_file,
                          target_dir=os.path.join(data_path, "test")),
                  test_files)

        shutil.rmtree(os.path.join(data_path, "tmp_data"))

    print("\tMake PCA")
    # landmarks 0 and 1 are the cat's eyes, which anchor the
    # rotation normalization
    _make_pca(os.path.join(data_path, "train"),
              os.path.join(data_path, "train_pca.npz"),
              normalize_rot=normalize_pca_rot, rotation_idxs=(0, 1))

    if remove_zip:
        os.remove(os.path.join(data_path, "cats.zip"))
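

# A usage sketch for ``_prepare_cats`` (hypothetical target directory; the
# keyword arguments after ``normalize_pca_rot`` are forwarded to
# ``sklearn.model_selection.train_test_split``):
#
#   >>> _prepare_cats("/data", remove_zip=False, normalize_pca_rot=True,
#   ...               test_size=0.25, shuffle=True, random_state=42)
#
# Note that the download step requires configured Kaggle API credentials
# (typically ``~/.kaggle/kaggle.json``).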


def _prepare_ibug_dset(zip_file, dset_name, out_dir, remove_zip=False,
                       normalize_pca_rot=True):
    """
    Prepares an ibug dataset (from a given zipfile)

    Parameters
    ----------
    zip_file : str
        the zip archive containing the data
    dset_name : str
        the dataset's name
    out_dir : str
        the output directory
    remove_zip : bool, optional
        whether or not to remove the ZIP file after finishing the preparation
    normalize_pca_rot : bool, optional
        whether or not to normalize the data's rotation during PCA

    See Also
    --------
    `iBug Datasets
    <https://ibug.doc.ic.ac.uk/resources/facial-point-annotations/>`_

    """
    zip_file = os.path.abspath(zip_file)
    out_dir = os.path.abspath(out_dir)

    data_path = os.path.join(out_dir, dset_name)
    os.makedirs(data_path, exist_ok=True)

    print("\tExtracting Data")
    with zipfile.ZipFile(zip_file) as zip_ref:
        zip_ref.extractall(data_path)

    print("\tPreprocessing Data")
    # indices 37 and 46 are eye landmarks in the 68-point iBug annotation
    # (0-based); they anchor the rotation normalization
    _make_pca(os.path.join(data_path, "trainset"),
              os.path.join(data_path, "train_pca.npz"),
              normalize_rot=normalize_pca_rot, rotation_idxs=(37, 46))

    if remove_zip:
        os.remove(zip_file)
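

# A usage sketch for ``_prepare_ibug_dset`` (hypothetical paths; the archive
# is expected to contain a ``trainset`` directory with image/landmark pairs):
#
#   >>> _prepare_ibug_dset("/downloads/lfpw.zip", "lfpw", "/data",
#   ...                    remove_zip=False, normalize_pca_rot=True)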


def prepare_lfpw_dset():
    """
    Prepares the LFPW Dataset from commandline arguments

    See Also
    --------
    :meth:`_prepare_ibug_dset`
    `iBug Datasets
    <https://ibug.doc.ic.ac.uk/resources/facial-point-annotations/>`_
    `LFPW Dataset <https://neerajkumar.org/databases/lfpw/>`_

    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--zip_file", type=str,
                        help="Zipfile containing the lfpw database")
    parser.add_argument("-d", "--ddir", type=str,
                        help="Target data directory")
    parser.add_argument("--normalize_pca_rot", action="store_true",
                        help="Whether or not to normalize the pca's rotation")
    parser.add_argument("--remove_zip", action="store_true", default=False,
                        help="Zipfiles will be removed after processing data")

    args = parser.parse_args()
    _prepare_ibug_dset(args.zip_file, "lfpw", args.ddir, args.remove_zip,
                       args.normalize_pca_rot)


def prepare_helen_dset():
    """
    Prepares the HELEN Dataset from commandline arguments

    See Also
    --------
    :meth:`_prepare_ibug_dset`
    `iBug Datasets
    <https://ibug.doc.ic.ac.uk/resources/facial-point-annotations/>`_
    `HELEN Dataset <http://www.ifp.illinois.edu/~vuongle2/helen/>`_

    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--zip_file", type=str,
                        help="Zipfile containing the helen database")
    parser.add_argument("-d", "--ddir", type=str,
                        help="Target data directory")
    parser.add_argument("--normalize_pca_rot", action="store_true",
                        help="Whether or not to normalize the pca's rotation")
    parser.add_argument("--remove_zip", action="store_true", default=False,
                        help="Zipfiles will be removed after processing data")

    args = parser.parse_args()
    _prepare_ibug_dset(args.zip_file, "helen", args.ddir, args.remove_zip,
                       args.normalize_pca_rot)
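

# Typical invocations of the two wrappers above, assuming they are exposed as
# console-script entry points (the executable names are assumptions):
#
#   prepare_lfpw_dset --zip_file /downloads/lfpw.zip -d /data \
#       --normalize_pca_rot
#   prepare_helen_dset --zip_file /downloads/helen.zip -d /data \
#       --normalize_pca_rot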


def prepare_cat_dset():
    """
    Prepares the Cat Dataset from commandline arguments

    See Also
    --------
    :meth:`_prepare_cats`
    `Cat Dataset <https://www.kaggle.com/crawford/cat-dataset>`_

    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--ddir", type=str,
                        help="Target data directory")
    parser.add_argument("--normalize_pca_rot", action="store_true",
                        help="Whether or not to normalize the pca's rotation")
    parser.add_argument("--test_size", type=float, default=0.25,
                        help="Test size for "
                             "sklearn.model_selection.train_test_split")
    parser.add_argument("--train_size", type=float, default=None,
                        help="Train size for "
                             "sklearn.model_selection.train_test_split")
    parser.add_argument("--no_shuffle", action="store_true",
                        help="If specified, data will not be shuffled during "
                             "train_test_split")
    parser.add_argument("--random_state", type=int, default=None,
                        help="Random state for "
                             "sklearn.model_selection.train_test_split")
    parser.add_argument("--remove_zip", action="store_true", default=False,
                        help="Zipfiles will be removed after processing data")

    args = parser.parse_args()

    split_options = {
        "test_size": args.test_size,
        "train_size": args.train_size,
        "shuffle": not args.no_shuffle,
        "random_state": args.random_state
    }

    _prepare_cats(args.ddir, args.remove_zip, args.normalize_pca_rot,
                  **split_options)
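

# Typical invocation (entry-point name is an assumption; the download step
# requires configured Kaggle API credentials):
#
#   prepare_cat_dset -d /data --test_size 0.25 --random_state 42 \
#       --normalize_pca_rot --remove_zip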


def prepare_all_data():
    """
    Prepares all Datasets from commandline arguments

    See Also
    --------
    :meth:`_prepare_ibug_dset`
    :meth:`_prepare_cats`

    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--lfpw", action="store_true", default=False,
                        help="If Flag is set, the lfpw database will be "
                             "preprocessed; Must specify '--lzip' argument")
    parser.add_argument("--helen", action="store_true", default=False,
                        help="If Flag is set, the helen database will be "
                             "preprocessed; Must specify '--hzip' argument")
    parser.add_argument("--cats", action="store_true", default=False,
                        help="If Flag is set, the cat database will be "
                             "downloaded and preprocessed")
    parser.add_argument("--lzip", type=str, default=None,
                        help="Zipfile containing the lfpw database")
    parser.add_argument("--hzip", type=str, default=None,
                        help="Zipfile containing the helen database")
    parser.add_argument("-d", "--ddir", type=str,
                        help="Target data directory")
    parser.add_argument("--test_size", type=float, default=0.25,
                        help="Test size for "
                             "sklearn.model_selection.train_test_split")
    parser.add_argument("--train_size", type=float, default=None,
                        help="Train size for "
                             "sklearn.model_selection.train_test_split")
    parser.add_argument("--no_shuffle", action="store_true",
                        help="If specified, data will not be shuffled during "
                             "train_test_split")
    parser.add_argument("--random_state", type=int, default=None,
                        help="Random state for "
                             "sklearn.model_selection.train_test_split")
    parser.add_argument("--remove_zip", action="store_true", default=False,
                        help="Zipfiles will be removed after processing data")
    parser.add_argument("--normalize_pca_rot", action="store_true",
                        help="Whether or not to normalize the pca's rotation")

    args = parser.parse_args()

    data_dir = args.ddir
    split_options = {
        "test_size": args.test_size,
        "train_size": args.train_size,
        "shuffle": not args.no_shuffle,
        "random_state": args.random_state
    }
    remove_zip = args.remove_zip

    if args.cats:
        print("Prepare Cats Dataset")
        _prepare_cats(data_dir, remove_zip=remove_zip,
                      normalize_pca_rot=args.normalize_pca_rot,
                      **split_options)

    if args.lfpw and args.lzip is not None:
        print("Prepare LFPW Dataset")
        _prepare_ibug_dset(args.lzip, "lfpw", data_dir,
                           remove_zip=remove_zip,
                           normalize_pca_rot=args.normalize_pca_rot)

    if args.helen and args.hzip is not None:
        print("Prepare HELEN Dataset")
        _prepare_ibug_dset(args.hzip, "helen", data_dir,
                           remove_zip=remove_zip,
                           normalize_pca_rot=args.normalize_pca_rot)

    print("Preprocessed all datasets!")
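

# Typical all-in-one invocation (entry-point name is an assumption):
#
#   prepare_all_data --cats --lfpw --lzip /downloads/lfpw.zip \
#       --helen --hzip /downloads/helen.zip -d /data --normalize_pca_rot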


if __name__ == '__main__':
    prepare_all_data()