Source code for common_datasets.multiclass_classification._multiclass_classification

"""
This module contains the multiclass data loaders
"""

import json
import pkgutil

import pandas as pd

from .._io import (read_csv_data,
                    load_arff_template_multiclass,
                    prepare_csv_data_template)

# Location of the multiclass summary JSON, relative to the package root.
SUMMARY_PATH = 'data/summary_multiclass_classification.json'

# The summary is read once at import time from the data shipped inside the
# 'common_datasets' package, then exposed both as the parsed JSON object
# and as a pandas dataframe.
_raw_summary = pkgutil.get_data('common_datasets', SUMMARY_PATH)
summary = json.loads(_raw_summary.decode('utf-8'))
summary_pdf = pd.DataFrame.from_dict(summary)

# Names exported by ``from ... import *``: every dataset loader, the
# loader-selection helpers and the summary dataframe accessors.
__all__ = [
    'load_glass',
    'load_satimage',
    'load_ecoli',
    'load_abalone',
    'load_yeast',
    'load_automobile',
    'load_balance',
    'load_car',
    'load_cleveland',
    'load_contraceptive',
    'load_dermatology',
    'load_fars',
    'load_flare',
    'load_hayes_roth',
    'load_kr_vs_k',
    'load_led7digit',
    'load_movement_libras',
    'load_newthyroid',
    'load_nursery',
    'load_page_blocks',
    'load_post_operative',
    'load_segment',
    'load_splice',
    'load_tae',
    'load_vowel',
    'load_zoo',
    'get_filtered_data_loaders',
    'get_data_loaders',
    'summary_pdf',
    'get_summary_pdf',
]

########
# arff #
########

def load_automobile():
    """
    Load the automobile dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/automobile/automobile.dat',
        target_label='Symboling',
        name='automobile',
    )
def load_balance():
    """
    Load the balance dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/balance/balance.dat',
        target_label='Balance_scale',
        name='balance',
    )
def load_car():
    """
    Load the car dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/car/car.dat',
        target_label='Acceptability',
        name='car',
    )
def load_cleveland():
    """
    Load the cleveland dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/cleveland/cleveland.dat',
        target_label='Num',
        name='cleveland',
    )
def load_contraceptive():
    """
    Load the contraceptive dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/contraceptive/contraceptive.dat',
        target_label='Contraceptive_method',
        name='contraceptive',
    )
def load_dermatology():
    """
    Load the dermatology dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/dermatology/dermatology.dat',
        target_label='Class',
        name='dermatology',
    )
def load_fars():
    """
    Load the fars dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/fars/fars.dat',
        target_label='INJURY_SEVERITY',
        name='fars',
    )
def load_flare():
    """
    Load the flare dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/flare/flare.dat',
        target_label='Class',
        name='flare',
    )
def load_hayes_roth():
    """
    Load the hayes_roth dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/hayes-roth/hayes-roth.dat',
        target_label='Class',
        name='hayes_roth',
    )
def load_kr_vs_k():
    """
    Load the kr_vs_k dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/kr-vs-k/kr-vs-k.dat',
        target_label='Game',
        name='kr_vs_k',
    )
def load_led7digit():
    """
    Load the led7digit dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/led7digit/led7digit.dat',
        target_label='Number',
        name='led7digit',
    )
def load_movement_libras():
    """
    Load the movement_libras dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/movement_libras/movement_libras.dat',
        target_label='Class',
        name='movement_libras',
    )
def load_newthyroid():
    """
    Load the newthyroid dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/newthyroid/newthyroid.dat',
        target_label='Class',
        name='newthyroid',
    )
def load_nursery():
    """
    Load the nursery dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/nursery/nursery.dat',
        target_label='Class',
        name='nursery',
    )
def load_page_blocks():
    """
    Load the page_blocks dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/page-blocks/page-blocks.dat',
        target_label='Class',
        name='page_blocks',
    )
def load_post_operative():
    """
    Load the post_operative dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/post-operative/post-operative.dat',
        target_label='Decision',
        name='post_operative',
    )
def load_segment():
    """
    Load the segment dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/segment/segment.dat',
        target_label='Class',
        name='segment',
    )
def load_splice():
    """
    Load the splice dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/splice/splice.dat',
        target_label='Class',
        name='splice',
    )
def load_tae():
    """
    Load the tae dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/tae/tae.dat',
        target_label='Class',
        name='tae',
    )
def load_vowel():
    """
    Load the vowel dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/vowel/vowel.dat',
        target_label='Class',
        name='vowel',
    )
def load_zoo():
    """
    Load the zoo dataset through the multiclass ARFF template

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/zoo/zoo.dat',
        target_label='Type',
        name='zoo',
    )
#######
# csv #
#######
def load_glass():
    """
    Load the glass dataset from its packaged CSV file

    The last column is relabelled as 'target' and the first column is
    removed before the data is passed to the CSV template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    frame = read_csv_data('data/classification/glass/glass.data.txt')

    # Only the final label changes; all other labels are kept as read.
    kept_labels = list(frame.columns[:-1])
    frame.columns = kept_labels + ['target']

    # Remove the leading column in place.
    leading_label = frame.columns[0]
    del frame[leading_label]

    return prepare_csv_data_template(
        dataset=frame,
        name='glass',
        target_label='target',
        problem_type='multiclass',
    )
def load_satimage():
    """
    Load the SATIMAGE dataset from its two packaged space-separated files

    Both files are read, stacked with ``pd.concat`` and the last column is
    relabelled as 'target' before the data is passed to the CSV template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    source_files = ('data/classification/satimage/sat.trn.txt',
                    'data/classification/satimage/sat.tst.txt')

    # Read the parts in the listed order so the stacked row order is fixed.
    parts = [read_csv_data(source, sep=' ') for source in source_files]
    combined = pd.concat(parts)

    kept_labels = list(combined.columns[:-1])
    combined.columns = kept_labels + ['target']

    return prepare_csv_data_template(
        dataset=combined,
        name='SATIMAGE',
        target_label='target',
        problem_type='multiclass',
    )
def load_ecoli():
    """
    Load the ecoli dataset from its packaged whitespace-delimited file

    The last column is relabelled as 'target' and the first column is
    removed before the data is passed to the CSV template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    # NOTE(review): if read_csv_data forwards keyword arguments to
    # pandas.read_csv, 'delim_whitespace' is deprecated in recent pandas
    # releases -- confirm against the helper before upgrading pandas.
    frame = read_csv_data('data/classification/ecoli/ecoli.data.txt',
                          delim_whitespace=True)

    # Only the final label changes; all other labels are kept as read.
    kept_labels = list(frame.columns[:-1])
    frame.columns = kept_labels + ['target']

    # Remove the leading column in place.
    leading_label = frame.columns[0]
    del frame[leading_label]

    return prepare_csv_data_template(
        dataset=frame,
        name='ecoli',
        target_label='target',
        problem_type='multiclass',
    )
def load_abalone():
    """
    Load the abalone dataset from its packaged CSV file

    The last column is relabelled as 'target' and the first column is
    removed before the data is passed to the CSV template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    frame = read_csv_data('data/classification/abalone/abalone.data.txt')

    # Only the final label changes; all other labels are kept as read.
    kept_labels = list(frame.columns[:-1])
    frame.columns = kept_labels + ['target']

    # Remove the leading column in place.
    leading_label = frame.columns[0]
    del frame[leading_label]

    return prepare_csv_data_template(
        dataset=frame,
        name='abalone',
        target_label='target',
        problem_type='multiclass',
    )
def load_yeast():
    """
    Load the yeast dataset from its packaged whitespace-delimited file

    The last column is relabelled as 'target' and the first column is
    removed before the data is passed to the CSV template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    # NOTE(review): if read_csv_data forwards keyword arguments to
    # pandas.read_csv, 'delim_whitespace' is deprecated in recent pandas
    # releases -- confirm against the helper before upgrading pandas.
    frame = read_csv_data('data/classification/yeast/yeast.data.txt',
                          delim_whitespace=True)

    # Only the final label changes; all other labels are kept as read.
    kept_labels = list(frame.columns[:-1])
    frame.columns = kept_labels + ['target']

    # Remove the leading column in place.
    leading_label = frame.columns[0]
    del frame[leading_label]

    return prepare_csv_data_template(
        dataset=frame,
        name='yeast',
        target_label='target',
        problem_type='multiclass',
    )
#############################

def get_filtered_data_loaders(*,
                              n_col_bounds=(1, 5000),
                              n_col_orig_bounds=(1, 5000),
                              n_bounds=(1, 10000),
                              n_smallest=-1,
                              sorting=None,
                              n_from_phenotypes=None):
    """
    Get filtered data loaders.

    All bounds are inclusive at the lower end and exclusive at the upper end.

    Args:
        n_col_bounds (tuple): the lower and upper bounds on the number of
                                columns ('n_col' in the summary)
        n_col_orig_bounds (tuple): the lower and upper bounds on the number
                                    of original columns ('n_col_orig')
        n_bounds (tuple): the lower and upper bounds on the number of
                            records ('n')
        n_smallest (int): the number of leading rows to keep after the
                            optional sorting, -1 keeps every row
        sorting (str): the summary column to sort by (e.g. 'n', 'n_col'),
                        None keeps the summary order
        n_from_phenotypes (int): the maximum number of datasets from a
                                    phenotype, None applies no limit

    Returns:
        list: the list of data loaders
    """
    descriptors = summary_pdf

    within_bounds = (
        (descriptors['n'] >= n_bounds[0])
        & (descriptors['n'] < n_bounds[1])
        & (descriptors['n_col'] >= n_col_bounds[0])
        & (descriptors['n_col'] < n_col_bounds[1])
        & (descriptors['n_col_orig'] >= n_col_orig_bounds[0])
        & (descriptors['n_col_orig'] < n_col_orig_bounds[1])
    )
    data_loaders = descriptors[within_bounds]

    # The per-phenotype cap is applied before sorting, so it keeps the
    # first rows of each phenotype in summary order.
    if n_from_phenotypes is not None:
        data_loaders = data_loaders.groupby('phenotype').head(n_from_phenotypes)

    if sorting is not None:
        data_loaders = data_loaders.sort_values(sorting)

    if n_smallest != -1:
        data_loaders = data_loaders.iloc[:n_smallest]

    # The summary stores loader names; resolve them to the functions
    # defined in this module.
    module_namespace = globals()
    return [module_namespace[loader_name]
            for loader_name in data_loaders['data_loader'].values]


def get_data_loaders(subset='all',
                     n_smallest=-1,
                     sorting=None,
                     n_from_phenotypes=None):
    """
    Get a subset of data loaders

    Args:
        subset (str): 'all'/'study'/'small'/'tiny', any other value
                        applies the same default bounds as 'all'
        n_smallest (int): the number of smallest in the sense of "sorting"
        sorting (str): the sorting attribute ('n', 'n_col')
        n_from_phenotypes (int): the maximum number of datasets from a
                                    phenotype

    Returns:
        list: the list of data loaders
    """
    # Default bounds, used unchanged for 'all'.
    n_col_bounds = (1, 5000)
    n_col_orig_bounds = (1, 5000)
    n_bounds = (1, 10000)

    # The named subsets only tighten the upper bounds.
    if subset == 'study':
        n_col_bounds = (n_col_bounds[0], 100)
        n_bounds = (n_bounds[0], 4000)
    elif subset == 'small':
        n_col_bounds = (n_col_bounds[0], 100)
        n_bounds = (n_bounds[0], 1000)
    elif subset == 'tiny':
        n_bounds = (n_bounds[0], 120)

    return get_filtered_data_loaders(n_col_bounds=n_col_bounds,
                                     n_col_orig_bounds=n_col_orig_bounds,
                                     n_bounds=n_bounds,
                                     n_smallest=n_smallest,
                                     sorting=sorting,
                                     n_from_phenotypes=n_from_phenotypes)


def get_summary_pdf():
    """
    Returns the summary pandas dataframe with loader function references

    The result is a copy of the module-level summary extended with a
    'data_loader_function' column, so the shared ``summary_pdf`` dataframe
    is not modified by this call.

    Returns:
        pd.DataFrame: the summary pandas dataframe
    """
    # Work on a copy: assigning the new column to the module-level frame
    # would change it for every other user of the module.
    descriptors = summary_pdf.copy()

    module_namespace = globals()
    descriptors['data_loader_function'] = descriptors['data_loader'].apply(
        lambda loader_name: module_namespace[loader_name]
    )
    return descriptors