"""
This module contains the multiclass data loaders
"""
import json
import pkgutil
import pandas as pd
from .._io import (read_csv_data,
load_arff_template_multiclass,
prepare_csv_data_template)
# Location of the multiclass summary JSON inside the ``common_datasets``
# package data.
SUMMARY_PATH = 'data/summary_multiclass_classification.json'

# The summary is read once, at import time, straight from the package
# resources and kept both in parsed-JSON form and as a dataframe.
summary = json.loads(
    pkgutil.get_data('common_datasets', SUMMARY_PATH).decode('utf-8')
)
summary_pdf = pd.DataFrame.from_dict(summary)

# Public interface of the module: the loaders, the loader selection
# helpers and the summary dataframe.
__all__ = [
    'load_glass',
    'load_satimage',
    'load_ecoli',
    'load_abalone',
    'load_yeast',
    'load_automobile',
    'load_balance',
    'load_car',
    'load_cleveland',
    'load_contraceptive',
    'load_dermatology',
    'load_fars',
    'load_flare',
    'load_hayes_roth',
    'load_kr_vs_k',
    'load_led7digit',
    'load_movement_libras',
    'load_newthyroid',
    'load_nursery',
    'load_page_blocks',
    'load_post_operative',
    'load_segment',
    'load_splice',
    'load_tae',
    'load_vowel',
    'load_zoo',
    'get_filtered_data_loaders',
    'get_data_loaders',
    'summary_pdf',
    'get_summary_pdf',
]
########
# arff #
########
def load_automobile():
    """
    Load the automobile dataset through the multiclass ARFF template.

    The column 'Symboling' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/automobile/automobile.dat',
        target_label='Symboling',
        name='automobile',
    )
def load_balance():
    """
    Load the balance dataset through the multiclass ARFF template.

    The column 'Balance_scale' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/balance/balance.dat',
        target_label='Balance_scale',
        name='balance',
    )
def load_car():
    """
    Load the car dataset through the multiclass ARFF template.

    The column 'Acceptability' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/car/car.dat',
        target_label='Acceptability',
        name='car',
    )
def load_cleveland():
    """
    Load the cleveland dataset through the multiclass ARFF template.

    The column 'Num' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/cleveland/cleveland.dat',
        target_label='Num',
        name='cleveland',
    )
def load_contraceptive():
    """
    Load the contraceptive dataset through the multiclass ARFF template.

    The column 'Contraceptive_method' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/contraceptive/contraceptive.dat',
        target_label='Contraceptive_method',
        name='contraceptive',
    )
def load_dermatology():
    """
    Load the dermatology dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/dermatology/dermatology.dat',
        target_label='Class',
        name='dermatology',
    )
def load_fars():
    """
    Load the fars dataset through the multiclass ARFF template.

    The column 'INJURY_SEVERITY' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/fars/fars.dat',
        target_label='INJURY_SEVERITY',
        name='fars',
    )
def load_flare():
    """
    Load the flare dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/flare/flare.dat',
        target_label='Class',
        name='flare',
    )
def load_hayes_roth():
    """
    Load the hayes_roth dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/hayes-roth/hayes-roth.dat',
        target_label='Class',
        name='hayes_roth',
    )
def load_kr_vs_k():
    """
    Load the kr_vs_k dataset through the multiclass ARFF template.

    The column 'Game' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/kr-vs-k/kr-vs-k.dat',
        target_label='Game',
        name='kr_vs_k',
    )
def load_led7digit():
    """
    Load the led7digit dataset through the multiclass ARFF template.

    The column 'Number' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/led7digit/led7digit.dat',
        target_label='Number',
        name='led7digit',
    )
def load_movement_libras():
    """
    Load the movement_libras dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/movement_libras/movement_libras.dat',
        target_label='Class',
        name='movement_libras',
    )
def load_newthyroid():
    """
    Load the newthyroid dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/newthyroid/newthyroid.dat',
        target_label='Class',
        name='newthyroid',
    )
def load_nursery():
    """
    Load the nursery dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/nursery/nursery.dat',
        target_label='Class',
        name='nursery',
    )
def load_page_blocks():
    """
    Load the page_blocks dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/page-blocks/page-blocks.dat',
        target_label='Class',
        name='page_blocks',
    )
def load_post_operative():
    """
    Load the post_operative dataset through the multiclass ARFF template.

    The column 'Decision' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/post-operative/post-operative.dat',
        target_label='Decision',
        name='post_operative',
    )
def load_segment():
    """
    Load the segment dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/segment/segment.dat',
        target_label='Class',
        name='segment',
    )
def load_splice():
    """
    Load the splice dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/splice/splice.dat',
        target_label='Class',
        name='splice',
    )
def load_tae():
    """
    Load the tae dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/tae/tae.dat',
        target_label='Class',
        name='tae',
    )
def load_vowel():
    """
    Load the vowel dataset through the multiclass ARFF template.

    The column 'Class' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/vowel/vowel.dat',
        target_label='Class',
        name='vowel',
    )
def load_zoo():
    """
    Load the zoo dataset through the multiclass ARFF template.

    The column 'Type' is passed on as the target label.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    return load_arff_template_multiclass(
        path='data/classification/zoo/zoo.dat',
        target_label='Type',
        name='zoo',
    )
#######
# csv #
#######
def load_glass():
    """
    Load the glass dataset from its packaged CSV file.

    The last column is renamed to 'target' and the first column is
    removed before the frame is handed to the CSV preparation template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    frame = read_csv_data('data/classification/glass/glass.data.txt')

    # relabel the trailing column as the target
    frame.columns = [*frame.columns[:-1], 'target']

    # discard the leading column in place
    leading_column = frame.columns[0]
    del frame[leading_column]

    return prepare_csv_data_template(
        dataset=frame,
        name='glass',
        target_label='target',
        problem_type='multiclass',
    )
def load_satimage():
    """
    Load the SATIMAGE dataset from its packaged CSV files.

    The 'sat.trn.txt' and 'sat.tst.txt' files are read with a single space
    as separator and concatenated (in that order); the last column of the
    result is renamed to 'target'.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    part_paths = ('data/classification/satimage/sat.trn.txt',
                  'data/classification/satimage/sat.tst.txt')
    parts = [read_csv_data(part_path, sep=' ') for part_path in part_paths]

    frame = pd.concat(parts)
    frame.columns = [*frame.columns[:-1], 'target']

    return prepare_csv_data_template(
        dataset=frame,
        name='SATIMAGE',
        target_label='target',
        problem_type='multiclass',
    )
def load_ecoli():
    """
    Load the ecoli dataset from its packaged whitespace-delimited file.

    The last column is renamed to 'target' and the first column is
    removed before the frame is handed to the CSV preparation template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    frame = read_csv_data('data/classification/ecoli/ecoli.data.txt',
                          delim_whitespace=True)

    # relabel the trailing column as the target
    frame.columns = [*frame.columns[:-1], 'target']

    # discard the leading column in place
    leading_column = frame.columns[0]
    del frame[leading_column]

    return prepare_csv_data_template(
        dataset=frame,
        name='ecoli',
        target_label='target',
        problem_type='multiclass',
    )
def load_abalone():
    """
    Load the abalone dataset from its packaged CSV file.

    The last column is renamed to 'target' and the first column is
    removed before the frame is handed to the CSV preparation template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    frame = read_csv_data('data/classification/abalone/abalone.data.txt')

    # relabel the trailing column as the target
    frame.columns = [*frame.columns[:-1], 'target']

    # discard the leading column in place
    leading_column = frame.columns[0]
    del frame[leading_column]

    return prepare_csv_data_template(
        dataset=frame,
        name='abalone',
        target_label='target',
        problem_type='multiclass',
    )
def load_yeast():
    """
    Load the yeast dataset from its packaged whitespace-delimited file.

    The last column is renamed to 'target' and the first column is
    removed before the frame is handed to the CSV preparation template.

    Returns:
        dict: the dataset in sklearn.datasets format
    """
    frame = read_csv_data('data/classification/yeast/yeast.data.txt',
                          delim_whitespace=True)

    # relabel the trailing column as the target
    frame.columns = [*frame.columns[:-1], 'target']

    # discard the leading column in place
    leading_column = frame.columns[0]
    del frame[leading_column]

    return prepare_csv_data_template(
        dataset=frame,
        name='yeast',
        target_label='target',
        problem_type='multiclass',
    )
#############################
def get_filtered_data_loaders(*,
                              n_col_bounds=(1, 5000),
                              n_col_orig_bounds=(1, 5000),
                              n_bounds=(1, 10000),
                              n_smallest=-1,
                              sorting=None,
                              n_from_phenotypes=None):
    """
    Select data loaders by filtering the module-level summary table.

    Every bound is a (lower, upper) pair where the lower end is inclusive
    and the upper end is exclusive. The steps are applied in this order:
    bounds filtering, per-phenotype limiting, sorting, truncation.

    Args:
        n_col_bounds (tuple): the bounds on the 'n_col' summary column
        n_col_orig_bounds (tuple): the bounds on the 'n_col_orig' summary
                                    column
        n_bounds (tuple): the bounds on the 'n' summary column
        n_smallest (int): the number of leading rows to keep after the
                            optional sorting, -1 keeps all of them
        sorting (str|None): the summary column to sort by (for example
                            'n' or 'n_col'), None leaves the order as is
        n_from_phenotypes (int|None): the maximum number of datasets kept
                                        from each 'phenotype' group, None
                                        disables the limit

    Returns:
        list: the list of data loaders
    """

    def within(column, bounds):
        # boolean mask of the rows with bounds[0] <= value < bounds[1]
        values = summary_pdf[column]
        return (values >= bounds[0]) & (values < bounds[1])

    mask = (within('n', n_bounds)
            & within('n_col', n_col_bounds)
            & within('n_col_orig', n_col_orig_bounds))
    selected = summary_pdf[mask]

    if n_from_phenotypes is not None:
        # the first rows of each phenotype group, in the current row order
        selected = selected.groupby('phenotype').head(n_from_phenotypes)

    if sorting is not None:
        selected = selected.sort_values(sorting)

    if n_smallest != -1:
        selected = selected[:n_smallest]

    # the loader names are resolved to the functions defined in this module
    namespace = globals()
    return [namespace[loader_name]
            for loader_name in selected['data_loader'].values]
def get_data_loaders(subset='all',
                     n_smallest=-1,
                     sorting=None,
                     n_from_phenotypes=None):
    """
    Get a named subset of the data loaders.

    The subset name only tightens the exclusive upper bounds handed to
    get_filtered_data_loaders:

        * 'study': fewer than 100 columns and fewer than 4000 records
        * 'small': fewer than 100 columns and fewer than 1000 records
        * 'tiny': fewer than 120 records

    Any other value (including 'all') keeps the default bounds, that is,
    fewer than 5000 columns and fewer than 10000 records.

    Args:
        subset (str): 'all'/'study'/'small'/'tiny'
        n_smallest (int): the number of smallest in the sense of "sorting",
                            -1 keeps all
        sorting (str): the sorting attribute ('n', 'n_col')
        n_from_phenotypes (int): the maximum number of datasets from a
                                    phenotype

    Returns:
        list: the list of data loaders
    """
    # upper limit on the number of columns
    n_col_upper = 100 if subset in ('study', 'small') else 5000

    # upper limit on the number of records, first matching label wins
    record_limits = (('study', 4000), ('small', 1000), ('tiny', 120))
    n_upper = next((limit for label, limit in record_limits
                    if subset == label),
                   10000)

    return get_filtered_data_loaders(n_col_bounds=(1, n_col_upper),
                                     n_col_orig_bounds=(1, 5000),
                                     n_bounds=(1, n_upper),
                                     n_smallest=n_smallest,
                                     sorting=sorting,
                                     n_from_phenotypes=n_from_phenotypes)
def get_summary_pdf():
    """
    Returns the summary pandas dataframe with loader function references

    The result is a copy of the module-level summary dataframe extended
    with a 'data_loader_function' column, which holds the function of this
    module named in the 'data_loader' column of each row.

    Returns:
        pd.DataFrame: the summary pandas dataframe

    Raises:
        KeyError: if a 'data_loader' entry does not name an object of this
                    module
    """
    # Work on a copy: assigning the new column to the shared module-level
    # dataframe would permanently alter the exported ``summary_pdf``.
    descriptors = summary_pdf.copy()

    namespace = globals()
    descriptors['data_loader_function'] = (
        descriptors['data_loader'].apply(lambda name: namespace[name])
    )

    return descriptors