diff --git a/docs/source/conf.py b/docs/source/conf.py
index d3f13a6cf5d11f25a49d1d9862abcbb88d8a01a9..a1f6a297698440c561cfaf48397235bf977e23bd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -24,6 +24,9 @@ import sys
 sys.path.insert(0, os.path.abspath('.'))
 sys.path.insert(0, os.path.abspath('../../multiview_platform'))
 sys.path.insert(0, os.path.abspath('../..'))
+file_loc = os.path.split(__file__)[0]
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(file_loc), '.')))
+import multiview_platform
 # -- General configuration ------------------------------------------------
 # If your documentation needs a minimal Sphinx version, state it here.
diff --git a/multiview_platform/datasets/__init__.py b/multiview_platform/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cfb3abd3fb6871b4c6d41cbce518949ed192e19
--- /dev/null
+++ b/multiview_platform/datasets/__init__.py
@@ -0,0 +1,2 @@
+from multiview_platform.datasets.base import *
+from multiview_platform.datasets.data_sample import DataSample, MultiView_array
diff --git a/multiview_platform/datasets/base.py b/multiview_platform/datasets/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..338f3e042e57c7abd8fe4785c2a66b6d29a5acd8
--- /dev/null
+++ b/multiview_platform/datasets/base.py
@@ -0,0 +1,204 @@
+from __future__ import print_function
+
+import numpy as np
+import numpy.ma as ma
+from six.moves import cPickle as pickle  # for performance
+
+from multiview_platform.datasets.data_sample import DataSample
+
+
+def save_dict(di_, filename_):
+    """Pickle the dictionary di_ to the file filename_."""
+    with open(filename_, 'wb') as f:
+        pickle.dump(di_, f)
+
+
+def load_dict(filename_):
+    """Load and return a dictionary pickled with save_dict."""
+    with open(filename_, 'rb') as f:
+        return pickle.load(f)
+
+
+def load_data(address, output='array', use_pickle=False):
+    """Load every view file listed in address and wrap them in a DataSample.
+
+    :param address: list of paths, one file per view
+    :param str output: 'array' to collect the views in an object ndarray,
+        anything else to collect them in a dict
+    :param boolean use_pickle: if True, each loaded view is also dumped
+        to a .sample.pkl file next to its source
+    :returns: a DataSample wrapping all the views
+    """
+    if output.startswith('array'):
+        views = np.empty(len(address), dtype=object)
+    else:
+        views = {}
+    nb_samples, nb_features = _determine_dimensions(address)
+    for i, addr in enumerate(address):
+        views[i] = _load_view_sample(addr, nb_samples, nb_features[i],
+                                     use_pickle=use_pickle)
+    return DataSample(data=views)
+
+
+def _determine_dimensions(address):
+    """Scan every view file and return the largest sample count together
+    with the per-view maximum number of features found on one line."""
+    nb_features = []
+    nb_sample_max = -1
+    for adr in address:
+        try:
+            with open(adr, "r") as f:
+                f.readline()  # skip the header line (nbEx nbL)
+                nb_samples = 0
+                max_features = 0
+                for line in f:
+                    tokens = line.split()
+                    if not tokens:
+                        continue
+                    nb_samples += 1
+                    max_features = max(max_features, len(tokens))
+                nb_features.append(max_features)
+                nb_sample_max = max(nb_sample_max, nb_samples)
+        except IOError:
+            raise IOError("file %s can't be opened" % adr)
+    return nb_sample_max, nb_features
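+
+# The helpers above and below assume a plain-text view file with a header
+# line "nbEx nbL" followed by one whitespace-separated sample per line.
+# This layout is inferred from the parsing code, not a documented format;
+# the values below are only an illustration:
+#
+#     5000 4
+#     3 0 3 1
+#     3 3
+#     3 2 0
+#
+# Shorter lines are padded with NaN and masked by _load_view_sample below.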
+
+def _load_view_sample(adr, nb_samples, nb_features, use_pickle=False):
+    """Load one view from the file adr and return it as a masked array.
+
+    - Input:
+
+    :param str adr: path of the view file to read
+    :param int nb_samples: number of rows to allocate, as computed by
+        _determine_dimensions
+    :param int nb_features: number of columns to allocate for this view
+    :param boolean use_pickle: (default value = False) if True, the loaded
+        view is also dumped next to adr as a .sample.pkl file
+
+    - Output:
+
+    :returns: the view data, padded with NaN and masked where no value
+        was read
+    :rtype: numpy.ma.MaskedArray
+    """
+    with open(adr, "r") as f:
+        line = f.readline()
+        l = line.split()
+        nbEx = int(l[0])
+        nbL = int(l[1])
+        line = f.readline()
+        data1 = np.zeros((nb_samples, nb_features), dtype=float)
+        data1 += np.nan
+        i = 0
+        while line:
+            l = line.split()
+            w = [float(x) for x in l[0:]]
+            data1[i, :len(w)] = w
+            line = f.readline()
+            i += 1
+            if i > nbEx:
+                raise IndexError("found more samples than the header "
+                                 "announces (nbEx = %d)" % nbEx)
+    masint = np.isnan(data1)
+    madata1 = ma.MaskedArray(data1, masint)
+
+    if use_pickle:
+        _create_pickle_files(adr=adr, dsample=madata1)
+    return madata1
+
+
+def _create_pickle_files(adr, dsample):
+    """Dump the loaded view next to its source file as adr.sample.pkl."""
+    with open(adr + ".sample.pkl", "wb") as f:
+        pickle.dump(dsample, f)
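+
+# A minimal usage sketch (the file names below are hypothetical, and the
+# files must follow the header-plus-samples layout assumed above):
+#
+#     from multiview_platform.datasets.base import load_data
+#     data = load_data(["view0.train", "view1.train"])
+#     print(data.data.shapes_int)  # per-view feature counts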
diff --git a/multiview_platform/datasets/data_sample.py b/multiview_platform/datasets/data_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..833528dae22b84ebe45a551d4f4adb64fc4af3e2
--- /dev/null
+++ b/multiview_platform/datasets/data_sample.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+
+"""This module contains the DataSample class and the MultiView_array class.
+The DataSample class encapsulates the components of a sample, and
+MultiView_array inherits from numpy.ma.MaskedArray and holds the views
+stacked into one 2d array of the shape
+
+==== ==== ==== ==== ====
+x    x    x    x    -1
+x    x    x    x    x
+x    x    -1   -1   -1
+x    -1   -1   -1   -1
+-1   -1   -1   -1   -1
+==== ==== ==== ==== ====
+
+where -1 indicates an empty (masked) cell.
+"""
+import numpy as np
+import numpy.ma as ma
+
+
+class MultiView_array(ma.MaskedArray):
+    """A masked array holding several views side by side, together with
+    the number of columns of each view (shapes_int), the number of views
+    (shape_ext) and, when built from a dict, the original keys.
+
+    :Example:
+
+    >>> from multiview_platform.datasets.base import load_data
+    >>> from multiview_platform.datasets.get_dataset_path import get_dataset_path
+    >>> train_file = ''  # e.g. '4.spice.train'
+    >>> data = load_data([get_dataset_path(train_file)])
+    >>> print(data.__class__)
+    >>> data.data
+
+    """
+    def __new__(cls, data):
+        shapes_int = []
+        index = 0
+        new_data = data
+        shape_ext = len(data)
+        thekeys = None
+        if isinstance(data, dict):
+            shape_ext = len(data)
+            for key, dat_values in data.items():
+                new_data = cls._populate_new_data(index, dat_values, new_data)
+                # each view contributes shape[1] columns to the stacked array
+                shapes_int.append(dat_values.shape[1])
+                index += 1
+            thekeys = list(data.keys())
+
+        if isinstance(data, np.ndarray):
+            shape_ext = data.shape[0]
+            for dat_values in data:
+                shapes_int.append(dat_values.shape[1])
+                new_data = cls._populate_new_data(index, dat_values, new_data)
+                index += 1
+        obj = ma.masked_array(new_data.data, new_data.mask).view(cls)
+        obj.shapes_int = shapes_int
+        obj.shape_ext = shape_ext
+        obj.keys = thekeys
+        return obj
+
+    @staticmethod
+    def _populate_new_data(index, dat_values, new_data):
+        if index == 0:
+            if isinstance(dat_values, ma.MaskedArray):
+                new_data = dat_values
+            else:
+                new_data = dat_values.view(ma.MaskedArray)
+                new_data.mask = ma.nomask
+        else:
+            if isinstance(dat_values, ma.MaskedArray):
+                new_data = ma.hstack((new_data, dat_values))
+            else:
+                new_data = ma.hstack((new_data,
+                                      dat_values.view(ma.MaskedArray)))
+        return new_data
+
+    def __array_finalize__(self, obj):
+        if obj is None:
+            return
+        super(MultiView_array, self).__array_finalize__(obj)
+        self.shapes_int = getattr(obj, 'shapes_int', None)
+        self.shape_ext = getattr(obj, 'shape_ext', None)
+        self.keys = getattr(obj, 'keys', None)
+
+    def getCol(self, view, col):
+        """Return column col of the given view."""
+        start = int(np.sum(np.asarray(self.shapes_int[0: view])))
+        return self.data[:, start + col]
+
+    def getView(self, view):
+        """Return every column belonging to the given view."""
+        start = int(np.sum(np.asarray(self.shapes_int[0: view])))
+        stop = start + self.shapes_int[view]
+        return self.data[:, start:stop]
+
+    def getRow(self, view, row):
+        """Return one sample (row) restricted to the given view."""
+        start = int(np.sum(np.asarray(self.shapes_int[0: view])))
+        stop = int(np.sum(np.asarray(self.shapes_int[0: view + 1])))
+        return self.data[row, start:stop]
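+
+# A minimal construction sketch (synthetic arrays rather than a real
+# dataset): two views over the same 3 samples, with 4 and 2 features.
+#
+#     import numpy as np
+#     views = {0: np.ones((3, 4)), 1: np.zeros((3, 2))}
+#     mva = MultiView_array(views)
+#     mva.shapes_int        # [4, 2]
+#     mva.getView(1).shape  # (3, 2)
+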
+class DataSample(dict):
+    """ A DataSample instance
+
+    :Example:
+
+    >>> from multiview_platform.datasets.base import load_data
+    >>> from multiview_platform.datasets.get_dataset_path import get_dataset_path
+    >>> train_file = ''  # e.g. '4.spice.train'
+    >>> data = load_data([get_dataset_path(train_file)])
+    >>> print(data.__class__)
+    >>> data.data
+
+    - Input:
+
+    :param data: the views to wrap, in any form accepted by
+        MultiView_array (a dict of 2d arrays or an object array of views)
+
+    """
+
+    def __init__(self, data=None, **kwargs):
+
+        # The dictionary that contains the sample
+        super(DataSample, self).__init__(kwargs)
+        self._data = None
+        if data is not None:
+            self._data = MultiView_array(data)
+
+    @property
+    def data(self):
+        """MultiView_array holding the stacked views"""
+        return self._data
+
+    @data.setter
+    def data(self, data):
+        if isinstance(data, (MultiView_array, np.ndarray, ma.MaskedArray,
+                             np.generic)):
+            self._data = data
+        else:
+            raise TypeError("data should be a MultiView_array or a numpy array.")
diff --git a/multiview_platform/datasets/get_dataset_path.py b/multiview_platform/datasets/get_dataset_path.py
new file mode 100644
index 0000000000000000000000000000000000000000..f533aa84910b5ee415db222072d88b66620754ba
--- /dev/null
+++ b/multiview_platform/datasets/get_dataset_path.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+
+from __future__ import print_function, division
+
+import os
+
+
+def get_dataset_path(filename):
+    """Return the absolute path of a reference dataset for tests
+
+    - Input parameter:
+
+    :param str filename: File name of the file containing reference data
+        for tests (which must be in ``multiview_platform/datasets/``)
+
+    - Output parameters:
+
+    :returns: The absolute path where the file with name **filename** is stored
+    :rtype: str
+
+    """
+    datasets_path = os.path.dirname(os.path.abspath(__file__))
+    return os.path.join(datasets_path, filename)
diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py
index 0547e5c299e4e757bf87a2259ed634bd16a0388e..b8deb248f31d907887c72e5b6c1eb2ca531cec5b 100644
--- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py
+++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py
@@ -14,6 +14,10 @@ __status__ = "Prototype"  # Production, Development, Prototype
 
 
 class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
+    """
+    This class implements a classifier based on the AdaBoost algorithm,
+    wrapping scikit-learn's AdaBoostClassifier.
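+
+    A minimal usage sketch (synthetic data; it relies on the fit interface
+    inherited from scikit-learn's AdaBoostClassifier):
+
+    >>> import numpy as np
+    >>> X = np.array([[0., 0.], [1., 1.], [0., 1.], [1., 0.]])
+    >>> y = np.array([0, 1, 0, 1])
+    >>> clf = Adaboost(random_state=42, n_estimators=10)
+    >>> clf = clf.fit(X, y)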
+ + """ def __init__(self, random_state=None, n_estimators=50, base_estimator=None, **kwargs): diff --git a/multiview_platform/mono_multi_view_classifiers/utils/make_file_config.py b/multiview_platform/mono_multi_view_classifiers/utils/make_file_config.py index bb7c78a8fcc61be472304ebfa709f9afc888fdda..121e1f869c321aabd6a287632c7401c67210257f 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/make_file_config.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/make_file_config.py @@ -1,23 +1,36 @@ import os, sys, inspect -from multiview_platform.mono_multi_view_classifiers.monoview_classifiers.adaboost import Adaboost +# from multiview_platform.mono_multi_view_classifiers.monoview_classifiers.adaboost import Adaboost import importlib -classifier_dict = {"0": ['mono', Adaboost, +classifier_dict = {"0": ['mono', 'Adaboost', 'multiview_platform.mono_multi_view_classifiers.monoview_classifiers.adaboost']} val = classifier_dict["0"] mymodule = importlib.import_module(val[2]) + +for name in dir(mymodule): + att = getattr(mymodule, name) + try: + getattr(att, "__module__") + if att.__module__.startswith(mymodule.__name__): + if inspect.isclass(att): + print(att) + print(name) + except Exception: + pass + + parameter = {"0":[]} -parameter instring = "multiview_platform/mono_multi_view_classifiers/monoview_classifiers/" if instring in mymodule.__file__: - sig = inspect.signature(val[1].__init__) + monInstance = getattr(mymodule, 'Adaboost') + sig = inspect.signature(monInstance.__init__) for arg_idx, name in enumerate(sig.parameters): param= sig.parameters[name] if not name.startswith('self'): - parameter{"0"}.append(name) + parameter["0"].append(name) if param.default is not inspect.Parameter.empty: value_default = param.default @@ -26,21 +39,39 @@ if instring in mymodule.__file__: print() -dir(mymodule) -if val[1] in dir(mymodule): +class ConfigurationMaker(): + """ + Find the name of the classifier from the dict classier to report + + + + """ + _path_classifier_mono = 'multiview_platform/mono_multi_view_classifier/monoview_classifiers' + _path_classifier_multi = 'multiview_platform/mono_multi_view_classifier/multiview_classifier' + + def __init__(self, classifier_dict=None): + if classifier_dict is None: + classifier_dict = {"0": ['mono', 'Adaboost', + 'multiview_platform.mono_multi_view_classifiers.monoview_classifiers.adaboost']} + names = [] + for key, val in classifier_dict.items(): + mymodule = importlib.import_module(val[2]) + names.append(self._get_module_name(mymodule)) + + + def _get_module_name(self, mymodule): + for name in dir(mymodule): + att = getattr(mymodule, name) + try: + getattr(att, "__module__") + if att.__module__.startswith(mymodule.__name__): + if inspect.isclass(att): + if att == val[1]: + return name + except Exception: + return None + return None -# class ConfigurationMaker(): -# """ -# -# """ -# _path_classifier_mono = 'multiview_platform/mono_multi_view_classifier/monoview_classifiers' -# _path_classifier_multi = 'multiview_platform/mono_multi_view_classifier/multiview_classifier' -# -# def __init__(self ): -# classifier_dict = {"0": ['mono', Adaboost, -# 'multiview_platform.mono_multi_view_classifiers.monoview_classifiers.']} -# -# for key, val in classifier_dict.items(): # mymodule = importlib.import_module(val[2]) # module_file = mymodule.__file__ # getattr(self._path_classifier_mono, module_file[:-3]) diff --git a/multiview_platform/mono_multi_view_classifiers/utils/parameters.py 
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/parameters.py b/multiview_platform/mono_multi_view_classifiers/utils/parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b61691f20124cb20fd7872aa8c44f5757397f02
--- /dev/null
+++ b/multiview_platform/mono_multi_view_classifiers/utils/parameters.py
@@ -0,0 +1,145 @@
+import numpy as np
+
+
+class Parameter_pdata(object):
+    class __Parameter_pdata:
+        nbr_i = 0
+        # Data renormalisation options: the separation is only defined up
+        # to a permutation and a scaling factor, so the data can be
+        # normalised at the start of the algorithm, at each iteration
+        # and/or at the end; either A or S is normalised.
+        _data_norm = {'FlagInit': True, 'FlagIter': False, 'FlagEnd': False}
+        # Normalise along the columns (dim=1, unit column norm) or along
+        # the rows (dim=2, unit row norm).
+        _Norm = {'p': 1, 'dim': 1, 'x': 'A'}
+        _list_mode = ['real', 'simul']
+        _list_x = ['A', 'S']
+
+        def __init__(self):
+            self._Norm['p'] = 1
+            self._Norm['dim'] = 1
+            self._Norm['x'] = self._list_x[0]
+            self.mode = self._list_mode[1]
+            self.sigma = 20000
+            self.dim = 1
+            if self.nbr_i > 0:
+                raise ValueError("only one instance of Parameter_pdata is allowed")
+            self.nbr_i += 1
+
+        def __str__(self):
+            return repr(self)
+
+    instance = None
+
+    def __new__(cls):  # __new__ is always a class method
+        if not Parameter_pdata.instance:
+            Parameter_pdata.instance = Parameter_pdata.__Parameter_pdata()
+        return Parameter_pdata.instance
+
+    def __getattr__(self, attr):
+        return getattr(self.instance, attr)
+
+    def __setattr__(self, name, value):
+        return setattr(self.instance, name, value)
+
+
+class Parameter_palgo(object):
+    class __Parameter_palgo:
+
+        nbr_i = 0
+        _list_algo = ['BCVMFB', 'PALS', 'STALS', 'LSfro', 'LSkl']
+        _stop = {'DifA': False, 'DifS': False,
+                 'ObjFct': True, 'threshold': np.finfo(float).eps}
+        _pfwt = {'w': 'db6', 'family_pfwt': 'db',
+                 'level': 10, 'K': 4,
+                 'Ls': 3000, 'L1': 3000, 'L2': 3000}
+        # 'LS' for Lee and Seung, 'Lips' for the Lipschitz constant,
+        # 'PALM' for no preconditioning
+        _list_precond = ['LS', 'Lips', 'PALM']
+
+        def __init__(self):
+            self.flagWave = False
+            self.val = None
+            algo_value = self._list_algo[1]
+            self._algo = algo_value
+            self.gamma = 0.99
+            self.inf = np.inf
+            self.eps = np.finfo(float).eps
+            self.niter = 1000
+            self.eta_inf = 'eps'
+            self.eta_sup = 'inf'
+            self.alpha_A = 0.0
+            self.p_A = 1
+            self.p_S = 1
+            self.alpha_S = 0.0
+            self.alpha_S_eval = False
+            self.stopThreshold = 10e-5
+            self.precond = 'LS'  # 'LS' for Lee and Seung
+            self.F = None
+            self.Fstar = None
+            self.verbose = False
+
+            if self.nbr_i > 0:
+                raise ValueError("only one instance of Parameter_palgo is allowed")
+            self.nbr_i += 1
+
+        def __str__(self):
+            return repr(self) + repr(self.val)
+
+        @property
+        def algo(self):
+            return self._algo
+
+        @algo.setter
+        def algo(self, algo_value):
+            if algo_value not in self._list_algo:
+                raise NameError("parameter algo must be in %s" % self._list_algo)
+            self._algo = algo_value
+
+    instance = None
+
+    def __new__(cls):  # __new__ is always a class method
+        if not Parameter_palgo.instance:
+            Parameter_palgo.instance = Parameter_palgo.__Parameter_palgo()
+        return Parameter_palgo.instance
+
+    def __getattr__(self, attr):
+        return getattr(self.instance, attr)
+
+    def __setattr__(self, name, value):
+        return setattr(self.instance, name, value)
+
+
+if __name__ == '__main__':
+    a = Parameter_pdata()
+    a = Parameter_pdata()
+    b = Parameter_pdata()
+    b.val = 6
+    b.x = 8
+    a.x = 10
+    param = Parameter_palgo()
+    algo = param._list_algo[3]
+    param.algo = algo
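+    # Singleton behaviour in action: both names share the one underlying
+    # instance, so an attribute set through one is visible through the
+    # other. These checks extend the original demo as a hedged sketch:
+    assert a is b
+    assert a.val == 6   # set via b above
+    assert b.x == 10    # a.x = 10 overwrote b.x = 8 on the shared instance
+    print(param.algo)   # 'LSfro'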