From 8a13f3f86757795b74ce075aec1163671f66ad69 Mon Sep 17 00:00:00 2001 From: bbauvin <baptiste.bauvin@centrale-marseille.fr> Date: Wed, 18 Oct 2017 17:32:53 -0400 Subject: [PATCH] Updated SCM version, now it seems to work but tweaked a bit on features --- Code/MonoMultiViewClassifiers/ExecClassif.py | 4 +- .../MonoviewClassifiers/SCM.py | 480 +++--------------- .../Multiview/Fusion/Methods/LateFusion.py | 23 +- .../LateFusionPackage/MajorityVoting.py | 3 + .../Methods/LateFusionPackage/SCMForLinear.py | 344 ++----------- .../Methods/LateFusionPackage/SVMForLinear.py | 11 +- .../LateFusionPackage/WeightedLinear.py | 3 + 7 files changed, 169 insertions(+), 699 deletions(-) diff --git a/Code/MonoMultiViewClassifiers/ExecClassif.py b/Code/MonoMultiViewClassifiers/ExecClassif.py index 86b6c6d6..caea0c6e 100644 --- a/Code/MonoMultiViewClassifiers/ExecClassif.py +++ b/Code/MonoMultiViewClassifiers/ExecClassif.py @@ -59,7 +59,7 @@ def initBenchmark(args): for multiviewPackageName in allMultiviewPackages: if multiviewPackageName in algosMutliview: multiviewPackage = getattr(Multiview, multiviewPackageName) - multiviewModule = getattr(multiviewPackage, multiviewPackageName) + multiviewModule = getattr(multiviewPackage, multiviewPackageName+"Module") benchmark = multiviewModule.getBenchmark(benchmark, args=args) if "Monoview" in args.CL_type: if args.CL_algos_monoview == ['']: @@ -272,7 +272,7 @@ def classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directory, # __ EXECUTION __ # # _______________ # def execClassif(arguments): - import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() testVersions() start = time.time() args = execution.parseTheArgs(arguments) diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py index 567b2eb3..9909564b 100644 --- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py +++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py @@ -1,8 +1,18 @@ -from pyscm.utils import _pack_binary_bytes_to_ints -import pyscm +# from pyscm.utils import _pack_binary_bytes_to_ints +# import pyscm import h5py # from pyscm.binary_attributes.base import BaseBinaryAttributeList import os +import itertools +# import pyscm.deprecated as pyscm +import numpy as np + +from pyscm.scm import SetCoveringMachineClassifier as scm +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.model_selection import RandomizedSearchCV +from sklearn.externals.six import iteritems, iterkeys, itervalues +from scipy.stats import uniform, randint # from ..Multiview import GetMultiviewDb as DB # from ..utils.Dataset import getShape @@ -14,435 +24,107 @@ __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype +class DecisionStumpSCMNew(BaseEstimator, ClassifierMixin): + """docstring for SCM + A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like + CV, gridsearch, and so on ...""" + + def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): + super(DecisionStumpSCMNew, self).__init__() + self.model_type = model_type + self.p = p + self.max_rules = max_rules + self.random_state = random_state + + def fit(self, X, y): + self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) + self.clf.fit(X=X, y=y) + + def predict(self, X): + return self.clf.predict(X) + + def set_params(self, **params): + 
for key, value in iteritems(params): + if key == 'p': + self.p = value + if key == 'model_type': + self.model_type = value + if key == 'max_rules': + self.max_rules = value + + def get_stats(self): + return {"Binary_attributes": self.clf.model_.rules} + + def canProbas(): return False def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs): - max_attrtibutes = kwargs['0'] - try: - p = kwargs['1'] - except: - p = 1.0 - try: - model_type = kwargs['2'] - except: - model_type = "conjunction" - try: - attributeClassification = kwargs["attributeClassification"] - binaryAttributes = kwargs["binaryAttributes"] - except: - attributeClassification, binaryAttributes, dsetFile, name = transformData(DATASET) - classifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=max_attrtibutes, model_type=model_type, verbose=False) - classifier.fit(binaryAttributes, CLASS_LABELS, X=None, attribute_classifications=attributeClassification, - iteration_callback=None) - try: - dsetFile.close() - os.remove(name) - except: - pass + modelType = kwargs['0'] + maxRules = int(kwargs['1']) + p = float(kwargs["2"]) + classifier = DecisionStumpSCMNew(model_type=modelType, max_rules=maxRules, p=p, random_state=randomState) + classifier.fit(DATASET, CLASS_LABELS) return classifier def paramsToSet(nIter, randomState): paramsSet = [] for _ in range(nIter): - paramsSet.append([randomState.randint(1, 20), randomState.random_sample(), - randomState.choice(["conjunction", "disjunction"])]) + paramsSet.append([randomState.choice(["conjunction", "disjunction"]), randomState.randint(1, 15), randomState.random_sample()]) return paramsSet def getKWARGS(kwargsList): kwargsDict = {} for (kwargName, kwargValue) in kwargsList: - if kwargName == "CL_SCM_max_rules": - kwargsDict['0'] = int(kwargValue) - elif kwargName == "CL_SCM_p": + if kwargName == "CL_SCM_model_type": + kwargsDict['0'] = kwargValue + elif kwargName == "CL_SCM_max_rules": kwargsDict['1'] = int(kwargValue) - elif kwargName == "CL_SCM_model_type": - kwargsDict['2'] = kwargValue + elif kwargName == "CL_SCM_p": + kwargsDict['2'] = float(kwargValue) return kwargsDict -def randomizedSearch(X_train, y_train, randomState, outputFileName, KFolds=None, metric=["accuracy_score", None], - nIter=30, nbCores=1): +def randomizedSearch(X_train, y_train, randomState, outputFileName, KFolds=4, metric=["accuracy_score", None], nIter=30, + nbCores=1): + pipeline = Pipeline([('classifier', DecisionStumpSCMNew())]) + + param = {"classifier__model_type": ['conjunction', 'disjunction'], + "classifier__p": uniform(), + "classifier__max_rules": randint(1,30)} metricModule = getattr(Metrics, metric[0]) if metric[1] is not None: metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) else: metricKWARGS = {} - if metricModule.getConfig()[-14] == "h": - baseScore = -1000.0 - isBetter = "higher" - else: - baseScore = 1000.0 - isBetter = "lower" - config = [] - maxAttributesArray = [] - pArray = [] - modelsArray = [] - for iterIndex in range(nIter): - max_attributes = randomState.randint(1, 20) - maxAttributesArray.append(max_attributes) - p = randomState.random_sample() - pArray.append(p) - model = randomState.choice(["conjunction", "disjunction"]) - modelsArray.append(model) - classifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=max_attributes, model_type=model, verbose=False) - scores = [] - kFolds = KFolds.split(X_train, y_train) - for foldIdx, (trainIndices, testIndices) in enumerate(kFolds): - attributeClassification, binaryAttributes, 
dsetFile, name = transformData(X_train[trainIndices]) - try: - classifier.fit(binaryAttributes, y_train[trainIndices], X=None, - attribute_classifications=attributeClassification, iteration_callback=None) - - predictedLabels = classifier.predict(X_train[testIndices]) - score = metricModule.score(y_train[testIndices], predictedLabels) - scores.append(score) - except: - pass - dsetFile.close() - os.remove(name) - if scores == []: - score = baseScore - else: - score = np.mean(np.array(scores)) - - if isBetter == "higher" and score > baseScore: - baseScore = score - config = [max_attributes, p, model] - if isBetter == "lower" and score < baseScore: - baseScore = score - config = [max_attributes, p, model] - - assert config != [], "No good configuration found for SCM" - scoresArray = scores - params = [("maxAttributes", np.array(maxAttributesArray)), - ("p", np.array(pArray)), - ("model", np.array(modelsArray))] + scorer = metricModule.get_scorer(**metricKWARGS) + grid = RandomizedSearchCV(pipeline, n_iter=nIter, param_distributions=param, refit=True, n_jobs=nbCores, + scoring=scorer, cv=KFolds, random_state=randomState) + detector = grid.fit(X_train, y_train) + desc_estimators = [detector.best_params_["classifier__model_type"], + detector.best_params_["classifier__max_rules"], + detector.best_params_["classifier__p"]] + + scoresArray = detector.cv_results_['mean_test_score'] + params = [("model_type", np.array(detector.cv_results_['param_classifier__model_type'])), + ("maxRules", np.array(detector.cv_results_['param_classifier__max_rules'])), + ("p", np.array(detector.cv_results_['param_classifier__p']))] genHeatMaps(params, scoresArray, outputFileName) - return config + return desc_estimators def getConfig(config): if type(config) not in [list, dict]: - return "\n\t\t- SCM with max_attributes : " + str( - config.max_attributes) + ", model type : " + config.model_type + ", p : " + str(config.p) + return "\n\t\t- SCM with model_type: " + config.model_type + ", max_rules : " + str(config.max_rules) +\ + ", p : " + str(config.p) else: try: - return "\n\t\t- SCM with max_attributes : " + str(config[0]) + ", p : " + str( - config[1]) + ", model type : " + str(config[2]) + return "\n\t\t- SCM with model_type: " + config[0] + ", max_rules : " + str(config[1]) + ", p : " +\ + str(config[2]) except: - return "\n\t\t- SCM with max_attributes : " + str(config["0"]) + ", p : " + str( - config["1"]) + ", model type : " + str(config["2"]) - - -def transformData(dataArray): - dataArray = dataArray.astype(np.uint8) - if isBinary(dataArray): - nbExamples = dataArray.shape[0] - featureSequence = [str(featureIndex) for featureIndex in range(dataArray.shape[1])] - featureIndexByRule = np.arange(dataArray.shape[1], dtype=np.uint32) - binaryAttributes = LazyBaptisteRuleList(featureSequence, featureIndexByRule) - packedData = _pack_binary_bytes_to_ints(dataArray, 64) - del dataArray - nameb = "temp_scm" - if not os.path.isfile(nameb): - dsetFile = h5py.File(nameb, "w") - name = nameb - else: - fail = True - i = 0 - name = nameb - while fail: - if not os.path.isfile(name): - dsetFile = h5py.File(name, "w") - fail = False - else: - i += 1 - name = nameb + str(i) - - packedDataset = dsetFile.create_dataset("temp_scm", data=packedData) - dsetFile.close() - dsetFile = h5py.File(name, "r") - packedDataset = dsetFile.get("temp_scm") - attributeClassification = BaptisteRuleClassifications(packedDataset, nbExamples) - return attributeClassification, binaryAttributes, dsetFile, name - - -def isBinary(dataset): - if 
type(dataset[0, 0]) is np.uint8: - return True - for line in dataset: - for data in line: - if data != 0 or data != 1: - return False - return True - - -# !/usr/bin/env python -""" - Kover: Learn interpretable computational phenotyping models from k-merized genomic data - Copyright (C) 2015 Alexandre Drouin - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -""" - -import numpy as np - -from math import ceil - -from pyscm.binary_attributes.classifications.popcount import inplace_popcount_32, inplace_popcount_64 -from pyscm.utils import _unpack_binary_bytes_from_ints - - -def _minimum_uint_size(max_value): - """ - Find the minimum size unsigned integer type that can store values of at most max_value - From A.Drouin's Kover - """ - if max_value <= np.iinfo(np.uint8).max: - return np.uint8 - elif max_value <= np.iinfo(np.uint16).max: - return np.uint16 - elif max_value <= np.iinfo(np.uint32).max: - return np.uint32 - elif max_value <= np.iinfo(np.uint64).max: - return np.uint64 - else: - return np.uint128 - - -class BaptisteRule(object): - def __init__(self, feature_index, kmer_sequence, type): - """ - A k-mer rule - Parameters: - ----------- - feature_index: uint - The index of the k-mer - kmer_sequence: string - The nucleotide sequence of the k-mer - type: string - The type of rule: presence or absence (use p or a) - """ - self.feature_index = feature_index - self.kmer_sequence = kmer_sequence - self.type = type - - def classify(self, X): - if self.type == "absence": - return (X[:, self.feature_index] == 0).astype(np.uint8) - else: - return (X[:, self.feature_index] == 1).astype(np.uint8) - - def inverse(self): - return BaptisteRule(feature_index=self.feature_index, kmer_sequence=self.kmer_sequence, - type="absence" if self.type == "presence" else "presence") - - def __str__(self): - return ("Absence(" if self.type == "absence" else "Presence(") + self.kmer_sequence + ")" - - -class LazyBaptisteRuleList(object): - """ - By convention, the first half of the list contains presence rules and the second half contains the absence rules in - the same order. 
- """ - - def __init__(self, kmer_sequences, feature_index_by_rule): - self.n_rules = feature_index_by_rule.shape[0] * 2 - self.kmer_sequences = kmer_sequences - self.feature_index_by_rule = feature_index_by_rule - super(LazyBaptisteRuleList, self).__init__() - - def __getitem__(self, idx): - if idx >= self.n_rules: - raise ValueError("Index %d is out of range for list of size %d" % (idx, self.n_rules)) - if idx >= len(self.kmer_sequences): - type = "absence" - feature_idx = self.feature_index_by_rule[idx % len(self.kmer_sequences)] - else: - type = "presence" - feature_idx = self.feature_index_by_rule[idx] - return BaptisteRule(idx % len(self.kmer_sequences), self.kmer_sequences[feature_idx], type) - - def __len__(self): - return self.n_rules - - -class BaseRuleClassifications(object): - def __init__(self): - pass - - def get_columns(self, columns): - raise NotImplementedError() - - def remove_rows(self, rows): - raise NotImplementedError() - - @property - def shape(self): - raise NotImplementedError() - - def sum_rows(self, rows): - raise NotImplementedError() - - -class BaptisteRuleClassifications(BaseRuleClassifications): - """ - Methods involving columns account for presence and absence rules - """ - - # TODO: Clean up. Get rid of the code to handle deleted rows. We don't need this. - def __init__(self, dataset, n_rows, block_size=None): - self.dataset = dataset - self.dataset_initial_n_rows = n_rows - self.dataset_n_rows = n_rows - self.dataset_removed_rows = [] - self.dataset_removed_rows_mask = np.zeros(self.dataset_initial_n_rows, dtype=np.bool) - self.block_size = (None, None) - - if block_size is None: - if self.dataset.chunks is None: - self.block_size = (1, self.dataset.shape[1]) - else: - self.block_size = self.dataset.chunks - else: - if len(block_size) != 2 or not isinstance(block_size[0], int) or not isinstance(block_size[1], int): - raise ValueError("The block size must be a tuple of 2 integers.") - self.block_size = block_size - - # Get the size of the ints used to store the data - if self.dataset.dtype == np.uint32: - self.dataset_pack_size = 32 - self.inplace_popcount = inplace_popcount_32 - elif self.dataset.dtype == np.uint64: - self.dataset_pack_size = 64 - self.inplace_popcount = inplace_popcount_64 - else: - raise ValueError("Unsupported data type for packed attribute classifications array. The supported data" + - " types are np.uint32 and np.uint64.") - - super(BaseRuleClassifications, self).__init__() - - def get_columns(self, columns): - """ - Columns can be an integer (or any object that implements __index__) or a sorted list/ndarray. - """ - # TODO: Support slicing, make this more efficient than getting the columns individually. 
- columns_is_int = False - if hasattr(columns, "__index__"): # All int types implement the __index__ method (PEP 357) - columns = [columns.__index__()] - columns_is_int = True - elif isinstance(columns, np.ndarray): - columns = columns.tolist() - elif isinstance(columns, list): - pass - else: - columns = list(columns) - # Detect where an inversion is needed (columns corresponding to absence rules) - columns, invert_result = zip(*(((column if column < self.dataset.shape[1] else column % self.dataset.shape[1]), - (True if column > self.dataset.shape[1] else False)) for column in columns)) - columns = list(columns) - invert_result = np.array(invert_result) - - # Don't return rows that have been deleted - row_mask = np.ones(self.dataset.shape[0] * self.dataset_pack_size, dtype=np.bool) - row_mask[self.dataset_initial_n_rows:] = False - row_mask[self.dataset_removed_rows] = False - - # h5py requires that the column indices are sorted - unique, inverse = np.unique(columns, return_inverse=True) - result = _unpack_binary_bytes_from_ints(self.dataset[:, unique.tolist()])[row_mask] - result = result[:, inverse] - result[:, invert_result] = 1 - result[:, invert_result] - - if columns_is_int: - return result.reshape(-1) - else: - return result - - @property - def shape(self): - return self.dataset_n_rows, self.dataset.shape[1] * 2 - - # TODO: allow summing over multiple lists of rows at a time (saves i/o operations) - def sum_rows(self, rows): - """ - Note: Assumes that the rows argument does not contain duplicate elements. Rows will not be considered more than once. - """ - rows = np.asarray(rows) - result_dtype = _minimum_uint_size(rows.shape[0]) - result = np.zeros(self.dataset.shape[1] * 2, dtype=result_dtype) - - # Builds a mask to turn off the bits of the rows we do not want to count in the sum. - def build_row_mask(example_idx, n_examples, mask_n_bits): - if mask_n_bits not in [8, 16, 32, 64, 128]: - raise ValueError("Unsupported mask format. Use 8, 16, 32, 64 or 128 bits.") - - n_masks = int(ceil(float(n_examples) / mask_n_bits)) - masks = [0] * n_masks - - for idx in example_idx: - example_mask = idx / mask_n_bits - example_mask_idx = mask_n_bits - (idx - mask_n_bits * example_mask) - 1 - masks[example_mask] |= 1 << example_mask_idx - - return np.array(masks, dtype="u" + str(mask_n_bits / 8)) - - # Find the rows that occur in each dataset and their relative index - rows = np.sort(rows) - dataset_relative_rows = [] - for row_idx in rows: - # Find which row in the dataset corresponds to the requested row - # TODO: This is inefficient! Could exploit the fact that rows is sorted to reuse previous iterations. - current_idx = -1 - n_active_elements_seen = 0 - while n_active_elements_seen <= row_idx: - current_idx += 1 - if not self.dataset_removed_rows_mask[current_idx]: - n_active_elements_seen += 1 - dataset_relative_rows.append(current_idx) - - # Create a row mask for each dataset - row_mask = build_row_mask(dataset_relative_rows, self.dataset_initial_n_rows, self.dataset_pack_size) - del dataset_relative_rows - - # For each dataset load the rows for which the mask is not 0. 
Support column slicing aswell - n_col_blocks = int(ceil(1.0 * self.dataset.shape[1] / self.block_size[1])) - rows_to_load = np.where(row_mask != 0)[0] - n_row_blocks = int(ceil(1.0 * len(rows_to_load) / self.block_size[0])) - - for row_block in xrange(n_row_blocks): - block_row_mask = row_mask[rows_to_load[row_block * self.block_size[0]:(row_block + 1) * self.block_size[0]]] - - for col_block in xrange(n_col_blocks): - - # Load the appropriate rows/columns based on the block sizes - block = self.dataset[rows_to_load[row_block * self.block_size[0]:(row_block + 1) * self.block_size[0]], - col_block * self.block_size[1]:(col_block + 1) * self.block_size[1]] - - # Popcount - if len(block.shape) == 1: - block = block.reshape(1, -1) - self.inplace_popcount(block, block_row_mask) - - # Increment the sum - result[col_block * self.block_size[1]:min((col_block + 1) * self.block_size[1], - self.dataset.shape[1])] += np.sum(block, axis=0) - - # Compute the sum for absence rules - result[self.dataset.shape[1]:] = len(rows) - result[: self.dataset.shape[1]] - - return result + return "\n\t\t- SCM with model_type: " + config["0"] + ", max_rules : " + str(config["1"]) + ", p : " + \ + str(config["2"]) \ No newline at end of file diff --git a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusion.py b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusion.py index 5e970bd4..34b17cbc 100644 --- a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusion.py +++ b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusion.py @@ -28,10 +28,14 @@ def fitMonoviewClassifier(classifierName, data, labels, classifierConfig, needPr classifier = monoviewClassifier.fit(data, labels, randomState, DTConfig) return classifier else: + if type(classifierConfig) is dict: + pass + else: + classifierConfig = dict((str(configIndex), config) + for configIndex, config in enumerate(classifierConfig)) + classifier = monoviewClassifier.fit(data, labels, randomState, - **dict((str(configIndex), config) for configIndex, config in - enumerate(classifierConfig - ))) + **classifierConfig) return classifier @@ -68,6 +72,9 @@ def intersect(allClassifersNames, directory, viewsIndices, resultsMonoview, clas bestCombination = combination return [classifiersNames[viewIndex][index] for viewIndex, index in enumerate(bestCombination)] +def allMonoviewClassifiers(allClassifersNames, directory, viewsIndices, resultsMonoview, classificationIndices): + return allClassifersNames + def bestScore(allClassifersNames, directory, viewsIndices, resultsMonoview, classificationIndices): nbViews = len(viewsIndices) @@ -133,8 +140,8 @@ class LateFusionClassifier(object): trainIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) self.monoviewClassifiers = Parallel(n_jobs=self.nbCores)( - delayed(fitMonoviewClassifier)(self.monoviewClassifiersNames[index], - getV(DATASET, viewIndex, trainIndices), - DATASET.get("Labels").value[trainIndices], - self.monoviewClassifiersConfigs[index], self.needProbas, self.randomState) - for index, viewIndex in enumerate(viewsIndices)) + delayed(fitMonoviewClassifier)(self.monoviewClassifiersNames[index], + getV(DATASET, viewIndex, trainIndices), + DATASET.get("Labels").value[trainIndices], + self.monoviewClassifiersConfigs[index], self.needProbas, self.randomState) + for index, viewIndex in enumerate(viewsIndices)) diff --git a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py 
b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py index 77986387..966b0ca3 100644 --- a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py +++ b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py @@ -26,6 +26,9 @@ def getArgs(benchmark, args, views, viewsIndices, directory, resultsMonoview, cl viewsIndices, resultsMonoview, classificationIndices) monoviewClassifierModules = [getattr(MonoviewClassifiers, classifierName) for classifierName in args.FU_L_cl_names] + if args.FU_L_cl_names == [""] and args.CL_type == ["Multiview"]: + raise AttributeError("You must perform Monoview classification or specify " + "which monoview classifier to use Late Fusion") if args.FU_L_cl_config != ['']: classifiersConfigs = [ monoviewClassifierModule.getKWARGS([arg.split(":") for arg in classifierConfig.split(",")]) diff --git a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py index ad35ce1e..d4b6980c 100644 --- a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py +++ b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py @@ -1,10 +1,17 @@ import numpy as np import pyscm -from pyscm.utils import _pack_binary_bytes_to_ints +# from pyscm.utils import _pack_binary_bytes_to_ints import os import h5py -from pyscm.binary_attributes.classifications.popcount import inplace_popcount_32, inplace_popcount_64 -from pyscm.utils import _unpack_binary_bytes_from_ints +# from pyscm.binary_attributes.classifications.popcount import inplace_popcount_32, inplace_popcount_64 +# from pyscm.utils import _unpack_binary_bytes_from_ints + +from pyscm.scm import SetCoveringMachineClassifier as scm +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.model_selection import RandomizedSearchCV +from sklearn.externals.six import iteritems, iterkeys, itervalues + from math import ceil import random from sklearn.metrics import accuracy_score @@ -15,6 +22,39 @@ from ..LateFusion import LateFusionClassifier, getClassifiers, getConfig from ..... 
import MonoviewClassifiers from .....utils.Dataset import getV + +class DecisionStumpSCMNew(BaseEstimator, ClassifierMixin): + """docstring for SCM + A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like + CV, gridsearch, and so on ...""" + + def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): + super(DecisionStumpSCMNew, self).__init__() + self.model_type = model_type + self.p = p + self.max_rules = max_rules + self.random_state = random_state + + def fit(self, X, y): + self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) + self.clf.fit(X=X, y=y) + + def predict(self, X): + return self.clf.predict(X) + + def set_params(self, **params): + for key, value in iteritems(params): + if key == 'p': + self.p = value + if key == 'model_type': + self.model_type = value + if key == 'max_rules': + self.max_rules = value + + def get_stats(self): + return {"Binary_attributes": self.clf.model_.rules} + + def genParamsSets(classificationKWARGS, randomState, nIter=1): nbView = classificationKWARGS["nbView"] paramsSets = [] @@ -36,6 +76,9 @@ def getArgs(benchmark, args, views, viewsIndices, directory, resultsMonoview, cl viewsIndices, resultsMonoview, classificationIndices) monoviewClassifierModules = [getattr(MonoviewClassifiers, classifierName) for classifierName in args.FU_L_cl_names] + if args.FU_L_cl_names == [""] and args.CL_type == ["Multiview"]: + raise AttributeError("You must perform Monoview classification or specify " + "which monoview classifier to use Late Fusion") if args.FU_L_cl_config != ['']: classifiersConfigs = [ monoviewClassifierModule.getKWARGS([arg.split(":") for arg in classifierConfig.split(",")]) @@ -80,6 +123,7 @@ class SCMForLinear(LateFusionClassifier): self.p = paramsSet[0] self.maxAttributes = paramsSet[1] self.order = paramsSet[3] + self.order = 2 self.modelType = paramsSet[2] def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None): @@ -118,55 +162,21 @@ class SCMForLinear(LateFusionClassifier): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) nbView = len(viewsIndices) - self.SCMClassifier = pyscm.scm.SetCoveringMachine(p=self.p, max_attributes=self.maxAttributes, - model_type=self.modelType, verbose=False) + self.SCMClassifier = DecisionStumpSCMNew(p=self.p, max_rules=self.maxAttributes, model_type=self.modelType, + random_state=self.randomState) monoViewDecisions = np.zeros((len(usedIndices), nbView), dtype=int) for index, viewIndex in enumerate(viewsIndices): monoViewDecisions[:, index] = self.monoviewClassifiers[index].predict( getV(DATASET, viewIndex, usedIndices)) features = self.generateInteractions(monoViewDecisions) - featureSequence = [str(index) for index in range(nbView)] - for orderIndex in range(self.order - 1): - featureSequence += [str(featureIndex) for featureIndex in - itertools.combinations(range(monoViewDecisions.shape[1]), orderIndex + 2)] - featureIndexByRule = np.arange(features.shape[1], dtype=np.uint32) - binaryAttributes = LazyBaptisteRuleList(featureSequence, featureIndexByRule) - packedData = _pack_binary_bytes_to_ints(features, 64) - nameb = "temp_scm_fusion" - if not os.path.isfile(nameb): - dsetFile = h5py.File(nameb, "w") - name = nameb - else: - fail = True - i = 0 - name = nameb - while fail: - if not os.path.isfile(name): - dsetFile = h5py.File(name, "w") - fail = False - else: - i += 1 - name = nameb + str(i) - - packedDataset = 
dsetFile.create_dataset("temp_scm", data=packedData) - dsetFile.close() - dsetFile = h5py.File(name, "r") - packedDataset = dsetFile.get("temp_scm") - attributeClassification = BaptisteRuleClassifications(packedDataset, features.shape[0]) - self.SCMClassifier.fit(binaryAttributes, DATASET.get("Labels").value[usedIndices], - attribute_classifications=attributeClassification) - try: - dsetFile.close() - os.remove(name) - except: - pass + features = np.array([np.array([feat for feat in feature]) for feature in features]) + self.SCMClassifier.fit(features, DATASET.get("Labels").value[usedIndices].astype(int)) def generateInteractions(self, monoViewDecisions): if type(self.order) == type(None): - order = monoViewDecisions.shape[1] + self.order = monoViewDecisions.shape[1] if self.order == 1: return monoViewDecisions - else: genratedIntercations = [monoViewDecisions[:, i] for i in range(monoViewDecisions.shape[1])] for orderIndex in range(self.order - 1): @@ -181,256 +191,14 @@ class SCMForLinear(LateFusionClassifier): generatedDecision = np.logical_or(generatedDecision, monoViewDecisions[:, combin[index + 1]]) genratedIntercations.append(generatedDecision) - return np.transpose(np.array(genratedIntercations).astype(np.uint8)) + return np.transpose(np.array(genratedIntercations)) def getConfig(self, fusionMethodConfig, monoviewClassifiersNames, monoviewClassifiersConfigs): configString = "with SCM for linear with max_attributes : " + str(self.maxAttributes) + ", p : " + str(self.p) + \ - " model_type : " + str(self.modelType) + " has chosen " + \ - str(len(self.SCMClassifier.attribute_importances)) + " rule(s) \n\t-With monoview classifiers : " + " model_type : " + str(self.modelType) + " order : " + str(self.order)+ " has chosen " + \ + str(0.1) + " rule(s) \n\t-With monoview classifiers : " for monoviewClassifierConfig, monoviewClassifierName in zip(monoviewClassifiersConfigs, monoviewClassifiersNames): monoviewClassifierModule = getattr(MonoviewClassifiers, monoviewClassifierName) configString += monoviewClassifierModule.getConfig(monoviewClassifierConfig) - return configString - - -def _minimum_uint_size(max_value): - """ - Find the minimum size unsigned integer type that can store values of at most max_value - From A.Drouin's Kover - """ - if max_value <= np.iinfo(np.uint8).max: - return np.uint8 - elif max_value <= np.iinfo(np.uint16).max: - return np.uint16 - elif max_value <= np.iinfo(np.uint32).max: - return np.uint32 - elif max_value <= np.iinfo(np.uint64).max: - return np.uint64 - else: - return np.uint128 - - -class BaptisteRule(object): - def __init__(self, feature_index, kmer_sequence, type): - """ - A k-mer rule - Parameters: - ----------- - feature_index: uint - The index of the k-mer - kmer_sequence: string - The nucleotide sequence of the k-mer - type: string - The type of rule: presence or absence (use p or a) - """ - self.feature_index = feature_index - self.kmer_sequence = kmer_sequence - self.type = type - - def classify(self, X): - if self.type == "absence": - return (X[:, self.feature_index] == 0).astype(np.uint8) - else: - return (X[:, self.feature_index] == 1).astype(np.uint8) - - def inverse(self): - return BaptisteRule(feature_index=self.feature_index, kmer_sequence=self.kmer_sequence, - type="absence" if self.type == "presence" else "presence") - - def __str__(self): - return ("Absence(" if self.type == "absence" else "Presence(") + self.kmer_sequence + ")" - - -class LazyBaptisteRuleList(object): - """ - By convention, the first half of the list contains presence 
rules and the second half contains the absence rules in - the same order. - """ - - def __init__(self, kmer_sequences, feature_index_by_rule): - self.n_rules = feature_index_by_rule.shape[0] * 2 - self.kmer_sequences = kmer_sequences - self.feature_index_by_rule = feature_index_by_rule - super(LazyBaptisteRuleList, self).__init__() - - def __getitem__(self, idx): - if idx >= self.n_rules: - raise ValueError("Index %d is out of range for list of size %d" % (idx, self.n_rules)) - if idx >= len(self.kmer_sequences): - type = "absence" - feature_idx = self.feature_index_by_rule[idx % len(self.kmer_sequences)] - else: - type = "presence" - feature_idx = self.feature_index_by_rule[idx] - return BaptisteRule(idx % len(self.kmer_sequences), self.kmer_sequences[feature_idx], type) - - def __len__(self): - return self.n_rules - - -class BaseRuleClassifications(object): - def __init__(self): - pass - - def get_columns(self, columns): - raise NotImplementedError() - - def remove_rows(self, rows): - raise NotImplementedError() - - @property - def shape(self): - raise NotImplementedError() - - def sum_rows(self, rows): - raise NotImplementedError() - - -class BaptisteRuleClassifications(BaseRuleClassifications): - """ - Methods involving columns account for presence and absence rules - """ - - # TODO: Clean up. Get rid of the code to handle deleted rows. We don't need this. - def __init__(self, dataset, n_rows, block_size=None): - self.dataset = dataset - self.dataset_initial_n_rows = n_rows - self.dataset_n_rows = n_rows - self.dataset_removed_rows = [] - self.dataset_removed_rows_mask = np.zeros(self.dataset_initial_n_rows, dtype=np.bool) - self.block_size = (None, None) - - if block_size is None: - if self.dataset.chunks is None: - self.block_size = (1, self.dataset.shape[1]) - else: - self.block_size = self.dataset.chunks - else: - if len(block_size) != 2 or not isinstance(block_size[0], int) or not isinstance(block_size[1], int): - raise ValueError("The block size must be a tuple of 2 integers.") - self.block_size = block_size - - # Get the size of the ints used to store the data - if self.dataset.dtype == np.uint32: - self.dataset_pack_size = 32 - self.inplace_popcount = inplace_popcount_32 - elif self.dataset.dtype == np.uint64: - self.dataset_pack_size = 64 - self.inplace_popcount = inplace_popcount_64 - else: - raise ValueError("Unsupported data type for packed attribute classifications array. The supported data" + - " types are np.uint32 and np.uint64.") - - super(BaseRuleClassifications, self).__init__() - - def get_columns(self, columns): - """ - Columns can be an integer (or any object that implements __index__) or a sorted list/ndarray. - """ - # TODO: Support slicing, make this more efficient than getting the columns individually. 
- columns_is_int = False - if hasattr(columns, "__index__"): # All int types implement the __index__ method (PEP 357) - columns = [columns.__index__()] - columns_is_int = True - elif isinstance(columns, np.ndarray): - columns = columns.tolist() - elif isinstance(columns, list): - pass - else: - columns = list(columns) - # Detect where an inversion is needed (columns corresponding to absence rules) - columns, invert_result = zip(*(((column if column < self.dataset.shape[1] else column % self.dataset.shape[1]), - (True if column > self.dataset.shape[1] else False)) for column in columns)) - columns = list(columns) - invert_result = np.array(invert_result) - - # Don't return rows that have been deleted - row_mask = np.ones(self.dataset.shape[0] * self.dataset_pack_size, dtype=np.bool) - row_mask[self.dataset_initial_n_rows:] = False - row_mask[self.dataset_removed_rows] = False - - # h5py requires that the column indices are sorted - unique, inverse = np.unique(columns, return_inverse=True) - result = _unpack_binary_bytes_from_ints(self.dataset[:, unique.tolist()])[row_mask] - result = result[:, inverse] - result[:, invert_result] = 1 - result[:, invert_result] - - if columns_is_int: - return result.reshape(-1) - else: - return result - - @property - def shape(self): - return self.dataset_n_rows, self.dataset.shape[1] * 2 - - # TODO: allow summing over multiple lists of rows at a time (saves i/o operations) - def sum_rows(self, rows): - """ - Note: Assumes that the rows argument does not contain duplicate elements. Rows will not be considered more than once. - """ - rows = np.asarray(rows) - result_dtype = _minimum_uint_size(rows.shape[0]) - result = np.zeros(self.dataset.shape[1] * 2, dtype=result_dtype) - - # Builds a mask to turn off the bits of the rows we do not want to count in the sum. - def build_row_mask(example_idx, n_examples, mask_n_bits): - if mask_n_bits not in [8, 16, 32, 64, 128]: - raise ValueError("Unsupported mask format. Use 8, 16, 32, 64 or 128 bits.") - - n_masks = int(ceil(float(n_examples) / mask_n_bits)) - masks = [0] * n_masks - - for idx in example_idx: - example_mask = idx / mask_n_bits - example_mask_idx = mask_n_bits - (idx - mask_n_bits * example_mask) - 1 - masks[example_mask] |= 1 << example_mask_idx - - return np.array(masks, dtype="u" + str(mask_n_bits / 8)) - - # Find the rows that occur in each dataset and their relative index - rows = np.sort(rows) - dataset_relative_rows = [] - for row_idx in rows: - # Find which row in the dataset corresponds to the requested row - # TODO: This is inefficient! Could exploit the fact that rows is sorted to reuse previous iterations. - current_idx = -1 - n_active_elements_seen = 0 - while n_active_elements_seen <= row_idx: - current_idx += 1 - if not self.dataset_removed_rows_mask[current_idx]: - n_active_elements_seen += 1 - dataset_relative_rows.append(current_idx) - - # Create a row mask for each dataset - row_mask = build_row_mask(dataset_relative_rows, self.dataset_initial_n_rows, self.dataset_pack_size) - del dataset_relative_rows - - # For each dataset load the rows for which the mask is not 0. 
Support column slicing aswell - n_col_blocks = int(ceil(1.0 * self.dataset.shape[1] / self.block_size[1])) - rows_to_load = np.where(row_mask != 0)[0] - n_row_blocks = int(ceil(1.0 * len(rows_to_load) / self.block_size[0])) - - for row_block in xrange(n_row_blocks): - block_row_mask = row_mask[rows_to_load[row_block * self.block_size[0]:(row_block + 1) * self.block_size[0]]] - - for col_block in xrange(n_col_blocks): - - # Load the appropriate rows/columns based on the block sizes - block = self.dataset[rows_to_load[row_block * self.block_size[0]:(row_block + 1) * self.block_size[0]], - col_block * self.block_size[1]:(col_block + 1) * self.block_size[1]] - - # Popcount - if len(block.shape) == 1: - block = block.reshape(1, -1) - self.inplace_popcount(block, block_row_mask) - - # Increment the sum - result[col_block * self.block_size[1]:min((col_block + 1) * self.block_size[1], - self.dataset.shape[1])] += np.sum(block, axis=0) - - # Compute the sum for absence rules - result[self.dataset.shape[1]:] = len(rows) - result[: self.dataset.shape[1]] - - return result + return configString \ No newline at end of file diff --git a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py index 8259b53c..b4b8ef04 100644 --- a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py +++ b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py @@ -24,6 +24,9 @@ def getArgs(benchmark, args, views, viewsIndices, directory, resultsMonoview, cl viewsIndices, resultsMonoview, classificationIndices) monoviewClassifierModules = [getattr(MonoviewClassifiers, classifierName) for classifierName in args.FU_L_cl_names] + if args.FU_L_cl_names == [""] and args.CL_type == ["Multiview"]: + raise AttributeError("You must perform Monoview classification or specify " + "which monoview classifier to use Late Fusion") if args.FU_L_cl_config != ['']: classifiersConfigs = [ monoviewClassifierModule.getKWARGS([arg.split(":") for arg in classifierConfig.split(",")]) @@ -62,12 +65,16 @@ class SVMForLinear(LateFusionClassifier): if type(self.monoviewClassifiersConfigs[0]) == dict: for index, viewIndex in enumerate(viewsIndices): monoviewClassifier = getattr(MonoviewClassifiers, self.monoviewClassifiersNames[index]) + if type(self.monoviewClassifiersConfigs[index]) is dict: + pass + else: + self.monoviewClassifiersConfigs[index] = dict((str(configIndex), config) + for configIndex, config in enumerate(self.monoviewClassifiersConfigs[index])) self.monoviewClassifiers.append( monoviewClassifier.fit(getV(DATASET, viewIndex, trainIndices), DATASET.get("Labels").value[trainIndices], self.randomState, NB_CORES=self.nbCores, - **dict((str(configIndex), config) for configIndex, config in - enumerate(self.monoviewClassifiersConfigs[index])))) + **self.monoviewClassifiersConfigs[index])) else: self.monoviewClassifiers = self.monoviewClassifiersConfigs self.SVMForLinearFusionFit(DATASET, usedIndices=trainIndices, viewsIndices=viewsIndices) diff --git a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py index 24de1714..f46aa43a 100644 --- a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py +++ b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py 
@@ -26,6 +26,9 @@ def getArgs(benchmark, args, views, viewsIndices, directory, resultsMonoview, cl viewsIndices, resultsMonoview, classificationIndices) monoviewClassifierModules = [getattr(MonoviewClassifiers, classifierName) for classifierName in args.FU_L_cl_names] + if args.FU_L_cl_names == [""] and args.CL_type == ["Multiview"]: + raise AttributeError("You must perform Monoview classification or specify " + "which monoview classifier to use Late Fusion") if args.FU_L_cl_config != ['']: classifiersConfigs = [ monoviewClassifierModule.getKWARGS([arg.split(":") for arg in classifierConfig.split(",")]) -- GitLab
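
As an illustration (not part of the patch itself): a minimal sketch of how the sklearn-style SCM wrapper introduced in SCM.py is meant to be used, i.e. wrapped in a Pipeline and tuned with RandomizedSearchCV over model_type, p and max_rules, mirroring the new randomizedSearch(). The class name DecisionStumpSCM, the toy data and the __main__ driver are hypothetical stand-ins; the pyscm API it calls (pyscm.scm.SetCoveringMachineClassifier with model_type, p, max_rules, random_state) is the one imported by the patch, and the sketch assumes pyscm and a recent scikit-learn are installed.

    import numpy as np
    from scipy.stats import randint, uniform
    from sklearn.base import BaseEstimator, ClassifierMixin
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import Pipeline
    from pyscm.scm import SetCoveringMachineClassifier


    class DecisionStumpSCM(BaseEstimator, ClassifierMixin):
        """Thin sklearn-compatible wrapper around pyscm's decision-stump SCM."""

        def __init__(self, model_type="conjunction", p=0.1, max_rules=10, random_state=42):
            # Store constructor arguments under the same names so that
            # BaseEstimator's get_params/set_params (and sklearn's clone) work.
            self.model_type = model_type
            self.p = p
            self.max_rules = max_rules
            self.random_state = random_state

        def fit(self, X, y):
            # Delegate training to pyscm; returning self follows the sklearn
            # convention (the patch's wrapper omits the return, which Pipeline tolerates).
            self.clf_ = SetCoveringMachineClassifier(
                model_type=self.model_type, p=self.p,
                max_rules=self.max_rules, random_state=self.random_state)
            self.clf_.fit(X, y)
            return self

        def predict(self, X):
            return self.clf_.predict(X)


    if __name__ == "__main__":
        rng = np.random.RandomState(42)
        X = rng.randint(0, 2, size=(200, 30))   # toy binary attribute matrix
        y = rng.randint(0, 2, size=200)         # toy binary labels

        pipeline = Pipeline([("classifier", DecisionStumpSCM())])
        param_distributions = {
            "classifier__model_type": ["conjunction", "disjunction"],
            "classifier__p": uniform(),          # p sampled uniformly in [0, 1)
            "classifier__max_rules": randint(1, 30),
        }
        search = RandomizedSearchCV(pipeline, param_distributions, n_iter=10,
                                    cv=4, random_state=42, n_jobs=1)
        search.fit(X, y)
        print(search.best_params_)

Unlike the patch's DecisionStumpSCMNew, this sketch returns self from fit and relies on BaseEstimator's automatic get_params/set_params instead of a hand-written set_params, so the manual iteritems loop over parameters is not needed.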