Commit 438d04bb authored by bbauvin

Added SCM before i had

parent 1cfc1c47
@@ -129,6 +129,10 @@ groupSGD = parser.add_argument_group('KNN arguments')
groupSGD.add_argument('--CL_KNN_neigh', metavar='STRING', action='store',
help='GridSearch: Determine number of neighbors for KNN', default='1:5:10:15')
groupSCM = parser.add_argument_group('SCM arguments')
groupSCM.add_argument('--CL_SCM_max_rules', metavar='STRING', action='store',
help='GridSearch: Determine the maximum number of rules for SCM', default='1')
groupMumbo = parser.add_argument_group('Mumbo arguments')
groupMumbo.add_argument('--MU_types', metavar='STRING', action='store',
help='Determine which monoview classifier to use with Mumbo',default='DecisionTree')
@@ -142,13 +146,13 @@ groupMumbo.add_argument('--MU_iter', metavar='INT', action='store', nargs=3,
groupFusion = parser.add_argument_group('Fusion arguments')
groupFusion.add_argument('--FU_types', metavar='STRING', action='store',
help='Determine which type of fusion to use, if multiple separate with :',
default='LateFusion')
default='LateFusion:EarlyFusion')
groupFusion.add_argument('--FU_early_methods', metavar='STRING', action='store',
help='Determine which early fusion method to use, if multiple separate with :',
default='WeightedLinear')
default='')
groupFusion.add_argument('--FU_late_methods', metavar='STRING', action='store',
help='Determine which late fusion method to use, if multiple separate with :',
default='WeightedLinear')
default='')
groupFusion.add_argument('--FU_method_config', metavar='STRING', action='store', nargs='+',
help='Configuration for the fusion method', default=['1:1:1:1'])
groupFusion.add_argument('--FU_cl_names', metavar='STRING', action='store',
@@ -253,6 +257,7 @@ if "Multiview" in args.CL_type.strip(":"):
benchmark["Multiview"]["Fusion"]= {}
benchmark["Multiview"]["Fusion"]["Methods"] = dict((fusionType, []) for fusionType in args.FU_types.split(":"))
if "LateFusion" in args.FU_types.split(":"):
benchmark["Multiview"]["Fusion"]["Methods"]["LateFusion"] = args.FU_late_methods.split(":")
if "EarlyFusion" in args.FU_types.split(":"):
benchmark["Multiview"]["Fusion"]["Methods"]["EarlyFusion"] = args.FU_early_methods.split(":")
@@ -277,6 +282,7 @@ SGDKWARGSInit = {"2": map(float, args.CL_SGD_alpha.split(":"))[0], "1": args.CL_
"0":args.CL_SGD_loss.split(":")[0]}
KNNKWARGSInit = {"0": map(float, args.CL_KNN_neigh.split(":"))[0]}
AdaboostKWARGSInit = {"0": args.CL_Ada_n_est.split(":")[0], "1": args.CL_Ada_b_est.split(":")[0]}
SCMKWARGSInit = {"0":args.CL_SCM_max_rules.split(":")[0]}
dataBaseTime = time.time()-start
argumentDictionaries = {"Monoview": {}, "Multiview": []}
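# Hypothetical illustration of how the new colon-separated SCM option is parsed;
# only the first value of --CL_SCM_max_rules ends up in SCMKWARGSInit:
#   "1:5:10".split(":")[0]  ->  "1",  so SCMKWARGSInit == {"0": "1"}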
......
@@ -45,6 +45,7 @@ def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databa
def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, gridSearch=True,
metrics=[["accuracy_score", None]], nIter=30, **args):
logging.debug("Start:\t Loading data")
try:
kwargs = args["args"]
except:
@@ -60,10 +61,9 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path,
X = getValue(X)
datasetLength = X.shape[0]
clKWARGS = kwargs[kwargs["CL_type"]+"KWARGS"]
logging.debug("Done:\t Loading data")
# Determine the Database to extract features
logging.debug("### Main Programm for Classification MonoView")
logging.debug("### Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:" + str(learningRate) + ", CrossValidation k-folds:" + str(nbFolds) + ", cores:" + str(nbCores)+", algorithm : "+CL_type)
logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:" + str(learningRate) + ", CrossValidation k-folds:" + str(nbFolds) + ", cores:" + str(nbCores)+", algorithm : "+CL_type)
# Calculate Train/Test data
......
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import Metrics
from pyscm.utils import _pack_binary_bytes_to_ints
import pyscm
from scipy.stats import randint
import numpy as np  # used in transformData
from utils.Dataset import getShape
import h5py
from pyscm.binary_attributes.base import BaseBinaryAttributeList
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs):
try:
dataset = kwargs["dataset"]
attributeClassification = kwargs["attributeClassification"]
binaryAttributes = kwargs["binaryAttributes"]
except:
attributeClassification, binaryAttributes = transformData(DATASET)
# featureSequence = ["" for featureIndex in range(getShape(DATASET)[1])]
# featureIndexByRule = np.arange(getShape(DATASET)[1], dtype=np.uint32)
# binaryAttributes = LazyBaptisteRuleList(featureSequence, featureIndexByRule)
# attributeClassification = BaptisteRuleClassifications(dataset, n_rows)
classifier = pyscm.scm.SetCoveringMachine(p=1.0, max_attributes=10, verbose=False)
classifier.fit(binaryAttributes, CLASS_LABELS, X=None, attribute_classifications=attributeClassification, iteration_callback=None)
# classifier.fit(DATASET, CLASS_LABELS)  # leftover call: superseded by the attribute-based fit just above
return classifier
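# Usage sketch for fit() (hypothetical helper, not called anywhere; assumes pyscm is installed
# and the input is a binary numpy array):
def _example_fit_usage():
    X = np.random.randint(0, 2, (10, 8)).astype(np.uint8)
    y = np.random.randint(0, 2, 10)
    attributeClassification, binaryAttributes = transformData(X)
    return fit(X, y, dataset=X, attributeClassification=attributeClassification,
               binaryAttributes=binaryAttributes)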
def gridSearch(X_train, y_train, nbFolds=4, metric=["accuracy_score", None], nIter=30, nbCores=1):
pipeline = Pipeline([('classifier', pyscm.scm.SetCoveringMachine())])
param= {"classifier__max_attributes": randint(1, 15),
"classifier__c":[1.0] ,
"classifier__p":[1.0] }
metricModule = getattr(Metrics, metric[0])
if metric[1]!=None:
metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
else:
metricKWARGS = {}
scorer = metricModule.get_scorer(**metricKWARGS)
grid = RandomizedSearchCV(pipeline, n_iter=nIter, param_distributions=param,refit=True,n_jobs=nbCores,scoring=scorer,cv=nbFolds)
detector = grid.fit(X_train, y_train)
desc_estimators = [detector.best_params_["classifier__max_attributes"],
detector.best_params_["classifier__c"], detector.best_params_["classifier__p"]]
return desc_estimators
def getConfig(config):
try :
return "\n\t\t- SCM with max_attributes : "+str(config[0])+", c : "+str(config[1])+", p : "+str(config[2])
except:
return "\n\t\t- SCM with max_attributes : "+str(config["0"])+", c : "+str(config["1"])+", p : "+str(config["2"])
def transformData(dataArray):  # Takes a numpy array of binary data as input
if isBinary(dataArray):
nbExamples = dataArray.shape[0]  # Number of examples (rows)
featureSequence = ["" for featureIndex in range(dataArray.shape[1])]  # Fake feature sequence (k-mer names are not needed here)
featureIndexByRule = np.arange(dataArray.shape[1], dtype=np.uint32)  # Feature indices for the non-virtual part of the data
binaryAttributes = LazyBaptisteRuleList(featureSequence, featureIndexByRule)  # Build the binary attributes
packedData = _pack_binary_bytes_to_ints(dataArray, 64)  # Pack the dataset
dsetFile = h5py.File("temp_scm", "w")
packedDataset = dsetFile.create_dataset("temp_scm", data=packedData)  # Store it as HDF5
attributeClassification = BaptisteRuleClassifications(packedDataset, nbExamples)  # and pre-compute the rule classifications
return attributeClassification, binaryAttributes
# fit is then called as: classifier.fit(binaryAttributes, labels, X=None, attribute_classifications=attributeClassification, iteration_callback=None)
def isBinary(dataset):
if type(dataset[0,0]) is bool:
return True
for line in dataset:
for data in line:
if data==0 or data==1:
pass
else:
return False
return True
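# A vectorized equivalent of isBinary (a sketch, assuming dataset is a numpy array);
# it avoids the per-element Python loop above:
def isBinaryVectorized(dataset):
    if dataset.dtype == bool:
        return True
    return bool(np.in1d(dataset.ravel(), [0, 1]).all())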
#!/usr/bin/env python
"""
Kover: Learn interpretable computational phenotyping models from k-merized genomic data
Copyright (C) 2015 Alexandre Drouin
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import numpy as np
from math import ceil
from pyscm.binary_attributes.classifications.popcount import inplace_popcount_32, inplace_popcount_64
from pyscm.utils import _unpack_binary_bytes_from_ints
def _minimum_uint_size(max_value):
"""
Find the minimum size unsigned integer type that can store values of at most max_value
From A.Drouin's Kover
"""
if max_value <= np.iinfo(np.uint8).max:
return np.uint8
elif max_value <= np.iinfo(np.uint16).max:
return np.uint16
elif max_value <= np.iinfo(np.uint32).max:
return np.uint32
elif max_value <= np.iinfo(np.uint64).max:
return np.uint64
else:
return np.uint128  # note: numpy has no uint128 dtype, so this branch would fail if ever reached
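# Quick sanity check of the helper above (hypothetical snippet, not part of the original module):
def _example_minimum_uint_size():
    assert _minimum_uint_size(255) == np.uint8      # fits in 8 bits
    assert _minimum_uint_size(70000) == np.uint32   # too large for uint16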
class BaptisteRule(object):
def __init__(self, feature_index, kmer_sequence, type):
"""
A k-mer rule
Parameters:
-----------
feature_index: uint
The index of the k-mer
kmer_sequence: string
The nucleotide sequence of the k-mer
type: string
The type of rule: presence or absence (use p or a)
"""
self.feature_index = feature_index
self.kmer_sequence = kmer_sequence
self.type = type
def classify(self, X):
if self.type == "absence":
return (X[:, self.feature_index] == 0).astype(np.uint8)
else:
return (X[:, self.feature_index] == 1).astype(np.uint8)
def inverse(self):
return BaptisteRule(feature_index=self.feature_index, kmer_sequence=self.kmer_sequence, type="absence" if self.type == "presence" else "presence")
def __str__(self):
return ("Absence(" if self.type == "absence" else "Presence(") + self.kmer_sequence + ")"
class LazyBaptisteRuleList(object):
"""
By convention, the first half of the list contains presence rules and the second half contains the absence rules in
the same order.
"""
def __init__(self, kmer_sequences, feature_index_by_rule):
self.n_rules = feature_index_by_rule.shape[0] * 2
self.kmer_sequences = kmer_sequences
self.feature_index_by_rule = feature_index_by_rule
super(LazyBaptisteRuleList, self).__init__()
def __getitem__(self, idx):
if idx >= self.n_rules:
raise ValueError("Index %d is out of range for list of size %d" % (idx, self.n_rules))
if idx >= len(self.kmer_sequences):
type = "absence"
feature_idx = self.feature_index_by_rule[idx % len(self.kmer_sequences)]
else:
type = "presence"
feature_idx = self.feature_index_by_rule[idx]
return BaptisteRule(idx % len(self.kmer_sequences), self.kmer_sequences[feature_idx], type)
def __len__(self):
return self.n_rules
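# Sketch of the index convention (hypothetical): with 3 k-mers, indices 0-2 are presence
# rules and indices 3-5 are the matching absence rules.
def _example_lazy_rule_list():
    rules = LazyBaptisteRuleList(["AAA", "CCC", "GGG"], np.arange(3, dtype=np.uint32))
    assert len(rules) == 6
    assert str(rules[1]) == "Presence(CCC)"
    assert str(rules[4]) == "Absence(CCC)"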
class BaseRuleClassifications(object):
def __init__(self):
pass
def get_columns(self, columns):
raise NotImplementedError()
def remove_rows(self, rows):
raise NotImplementedError()
@property
def shape(self):
raise NotImplementedError()
def sum_rows(self, rows):
raise NotImplementedError()
class BaptisteRuleClassifications(BaseRuleClassifications):
"""
Methods involving columns account for presence and absence rules
"""
# TODO: Clean up. Get rid of the code to handle deleted rows. We don't need this.
def __init__(self, dataset, n_rows, block_size=None):
self.dataset = dataset
self.dataset_initial_n_rows = n_rows
self.dataset_n_rows = n_rows
self.dataset_removed_rows = []
self.dataset_removed_rows_mask = np.zeros(self.dataset_initial_n_rows, dtype=np.bool)
self.block_size = (None, None)
if block_size is None:
if self.dataset.chunks is None:
self.block_size = (1, self.dataset.shape[1])
else:
self.block_size = self.dataset.chunks
else:
if len(block_size) != 2 or not isinstance(block_size[0], int) or not isinstance(block_size[1], int):
raise ValueError("The block size must be a tuple of 2 integers.")
self.block_size = block_size
# Get the size of the ints used to store the data
if self.dataset.dtype == np.uint32:
self.dataset_pack_size = 32
self.inplace_popcount = inplace_popcount_32
elif self.dataset.dtype == np.uint64:
self.dataset_pack_size = 64
self.inplace_popcount = inplace_popcount_64
else:
raise ValueError("Unsupported data type for packed attribute classifications array. The supported data" +
" types are np.uint32 and np.uint64.")
super(BaseRuleClassifications, self).__init__()
def get_columns(self, columns):
"""
Columns can be an integer (or any object that implements __index__) or a sorted list/ndarray.
"""
#TODO: Support slicing, make this more efficient than getting the columns individually.
columns_is_int = False
if hasattr(columns, "__index__"): # All int types implement the __index__ method (PEP 357)
columns = [columns.__index__()]
columns_is_int = True
elif isinstance(columns, np.ndarray):
columns = columns.tolist()
elif isinstance(columns, list):
pass
else:
columns = list(columns)
# Detect where an inversion is needed (columns corresponding to absence rules)
columns, invert_result = zip(* (((column if column < self.dataset.shape[1] else column % self.dataset.shape[1]),
(True if column >= self.dataset.shape[1] else False)) for column in columns))  # absence-rule columns start at index shape[1]
columns = list(columns)
invert_result = np.array(invert_result)
# Don't return rows that have been deleted
row_mask = np.ones(self.dataset.shape[0] * self.dataset_pack_size, dtype=np.bool)
row_mask[self.dataset_initial_n_rows:] = False
row_mask[self.dataset_removed_rows] = False
# h5py requires that the column indices are sorted
unique, inverse = np.unique(columns, return_inverse=True)
result = _unpack_binary_bytes_from_ints(self.dataset[:, unique.tolist()])[row_mask]
result = result[:, inverse]
result[:, invert_result] = 1 - result[:, invert_result]
if columns_is_int:
return result.reshape(-1)
else:
return result
@property
def shape(self):
return self.dataset_n_rows, self.dataset.shape[1] * 2
# TODO: allow summing over multiple lists of rows at a time (saves i/o operations)
def sum_rows(self, rows):
"""
Note: Assumes that the rows argument does not contain duplicate elements. Rows will not be considered more than once.
"""
rows = np.asarray(rows)
result_dtype = _minimum_uint_size(rows.shape[0])
result = np.zeros(self.dataset.shape[1] * 2, dtype=result_dtype)
# Builds a mask to turn off the bits of the rows we do not want to count in the sum.
def build_row_mask(example_idx, n_examples, mask_n_bits):
if mask_n_bits not in [8, 16, 32, 64, 128]:
raise ValueError("Unsupported mask format. Use 8, 16, 32, 64 or 128 bits.")
n_masks = int(ceil(float(n_examples) / mask_n_bits))
masks = [0] * n_masks
for idx in example_idx:
example_mask = idx / mask_n_bits
example_mask_idx = mask_n_bits - (idx - mask_n_bits * example_mask) - 1
masks[example_mask] |= 1 << example_mask_idx
return np.array(masks, dtype="u" + str(mask_n_bits / 8))
# Find the rows that occur in each dataset and their relative index
rows = np.sort(rows)
dataset_relative_rows = []
for row_idx in rows:
# Find which row in the dataset corresponds to the requested row
# TODO: This is inefficient! Could exploit the fact that rows is sorted to reuse previous iterations.
current_idx = -1
n_active_elements_seen = 0
while n_active_elements_seen <= row_idx:
current_idx += 1
if not self.dataset_removed_rows_mask[current_idx]:
n_active_elements_seen += 1
dataset_relative_rows.append(current_idx)
# Create a row mask for each dataset
row_mask = build_row_mask(dataset_relative_rows, self.dataset_initial_n_rows, self.dataset_pack_size)
del dataset_relative_rows
# For each dataset load the rows for which the mask is not 0. Support column slicing as well
n_col_blocks = int(ceil(1.0 * self.dataset.shape[1] / self.block_size[1]))
rows_to_load = np.where(row_mask != 0)[0]
n_row_blocks = int(ceil(1.0 * len(rows_to_load) / self.block_size[0]))
for row_block in xrange(n_row_blocks):
block_row_mask = row_mask[rows_to_load[row_block * self.block_size[0]:(row_block + 1) * self.block_size[0]]]
for col_block in xrange(n_col_blocks):
# Load the appropriate rows/columns based on the block sizes
block = self.dataset[rows_to_load[row_block * self.block_size[0]:(row_block + 1) * self.block_size[0]],
col_block * self.block_size[1]:(col_block + 1) * self.block_size[1]]
# Popcount
if len(block.shape) == 1:
block = block.reshape(1, -1)
self.inplace_popcount(block, block_row_mask)
# Increment the sum
result[col_block * self.block_size[1]:min((col_block + 1) * self.block_size[1], self.dataset.shape[1])] += np.sum(block, axis=0)
# Compute the sum for absence rules
result[self.dataset.shape[1] : ] = len(rows) - result[: self.dataset.shape[1]]
return result
\ No newline at end of file
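# End-to-end sketch (hypothetical; assumes the matrix was packed examples-wise, as in SCM.transformData):
def _example_rule_classifications(packed_dataset, n_examples, n_features):
    ac = BaptisteRuleClassifications(packed_dataset, n_examples)
    assert ac.shape == (n_examples, 2 * n_features)  # presence columns first, then absence columns
    presence_col0 = ac.get_columns(0)                # presence of feature 0 for every example
    absence_col0 = ac.get_columns(n_features)        # same column, inverted (absence rule)
    return presence_col0, absence_col0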
@@ -29,8 +29,14 @@ def getFakeDBhdf5(features, pathF, name , NB_CLASS, LABELS_NAME):
datasetFile = h5py.File(pathF+"Fake.hdf5", "w")
for index, viewData in enumerate(DATA.values()):
if index==0:
viewData = np.random.randint(0, 2, (DATASET_LENGTH, 300)).astype(bool)  # random binary view (randint's upper bound is exclusive); was np.zeros(viewData.shape, dtype=bool)+np.ones((viewData.shape[0], viewData.shape[1]/2), dtype=bool)
viewDset = datasetFile.create_dataset("View"+str(index), viewData.shape)
viewDset[...] = viewData
viewDset.attrs["name"] = "View"+str(index)
viewDset.attrs["sparse"] = False
elif index == 1:
viewData = sparse.csr_matrix(viewData)
viewGrp = datasetFile.create_group("View0")
viewGrp = datasetFile.create_group("View"+str(index))
dataDset = viewGrp.create_dataset("data", viewData.data.shape, data=viewData.data)
indicesDset = viewGrp.create_dataset("indices", viewData.indices.shape, data=viewData.indices)
indptrDset = viewGrp.create_dataset("indptr", viewData.indptr.shape, data=viewData.indptr)
@@ -359,7 +365,7 @@ def findParams(arrayLen, nbPatients, maxNbBins=5000, maxLenBin=300, minOverlappi
for lenBin in range(arrayLen-1):
if lenBin+1<maxLenBin:
for overlapping in sorted(range(lenBin+1-1), reverse=True):
if overlapping+1>minOverlapping and overlapping>lenBin/minNbBinsOverlapped:
if overlapping+1>minOverlapping and math.ceil(float(lenBin)/(lenBin-overlapping))>=minNbBinsOverlapped:
for nbBins in sorted(range(arrayLen-1), reverse=True):
if nbBins+1<maxNbBins:
if arrayLen == (nbBins+1-1)*(lenBin+1-overlapping+1)+lenBin+1:
@@ -376,12 +382,14 @@ def findBins(nbBins, overlapping, lenBin):
return bins
def getBins(array, bins):
def getBins(array, bins, lenBin, overlapping):
binnedcoord = []
for coordIndex, coord in enumerate(array):
nbBinsFull = 0
for binIndex, bin in enumerate(bins):
if coordIndex in bin:
binnedcoord.append(binIndex+(coord*len(bins)))
return np.array(binnedcoord)
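# Hypothetical illustration of getBins: bin 0 covers coordinates {0, 1}, bin 1 covers {1, 2};
# each hit is encoded as binIndex + coord * len(bins).
def _example_getBins():
    bins = [[0, 1], [1, 2]]
    coords = np.array([3, 0, 2])
    assert list(getBins(coords, bins, lenBin=2, overlapping=1)) == [6, 0, 1, 5]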
@@ -392,14 +400,12 @@ def makeSparseTotalMatrix(sortedRNASeq):
overlapping = params["overlapping"]
lenBin = params["lenBin"]
bins = findBins(nbBins, overlapping, lenBin)
nbBins = nbBins+1
sparseFull = sparse.csc_matrix((nbPatients, nbGenes*nbBins))
for patientIndex, patient in enumerate(sortedRNASeq):
binnedcoord = getBins(patient, bins)
columIndices = binnedcoord
rowIndices = np.zeros(len(binnedcoord), dtype=int)+patientIndex
data = np.ones(len(binnedcoord), dtype=bool)
sparseFull = sparseFull+sparse.csc_matrix((data, (rowIndices, columIndices)), shape=(nbPatients, nbGenes*nbBins))
columnIndices = getBins(patient, bins, lenBin, overlapping)
rowIndices = np.zeros(len(columnIndices), dtype=int)+patientIndex
data = np.ones(len(columnIndices), dtype=bool)
sparseFull = sparseFull+sparse.csc_matrix((data, (rowIndices, columnIndices)), shape=(nbPatients, nbGenes*nbBins))
return sparseFull
......
@@ -271,10 +271,10 @@ class Mumbo:
else:
return 0
def allViewsClassifyWell(self, predictions, pastIterIndice, NB_VIEW, CLASS_LABEL, exampleIndice):
def allViewsClassifyBadly(self, predictions, pastIterIndice, NB_VIEW, CLASS_LABEL, exampleIndice):
boolean = True
for viewIndice in range(NB_VIEW):
if predictions[pastIterIndice, viewIndice, exampleIndice] != CLASS_LABEL:
if predictions[pastIterIndice, viewIndice, exampleIndice] == CLASS_LABEL:
boolean = False
return boolean
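# Sketch of the renamed check (hypothetical values): with predictions of shape
# (iterations, views, examples) and CLASS_LABEL == 0,
#   predictions[0, :, 0] == [1, 1]  -> True  (no view predicted the true label)
#   predictions[0, :, 1] == [0, 1]  -> False (view 0 predicted the true label)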
@@ -286,7 +286,7 @@ class Mumbo:
if self.predictions[pastIterIndice, viewIndice, exampleIndice] \
== \
CLASS_LABELS[exampleIndice] \
or self.allViewsClassifyWell(self.predictions, pastIterIndice,
or self.allViewsClassifyBadly(self.predictions, pastIterIndice,
NB_VIEW, CLASS_LABELS[exampleIndice],
exampleIndice):
......