Commit 9888c471 authored by Baptiste Bauvin's avatar Baptiste Bauvin
Browse files

Fresh version

parent f52062e9
__pycache__
demo/*.hdf5
demo/*.html
multiview_generator.egg-info
\ No newline at end of file
multiview_generator.egg-info
demo/tutorials/.ipy*
demo/tutorials/supplementray_material/tuto/
\ No newline at end of file
from . import multiview_generator
from . import demo
......@@ -49,6 +49,7 @@ def make_fig(conf, confusion_output, n_views, n_classes, generator):
{'type': 'scatter3d'}, ]])
row = 1
col = 1
show_legend = True
for view_index in range(n_views):
for lab_index in range(n_classes):
concerned_examples = np.where(generator.y == lab_index)[0]
......@@ -59,11 +60,14 @@ def make_fig(conf, confusion_output, n_views, n_classes, generator):
z=generator.view_data[view_index][concerned_examples, 2],
text=[generator.example_ids[ind] for ind in concerned_examples],
hoverinfo='text',
legendgroup="Class {}".format(lab_index),
mode='markers', marker=dict(
size=1, # set color to an array/list of desired values
color=DEFAULT_PLOTLY_COLORS[lab_index],
opacity=0.8
), name="Class {}".format(lab_index)), row=row, col=col)
), name="Class {}".format(lab_index), showlegend=show_legend),
row=row, col=col)
show_legend = False
# fig.update_layout(
# scene=dict(
# xaxis=dict(nticks=4, range=[low_range, high_range], ),
......
n_views: 4
n_classes: 3
confusion_matrix:
error_matrix:
- [0.4, 0.4, 0.4, 0.4]
- [0.55, 0.4, 0.4, 0.4]
- [0.4, 0.5, 0.52, 0.55]
# - [0.4, 0.5, 0.5, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
n_samples: 2000
n_features: 3
n_informative: 3
class_seps: 10
class_weights: [0.125, 0.125, 0.125,]# 0.125, 0.125, 0.125, 0.125, 0.125,]
mutual_error: 0.2
redundancy: 0.1
complementarity: 0.35
name: "doc_summit"
mutual_error: [0.4, 0.4,0.4]
redundancy: [0.5,0.4,0.4]
complementarity: [0.1, 0.05,0.05]
name: "demo"
sub_problem_type: ["base", "base", "base", "gaussian"]
......@@ -4,10 +4,11 @@ from classify_generated import gen_folds, make_fig, test_dataset
n_views = 4
n_classes = 3
gene = MultiViewSubProblemsGenerator(config_file="config_generator.yml")
gene = MultiViewSubProblemsGenerator(config_file="config_demo.yml")
conf = np.ones((n_classes, n_views))*0.4
gene.generate_multi_view_dataset()
gene.to_hdf5_mc()
print(gene.gen_report())
folds = gen_folds(random_state=42, generator=gene)
output_confusion = test_dataset(folds, n_views, n_classes, gene)
......
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true,
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
"text": [
"[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199], [200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299]]\n"
],
"output_type": "stream"
}
],
"source": [
"import numpy as np\n",
"from multiview_generator.multiple_sub_problems import MultiViewSubProblemsGenerator\n",
"\n",
"n_views=3\n",
"n_classes=3\n",
"complementarity = np.array([0.3 for _ in range(n_classes)]).reshape((n_classes, 1))\n",
"complementarity_level = np.array([0.5 for _ in range(n_classes)]).reshape((n_classes, 1))\n",
"n_examples_per_class = np.array([100 for _ in range(n_classes)])\n",
"available_init_indices = [[i+(100*class_ind) for i in range(100)] for class_ind in range(n_classes)]\n",
"error_matrix = np.zeros((12,12))\n",
"complementarity_examples = [_ for _ in range(n_classes)]\n",
"good_views_indices = [_ for _ in range(n_classes)]\n",
"bad_views_indices = [_ for _ in range(n_classes)]\n",
"rs = np.random.RandomState(42)\n",
"example_ids = np.zeros(sum(n_examples_per_class), dtype=\"S100\")\n",
"\n",
"print(available_init_indices)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"text": [
"[[array([0, 2]), array([0, 2]), array([0, 2]), array([0, 1]), array([1, 2]), array([0, 1]), array([0, 2]), array([0, 1]), array([0, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 2]), array([0, 2])], [array([0, 2]), array([1, 2]), array([0, 2]), array([0, 2]), array([0, 1]), array([1, 2]), array([0, 1]), array([1, 2]), array([1, 2]), array([0, 1]), array([0, 1]), array([0, 1]), array([1, 2]), array([0, 2]), array([0, 2]), array([0, 2]), array([0, 2]), array([1, 2]), array([1, 2]), array([0, 1]), array([0, 2]), array([1, 2]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 1]), array([1, 2]), array([1, 2]), array([1, 2]), array([0, 2])], [array([1, 2]), array([0, 2]), array([0, 1]), array([0, 1]), array([1, 2]), array([0, 1]), array([1, 2]), array([0, 1]), array([1, 2]), array([1, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 1]), array([1, 2]), array([0, 2]), array([0, 2]), array([0, 1]), array([0, 1]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([1, 2])]]\n"
],
"output_type": "stream"
}
],
"source": [
"def _remove_available(available_indices, to_remove, class_index):\n",
" \"\"\"\n",
" Removes indices from the available ones array\n",
" \"\"\"\n",
" available_indices[class_index] = [ind\n",
" for ind\n",
" in available_indices[class_index]\n",
" if ind not in to_remove]\n",
" return available_indices\n",
"\n",
"def _update_example_indices(target, target_name, class_ind):\n",
" for ind, target_ind in enumerate(target):\n",
" example_ids[target_ind] = target_name + \"_{}_{}\".format(ind, class_ind)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "[1, 1, 1]"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 10
}
],
"source": [
"n_bad = [int(complementarity_level[class_index]*n_views)\n",
" for class_index in range(n_classes)]\n",
"n_bad"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "markdown",
"source": [
"Complementarity is defined per class, which means that, for example, the samples of class i can be highly complementary.\n",
"To check whether the setting is compatible with the error matrix, \n",
"we now check that there are enough available indices that are not already used for redundancy or mutual error.\n",
" "
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "array([False, False, False])"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 12
}
],
"source": [
"((complementarity * n_examples_per_class)[0] > np.array(\n",
" [len(inds) for inds in available_init_indices]))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"for class_index, complementarity in enumerate(complementarity):\n",
" n_comp = int(complementarity_level[class_index]*n_views)\n",
" complementarity_examples[class_index] = rs.choice(\n",
" available_init_indices[class_index],\n",
" size=int(n_examples_per_class[\n",
" class_index] * complementarity),\n",
" replace=False)\n",
" _update_example_indices(\n",
" complementarity_examples[class_index],\n",
" 'Complementary', class_index)\n",
" good_views_indices[class_index] = [\n",
" rs.choice(np.arange(n_views),\n",
" size=n_bad,\n",
" replace=False)\n",
" for _ in complementarity_examples[class_index]]\n",
" bad_views_indices[class_index] = [np.array([ind\n",
" for ind\n",
" in range(\n",
" n_views)\n",
" if ind not in\n",
" good_views_indices[\n",
" class_index][\n",
" ex_ind]])\n",
" for ex_ind, _ in\n",
" enumerate(\n",
" complementarity_examples[\n",
" class_index])]\n",
" _remove_available(available_init_indices,\n",
" complementarity_examples[\n",
" class_index],\n",
" class_index)\n",
"print(bad_views_indices) \n",
"\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
\ No newline at end of file
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Multiview Dataset Generator Demo\n",
"\n",
"Once you have [installed](link_ton_install) SMuDGE, you are able to run it with this notebook."
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%% \n"
}
},
"outputs": [],
"source": [
"from multiview_generator.multiple_sub_problems import MultiViewSubProblemsGenerator\n",
"from tabulate import tabulate\n",
"import numpy as np\n",
"\n",
"random_state = np.random.RandomState(42)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Basic configuration\n",
"\n",
"Let us suppose that you want to build a multiview dataset with 4 views and three classes : "
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%% \n"
}
},
"outputs": [],
"source": [
"name = \"demo\"\n",
"n_views = 4\n",
"n_classes = 3"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"In order to configure the dataset, you have to provide the error matrix that gives the expected error of the Bayes classifier for Class i on View j as the value in row i column j : "
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"text": [
"+------+-----+------+------+\n",
"| 0.4 | 0.4 | 0.4 | 0.4 |\n",
"+------+-----+------+------+\n",
"| 0.55 | 0.4 | 0.4 | 0.4 |\n",
"+------+-----+------+------+\n",
"| 0.4 | 0.5 | 0.52 | 0.55 |\n",
"+------+-----+------+------+\n"
],
"output_type": "stream"
}
],
"source": [
"error_matrix = [\n",
" [0.4, 0.4, 0.4, 0.4],\n",
" [0.55, 0.4, 0.4, 0.4],\n",
" [0.4, 0.5, 0.52, 0.55]\n",
"]\n",
"print(tabulate(error_matrix, tablefmt=\"grid\"))"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"Once this has been defined, you can set all the other parameters of the dataset : \n",
"* the number of samples, \n",
"* the number of features of each view,\n",
"* the proportion of samples in each class."
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"n_samples = 2000\n",
"n_features = 3\n",
"class_weights = [0.333, 0.333, 0.333,]"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Generate the dataset\n",
"\n",
"With the basic configuration done, we can generate the dataset :"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%% \n"
}
},
"outputs": [
{
"name": "stdout",
"text": [
"View 1 of shape (1998, 3)\n",
"View 2 of shape (1998, 3)\n",
"View 3 of shape (1998, 3)\n",
"View 4 of shape (1998, 3)\n"
],
"output_type": "stream"
}
],
"source": [
"generator = MultiViewSubProblemsGenerator(name=name, n_views=n_views, \n",
" n_classes=n_classes, \n",
" n_samples=n_samples, \n",
" n_features=n_features, \n",
" class_weights=class_weights, \n",
" error_matrix=error_matrix, \n",
" random_state=random_state) \n",
"\n",
"view_data, y = generator.generate_multi_view_dataset()\n",
"\n",
"for view_index, view_datum in enumerate(view_data):\n",
" print(\"View {} of shape {}\".format(view_index+1, view_datum.shape))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%% md\n"
}
},
"source": [
"Here, we see that each view has 1998 samples instead of the 2000 requested: the three class weights of 0.333 each yield 666 samples per class. \n",
"\n",
"## Get a description of it\n",
"\n",
"Now, if you wish to get information about the generated dataset, run : "
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"description = generator.gen_report(save=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"This will generate a markdown report on the dataset. Here, we used `save=False` so the description is not saved in a file. \n",
"\n",
"To print it in this notebook, we use : "
]
},
{
"cell_type": "code",
"execution_count": 103,
"outputs": [
{
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "# Generated dataset description\n\nThe dataset named `demo` has been generated by [SMuDGE](https://gitlab.lis-lab.fr/dev/multiview_generator) and is comprised of \n\n* 1998 examples, splitted in \n* 3 classes, described by \n* 4 views.\n\nThe input error matrix is \n \n| | View 1 | View 2 | View 3 | View 4 |\n|---------|----------|----------|----------|----------|\n| Class 1 | 0.4 | 0.4 | 0.4 | 0.4 |\n| Class 2 | 0.55 | 0.4 | 0.4 | 0.4 |\n| Class 3 | 0.4 | 0.5 | 0.52 | 0.55 |\n\n The classes are balanced as : \n\n* Class 1 : 666 examples (33% of the dataset)\n* Class 2 : 666 examples (33% of the dataset)\n* Class 3 : 666 examples (33% of the dataset)\n\n The views have \n\n* 0.0% redundancy, \n* 0.0% mutual error and \n* 0.0% complementarity,\n\nthe remaining examples are randomly mis-labelled to fit the input error matrix.\n\n## Views description\n\n### View 1\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\n### View 2\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\n### View 3\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 
0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\n### View 4\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\nThis report has been automatically generated on April 20, 2020 at 13:59:27"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display,Markdown\n",
"display(Markdown(description))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"But if you just want to save it, you can use : "
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%% \n"
}
},
"outputs": [],
"source": [
"%%capture\n",
"\n",
"generator.gen_report(output_path=\".\", save=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"This will save the description in the current directory, in a file called `demo.md` as the name of the dataset is \"demo\".\n",
"\n",
"## Save it in an HDF5 file \n",
"\n",
"Moreover, it is possible to save the dataset in an HDF5 file, compatible with [SuMMIT](https://gitlab.lis-lab.fr/baptiste.bauvin/summit/) with \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"generator.to_hdf5_mc(saving_path='.')"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Visualizing the dataset with [plotly](https://plotly.com/)\n",
"\n",
"Here, we purposely used only 3 features per view, so the generated dataset is easily plottable in 3D. \n",
"\n",
"Let us plot each view : "
]
},
{
"cell_type": "code",