Commit 2bef9f09 authored by Baptiste Bauvin's avatar Baptiste Bauvin
Browse files

Doc and tuto reorganized

parent 9888c471
......@@ -3,4 +3,7 @@ demo/*.hdf5
demo/*.html
multiview_generator.egg-info
demo/tutorials/.ipy*
demo/tutorials/supplementray_material/tuto/
\ No newline at end of file
demo/tutorials/supplementary_material/demo.hdf5
demo/tutorials/supplementary_material/tuto.hdf5
demo/tutorials/supplementary_material/report.md
demo/tutorials/supplementary_material/tuto/
\ No newline at end of file
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true,
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
"text": [
"[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199], [200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299]]\n"
],
"output_type": "stream"
}
],
"source": [
"import numpy as np\n",
"from multiview_generator.multiple_sub_problems import MultiViewSubProblemsGenerator\n",
"\n",
"n_views=3\n",
"n_classes=3\n",
"complementarity = np.array([0.3 for _ in range(n_classes)]).reshape((n_classes, 1))\n",
"complementarity_level = np.array([0.5 for _ in range(n_classes)]).reshape((n_classes, 1))\n",
"n_examples_per_class = np.array([100 for _ in range(n_classes)])\n",
"available_init_indices = [[i+(100*class_ind) for i in range(100)] for class_ind in range(n_classes)]\n",
"error_matrix = np.zeros((12,12))\n",
"complementarity_examples = [_ for _ in range(n_classes)]\n",
"good_views_indices = [_ for _ in range(n_classes)]\n",
"bad_views_indices = [_ for _ in range(n_classes)]\n",
"rs = np.random.RandomState(42)\n",
"example_ids = np.zeros(sum(n_examples_per_class), dtype=\"S100\")\n",
"\n",
"print(available_init_indices)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"text": [
"[[array([0, 2]), array([0, 2]), array([0, 2]), array([0, 1]), array([1, 2]), array([0, 1]), array([0, 2]), array([0, 1]), array([0, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 2]), array([0, 2])], [array([0, 2]), array([1, 2]), array([0, 2]), array([0, 2]), array([0, 1]), array([1, 2]), array([0, 1]), array([1, 2]), array([1, 2]), array([0, 1]), array([0, 1]), array([0, 1]), array([1, 2]), array([0, 2]), array([0, 2]), array([0, 2]), array([0, 2]), array([1, 2]), array([1, 2]), array([0, 1]), array([0, 2]), array([1, 2]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 1]), array([1, 2]), array([1, 2]), array([1, 2]), array([0, 2])], [array([1, 2]), array([0, 2]), array([0, 1]), array([0, 1]), array([1, 2]), array([0, 1]), array([1, 2]), array([0, 1]), array([1, 2]), array([1, 2]), array([1, 2]), array([0, 2]), array([0, 1]), array([0, 2]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 1]), array([1, 2]), array([0, 2]), array([0, 2]), array([0, 1]), array([0, 1]), array([0, 1]), array([0, 1]), array([0, 2]), array([0, 2]), array([1, 2]), array([0, 2]), array([1, 2])]]\n"
],
"output_type": "stream"
}
],
"source": [
"def _remove_available(available_indices, to_remove, class_index):\n",
" \"\"\"\n",
" Removes indices from the available ones array\n",
" \"\"\"\n",
" available_indices[class_index] = [ind\n",
" for ind\n",
" in available_indices[class_index]\n",
" if ind not in to_remove]\n",
" return available_indices\n",
"\n",
"def _update_example_indices(target, target_name, class_ind):\n",
" for ind, target_ind in enumerate(target):\n",
" example_ids[target_ind] = target_name + \"_{}_{}\".format(ind, class_ind)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "[1, 1, 1]"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 10
}
],
"source": [
"# Index the single column explicitly so that int() receives a scalar rather\n",
"# than a one-element array (that implicit conversion is deprecated since\n",
"# NumPy 1.25).\n",
"n_bad = [int(complementarity_level[class_index, 0] * n_views)\n",
"         for class_index in range(n_classes)]\n",
"n_bad"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "markdown",
"source": [
"Complementarity is defined per class, which means, for example, that the samples of class i can be highly complementary.\n",
"To check that this setting is compatible with the error matrix, \n",
"we now verify that there are enough available indices that are neither redundant nor mutual-error examples.\n",
" "
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "array([False, False, False])"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 12
}
],
"source": [
"# Compare, class by class, the requested number of complementary examples\n",
"# with the number of indices still available for that class. Taking column 0\n",
"# of the (n_classes, 1) array keeps each class paired with its own value.\n",
"(complementarity[:, 0] * n_examples_per_class\n",
" > np.array([len(inds) for inds in available_init_indices]))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# For each class: draw the complementary examples, name them, draw the view\n",
"# indices kept in good_views_indices, derive bad_views_indices as the\n",
"# remaining views, and retire the drawn examples from the available pool.\n",
"# The loop variable gets its own name so that the `complementarity` array is\n",
"# not overwritten and the notebook stays re-runnable.\n",
"for class_index, class_complementarity in enumerate(complementarity[:, 0]):\n",
"    # Number of view indices drawn for each selected example.\n",
"    n_comp = int(complementarity_level[class_index, 0] * n_views)\n",
"    n_selected = int(n_examples_per_class[class_index] * class_complementarity)\n",
"    complementarity_examples[class_index] = rs.choice(\n",
"        available_init_indices[class_index],\n",
"        size=n_selected,\n",
"        replace=False)\n",
"    _update_example_indices(\n",
"        complementarity_examples[class_index],\n",
"        'Complementary', class_index)\n",
"    # size must be this class's scalar count, not the whole n_bad list\n",
"    # (a list would be interpreted as an output shape).\n",
"    good_views_indices[class_index] = [\n",
"        rs.choice(np.arange(n_views),\n",
"                  size=n_comp,\n",
"                  replace=False)\n",
"        for _ in complementarity_examples[class_index]]\n",
"    # Every view that was not drawn above, for each selected example.\n",
"    bad_views_indices[class_index] = [\n",
"        np.array([ind for ind in range(n_views) if ind not in good_views])\n",
"        for good_views in good_views_indices[class_index]]\n",
"    _remove_available(available_init_indices,\n",
"                      complementarity_examples[class_index],\n",
"                      class_index)\n",
"print(bad_views_indices)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
\ No newline at end of file
......@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 1,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -47,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 98,
"execution_count": 2,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 99,
"execution_count": 3,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -84,6 +84,7 @@
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----+------+------+\n",
"| 0.4 | 0.4 | 0.4 | 0.4 |\n",
......@@ -92,8 +93,7 @@
"+------+-----+------+------+\n",
"| 0.4 | 0.5 | 0.52 | 0.55 |\n",
"+------+-----+------+------+\n"
],
"output_type": "stream"
]
}
],
"source": [
......@@ -121,7 +121,7 @@
},
{
"cell_type": "code",
"execution_count": 100,
"execution_count": 4,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -150,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 101,
"execution_count": 5,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -160,13 +160,27 @@
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[array([399, 399, 399, 399]), array([299, 399, 399, 399]), array([399, 333, 319, 299])]\n",
"400.0\n",
"0 0 399 0\n",
"1 0 399 0\n",
"2 0 399 0\n",
"3 0 399 0\n",
"0 1 299 0\n",
"1 1 399 0\n",
"2 1 399 0\n",
"3 1 399 0\n",
"0 2 399 0\n",
"1 2 333 0\n",
"2 2 319 0\n",
"3 2 299 0\n",
"View 1 of shape (1998, 3)\n",
"View 2 of shape (1998, 3)\n",
"View 3 of shape (1998, 3)\n",
"View 4 of shape (1998, 3)\n"
],
"output_type": "stream"
]
}
],
"source": [
......@@ -202,7 +216,7 @@
},
{
"cell_type": "code",
"execution_count": 102,
"execution_count": 6,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -229,12 +243,122 @@
},
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 7,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": "<IPython.core.display.Markdown object>",
"text/markdown": "# Generated dataset description\n\nThe dataset named `demo` has been generated by [SMuDGE](https://gitlab.lis-lab.fr/dev/multiview_generator) and is comprised of \n\n* 1998 examples, splitted in \n* 3 classes, described by \n* 4 views.\n\nThe input error matrix is \n \n| | View 1 | View 2 | View 3 | View 4 |\n|---------|----------|----------|----------|----------|\n| Class 1 | 0.4 | 0.4 | 0.4 | 0.4 |\n| Class 2 | 0.55 | 0.4 | 0.4 | 0.4 |\n| Class 3 | 0.4 | 0.5 | 0.52 | 0.55 |\n\n The classes are balanced as : \n\n* Class 1 : 666 examples (33% of the dataset)\n* Class 2 : 666 examples (33% of the dataset)\n* Class 3 : 666 examples (33% of the dataset)\n\n The views have \n\n* 0.0% redundancy, \n* 0.0% mutual error and \n* 0.0% complementarity,\n\nthe remaining examples are randomly mis-labelled to fit the input error matrix.\n\n## Views description\n\n### View 1\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\n### View 2\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\n### View 3\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 
0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\n### View 4\n\nThis view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n```yaml\nclass_sep: 10.0\nflip_y: 0\nhypercube: true\nn_clusters_per_class: 1\nn_features: 3\nn_informative: 3\nn_redundant: 0\nn_repeated: 0\nscale: 1.0\nshift: 0.0\nshuffle: false\n```\n\nThis report has been automatically generated on April 20, 2020 at 13:59:27"
"text/markdown": [
"# Generated dataset description\n",
"\n",
"The dataset named `demo` has been generated by [SMuDGE](https://gitlab.lis-lab.fr/dev/multiview_generator) and is comprised of \n",
"\n",
"* 1998 examples, splitted in \n",
"* 3 classes, described by \n",
"* 4 views.\n",
"\n",
"The input error matrix is \n",
" \n",
"| | View 1 | View 2 | View 3 | View 4 |\n",
"|---------|----------|----------|----------|----------|\n",
"| Class 1 | 0.4 | 0.4 | 0.4 | 0.4 |\n",
"| Class 2 | 0.55 | 0.4 | 0.4 | 0.4 |\n",
"| Class 3 | 0.4 | 0.5 | 0.52 | 0.55 |\n",
"\n",
" The classes are balanced as : \n",
"\n",
"* Class 1 : 666 examples (33% of the dataset)\n",
"* Class 2 : 666 examples (33% of the dataset)\n",
"* Class 3 : 666 examples (33% of the dataset)\n",
"\n",
" The views have \n",
"\n",
"* 0.0% redundancy, \n",
"* 0.0% mutual error and \n",
"* 0.0% complementarity,\n",
"\n",
"the remaining examples are randomly mis-labelled to fit the input error matrix.\n",
"\n",
"## Views description\n",
"\n",
"### View 1\n",
"\n",
"This view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n",
"```yaml\n",
"class_sep: 10.0\n",
"flip_y: 0\n",
"hypercube: true\n",
"n_clusters_per_class: 1\n",
"n_features: 3\n",
"n_informative: 3\n",
"n_redundant: 0\n",
"n_repeated: 0\n",
"scale: 1.0\n",
"shift: 0.0\n",
"shuffle: false\n",
"```\n",
"\n",
"### View 2\n",
"\n",
"This view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n",
"```yaml\n",
"class_sep: 10.0\n",
"flip_y: 0\n",
"hypercube: true\n",
"n_clusters_per_class: 1\n",
"n_features: 3\n",
"n_informative: 3\n",
"n_redundant: 0\n",
"n_repeated: 0\n",
"scale: 1.0\n",
"shift: 0.0\n",
"shuffle: false\n",
"```\n",
"\n",
"### View 3\n",
"\n",
"This view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n",
"```yaml\n",
"class_sep: 10.0\n",
"flip_y: 0\n",
"hypercube: true\n",
"n_clusters_per_class: 1\n",
"n_features: 3\n",
"n_informative: 3\n",
"n_redundant: 0\n",
"n_repeated: 0\n",
"scale: 1.0\n",
"shift: 0.0\n",
"shuffle: false\n",
"```\n",
"\n",
"### View 4\n",
"\n",
"This view is generated with [`make_classification`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html), with the following configuration : \n",
"```yaml\n",
"class_sep: 10.0\n",
"flip_y: 0\n",
"hypercube: true\n",
"n_clusters_per_class: 1\n",
"n_features: 3\n",
"n_informative: 3\n",
"n_redundant: 0\n",
"n_repeated: 0\n",
"scale: 1.0\n",
"shift: 0.0\n",
"shuffle: false\n",
"```\n",
"\n",
"This report has been automatically generated on April 22, 2020 at 09:19:24"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
......@@ -243,14 +367,7 @@
"source": [
"from IPython.display import display,Markdown\n",
"display(Markdown(description))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
]
},
{
"cell_type": "markdown",
......@@ -265,7 +382,7 @@
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": 8,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -276,7 +393,7 @@
"source": [
"%%capture\n",
"\n",
"generator.gen_report(output_path=\".\", save=True)"
"generator.gen_report(output_path=\"supplementary_material\", save=True)"
]
},
{
......@@ -297,7 +414,7 @@
},
{
"cell_type": "code",
"execution_count": 105,
"execution_count": 9,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -306,7 +423,7 @@
},
"outputs": [],
"source": [
"generator.to_hdf5_mc(saving_path='.')"
"generator.to_hdf5_mc(saving_path='supplementary_material')"
]
},
{
......@@ -326,7 +443,7 @@
},
{
"cell_type": "code",
"execution_count": 106,
"execution_count": 10,
"metadata": {
"pycharm": {
"is_executing": false,
......@@ -376,6 +493,11 @@
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"The figure shows us the dataset with a 3D-subplot for each view. It is possible to remove the samples of a specific class by clicking on a label in the legend.\n",
"\n",
......@@ -385,17 +507,17 @@
"the [DecisionTree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) is a good approximation of the Bayes classifier.\n",
"\n",
"In order to estimate the test error in the dataset for each class with a Decision Tree, we use a [StratifiedKFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold) :"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
]
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": 11,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from sklearn.model_selection import StratifiedKFold\n",
......@@ -409,34 +531,33 @@
"\n",
"# Getting the lists of train and test sample indices for each fold.\n",
"folds = [[list(train), list(test)] for train, test in folds]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
]
},
{
"cell_type": "markdown",
"source": [
"Then, we get a Decision Tree of depth 3 (as each view has 3 features), and fit it on each view, for each fold. \n",
"The outputted score is the cross-validation score on the 5 folds. "
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
"source": [
"Then, we get a Decision Tree of depth 3 (as each view has 3 features), and fit it on each view, for each fold. \n",
"The outputted score is the cross-validation score on the 5 folds. "
]
},
{
"cell_type": "code",
"execution_count": 108,
"execution_count": 12,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%% \n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input error matrix : \n",
"+------+-----+------+------+\n",
......@@ -464,8 +585,7 @@
"+-------------+-------------+-------------+-------------+\n",
"| -0.0429429 | -0.0045045 | -0.0145345 | 0.00795796 |\n",
"+-------------+-------------+-------------+-------------+\n"
],
"output_type": "stream"
]
}
],
"source": [
......@@ -500,17 +620,15 @@
" output[class_index, view_index] = 1-confusion_mat[view_index, class_index, class_index]/n_sample_per_class[view_index, class_index]\n",
" \n",
"print(\"Input error matrix : \\n{}\\n\\nOutputted error matrix : \\n{}\\n\\nDifference :\\n{}\".format(tabulate(error_matrix, tablefmt='grid'), tabulate(output, tablefmt='grid'), tabulate(error_matrix-output, tablefmt='grid')))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% \n",
"is_executing": false
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"Here, we can see that there is a slight difference between the input error matrix and the output one.\n",
"\n",
......@@ -518,13 +636,7 @@
"\n",
"In this demo, we used SMuDGE to generate a basic multiview dataset, and we performed a naive analysis on it. \n",
"The next tutorial will be focused on introducing redundancy, mutual error and complementarity. "
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
]
}
],
"metadata": {
......@@ -548,13 +660,13 @@