diff --git a/README.md b/README.md
index ffe9cf482357ece53efd90ae72e6e086ee6bb235..92cc0de369d72c97e4799749e54da12f1506f5cf 100644
--- a/README.md
+++ b/README.md
@@ -42,35 +42,48 @@ Training usage:
 python trainer.py --name <name> --train_filename <path> [options]
 
 optional arguments:
-  -h, --help            show this help message and exit
-  --gpus <int>          list of gpus to use (-1 = all in CUDA_VISIBLE_DEVICES)
-  --nodes <int>         number of nodes for distributed training (see pytorch_lightning doc)
-  --name <str>          experiment name
-  --fast_dev_run        run one batch to check that training works
-  --train_filename <path>  name of json file containing training/validation instances
-  --learning_rate <float>  learning rate (default=2e-5)
-  --batch_size <int>    size of batch (default=32)
-  --epochs <int>        number of epochs (default=20)
-  --valid_size_percent <int>  validation set size in % (default=10)
-  --max_len <int>       max sequence length (default=256)
-  --bert_flavor <path>  pretrained bert model (default=monologg/biobert_v1.1_pubmed
-  --selected_features <list>  list of features to load from input (default=title abstract)
-  --dropout <float>     dropout after bert
-  --loss <bce|f1>       choose loss function [f1, bce] (default=f1)
-  --augment_data        simulate missing abstract through augmentation (default=do not augment data)
+  -h, --help            show this help message and exit
+  --gpus GPUS           ids of GPUs to use (use -1 for all available GPUs, defaults to CPU)
+  --nodes NODES         number of computation nodes for distributed training (see lightning docs, defaults to 1)
+  --name NAME           name of experiment (required)
+  --fast_dev_run        run a single batch through the whole training loop for catching bugs
+  --seed SEED           set global random seed (defaults to 123)
+  --stem STEM           stem name of json files containing training/validation/test instances (<stem>.{train,valid,test})
+  --learning_rate LEARNING_RATE  learning rate (default=2e-5)
+  --batch_size BATCH_SIZE  size of batch (default=32)
+  --epochs EPOCHS       number of epochs (default=20)
+  --valid_size_percent VALID_SIZE_PERCENT  validation set size in % (default=10)
+  --max_len MAX_LEN     max sequence length (default=256)
+  --selected_features SELECTED_FEATURES [SELECTED_FEATURES ...]
+                        list of features to load from input (default=title abstract)
+  --dropout DROPOUT     dropout after bert
+  --loss LOSS           choose loss function [f1, bce] (default=bce)
+  --augment_data        simulate missing abstract through augmentation (default=do not augment data)
+  --transfer TRANSFER   transfer weights from checkpoint (default=do not transfer)
+  --model MODEL         model type [rnn, cnn, bert] (default=bert)
+  --bert_flavor BERT_FLAVOR  pretrained bert model (default=monologg/biobert_v1.1_pubmed)
+  --rnn_embed_size RNN_EMBED_SIZE  rnn embedding size (default=128)
+  --rnn_hidden_size RNN_HIDDEN_SIZE  rnn hidden size (default=128)
+  --rnn_layers RNN_LAYERS  rnn number of layers (default=1)
+  --cnn_embed_size CNN_EMBED_SIZE  cnn embedding size (default=128)
+  --cnn_hidden_size CNN_HIDDEN_SIZE  cnn hidden size (default=128)
+  --cnn_layers CNN_LAYERS  cnn number of layers (default=1)
+  --cnn_kernel_size CNN_KERNEL_SIZE  cnn kernel size (default=3)
+  --scheduler SCHEDULER  learning rate schedule [warmup_linear] (default=fixed learning rate)
+  --scheduler_warmup SCHEDULER_WARMUP  learning rate schedule warmup epochs (default=1)
 ```
 
 Example training command line:
 ```
-python trainer.py --gpus=-1 --name test1 --train_filename ../scrappers/data/20200529/litcovid.json
+python trainer.py --gpus=-1 --name test1 --stem ../scrappers/data/20200615/folds/litcovid-0
 ```
 
-Logs are saved in `lightning_logs/`, best `val_loss` checkpoints in `checkpoints/`.
+Logs are saved in `logs/`: each experiment gets a `run.json` file with its hyperparameters and per-epoch metrics, plus two checkpoints (the best and the last).
+The best checkpoint is used for testing.
 
-pytorch-lightning provides a tensorboard logger. You can check it with
+The logger provides a simplified TensorBoard-like viewer. Run it with
 ```
-tensorboard --logdir lightning_logs
+python logger.py
 ```
 Then point your browser to http://localhost:6006/.
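The `run.json` file mentioned above can be inspected programmatically. A minimal sketch, assuming the per-experiment file lives at `logs/<name>/run.json` and holds a `metrics` dict keyed by epoch, as logger.py's own `__main__` reads it back (`logs['metrics']`); the experiment name `test1` comes from the example command:

```
import json
import os

# Sketch: print the per-epoch value of the default checkpoint metric
# ('val_loss', per Logger.__init__). Assumes logs/<name>/run.json with a
# 'metrics' dict keyed by epoch, matching what logger.py reads back.
def summarize(name, logdir='logs'):
    with open(os.path.join(logdir, name, 'run.json')) as fp:
        logs = json.load(fp)
    for epoch in sorted(logs['metrics'], key=int):  # JSON keys are strings
        row = logs['metrics'][epoch]
        if 'val_loss' in row:
            print(epoch, row['val_loss'])

summarize('test1')
```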
diff --git a/generate_experiments.py b/generate_experiments.py
new file mode 100644
index 0000000000000000000000000000000000000000..a12c5c5ad28ebb6d03aa39036af3188f7a43c6d7
--- /dev/null
+++ b/generate_experiments.py
@@ -0,0 +1,21 @@
+
+data_dir = '../scrappers/data/20200615'
+exp = 0
+for fold in range(5):
+    for dataset in ['bibliovid', 'litcovid']:
+        for loss in ['bce', 'f1']:
+            for learning_rate in [1e-3, 2e-5]:
+                for model in ['bert', 'rnn', 'cnn']:
+                    if model in ['rnn', 'cnn']:
+                        for layers in [1, 2, 4]:
+                            for hidden in [128, 256, 512]:
+                                if model == 'cnn':
+                                    for kernel in [2, 3, 5]:
+                                        print(f"{exp} python trainer.py --gpus -1 --name exp_{exp} --epochs 20 --scheduler warmup_linear --scheduler_warmup 2 --stem '{data_dir}/folds/{dataset}-{fold}' --loss {loss} --model {model} --learning_rate {learning_rate} --{model}_layers {layers} --{model}_embed_size 256 --{model}_hidden_size {hidden} --cnn_kernel_size {kernel}")
+                                        exp += 1
+                                elif model == 'rnn':
+                                    print(f"{exp} python trainer.py --gpus -1 --name exp_{exp} --epochs 20 --scheduler warmup_linear --scheduler_warmup 2 --stem '{data_dir}/folds/{dataset}-{fold}' --loss {loss} --model {model} --learning_rate {learning_rate} --{model}_layers {layers} --{model}_embed_size 256 --{model}_hidden_size {hidden}")
+                                    exp += 1
+                    elif model == 'bert':
+                        print(f"{exp} python trainer.py --gpus -1 --name exp_{exp} --epochs 20 --scheduler warmup_linear --scheduler_warmup 2 --stem '{data_dir}/folds/{dataset}-{fold}' --loss {loss} --model {model} --learning_rate {learning_rate}")
+                        exp += 1
diff --git a/logger.py b/logger.py
index cc61b64183c0762dd9c82bf8227fe0320bdccc56..1ae7a586129362e3360ce4e80e0e3837f5ab2c9d 100644
--- a/logger.py
+++ b/logger.py
@@ -4,7 +4,7 @@ import sys
 import collections
 
 class Logger:
-    def __init__(self, name, checkpoint_metric='val_loss', logdir='logs'):
+    def __init__(self, name, checkpoint_metric='val_loss', logdir='logs', save_checkpoints=True):
         self.directory = os.path.join(logdir, name)
         os.makedirs(self.directory, exist_ok=True)
         self.metrics = collections.defaultdict(dict)
@@ -14,16 +14,19 @@ class Logger:
         self.best_checkpoint = os.path.join(self.directory, 'best_checkpoint')
         self.test_metrics = {}
         self.save_function = None
+        self.save_checkpoints = save_checkpoints
 
     def set_save_function(self, save_function):
         self.save_function = save_function
 
     def log_metrics(self, epoch, metrics):
         self.metrics[epoch].update(metrics)
-        self.save_function(os.path.join(self.directory, 'last_checkpoint'))
+        if self.save_checkpoints:
+            self.save_function(os.path.join(self.directory, 'last_checkpoint'))
         if self.checkpoint_metric in metrics and (self.best_loss is None or metrics[self.checkpoint_metric] > self.best_loss):
             self.best_loss = metrics[self.checkpoint_metric]
-            self.save_function(os.path.join(self.directory, 'best_checkpoint'))
+            if self.save_checkpoints:
+                self.save_function(os.path.join(self.directory, 'best_checkpoint'))
         self.save()
 
     def log_test(self, metrics):
@@ -73,7 +76,7 @@ if __name__ == '__main__':
     for row in logs['metrics'].values():
         metrics.update(row.keys())
 
-    buttons = '<div id="buttons">' + ' | '.join(['<a href="#" onclick="update(\'%s\')">%s</a>' % (metric, metric) for metric in sorted(metrics)]) + '</div>'
+    buttons = '<div id="buttons">' + ' | '.join(['<a href="#" onclick="update(\'%s\')">%s</a>' % (metric, metric) for metric in sorted(metrics)]) + ' | <input name="filter" type="text" id="filter" placeholder="filter"></div>'
     html = buttons + """<canvas id="canvas">
 <script
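For reference, a minimal usage sketch of the `Logger` API as changed above, using only calls that appear in the diff; the metric values are invented, and `save_checkpoints=False` exercises the new flag that suppresses checkpoint writing:

```
from logger import Logger

# Sketch using only calls shown in the diff: constructor, set_save_function,
# log_metrics. With save_checkpoints=False the save_function is never
# invoked, so a no-op callback is enough here.
log = Logger('demo', checkpoint_metric='val_loss', save_checkpoints=False)
log.set_save_function(lambda path: None)  # the trainer normally registers its checkpoint writer
for epoch in range(3):
    log.log_metrics(epoch, {'train_loss': 1.0 / (epoch + 1), 'val_loss': 0.9 ** epoch})  # invented values
```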
src="https://pageperso.lis-lab.fr/benoit.favre/files/autoplot.js"></script> <script> diff --git a/model.py b/model.py index a310f6cf61248f79cb05aeddbe676963d905fb82..9e34cb59496849679b947a8bee3e766a24542e8b 100644 --- a/model.py +++ b/model.py @@ -13,7 +13,8 @@ from transformers import AutoModel import data # based on https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric -def binary_f1_score_with_logits(y_pred, y_true, epsilon=1e-7): +def binary_f1_score_with_logits(y_pred, y_true): + epsilon = 1e-7 y_pred = torch.sigmoid(y_pred) y_true = y_true.float() @@ -34,8 +35,10 @@ def binary_f1_score_with_logits(y_pred, y_true, epsilon=1e-7): class RNNLayer(nn.Module): def __init__(self, hidden_size=128, dropout=0.3): super().__init__() + rnn_layers = 1 + directions = 2 rnn_output = hidden_size * rnn_layers * directions - self.rnn = nn.GRU(hidden_size, hidden_size, bias=True, num_layers=1, bidirectional=True, batch_first=True) + self.rnn = nn.GRU(hidden_size, hidden_size, bias=True, num_layers=1, bidirectional=(directions == 2), batch_first=True) self.dense = nn.Linear(rnn_output, hidden_size) self.dropout = nn.Dropout(dropout) self.norm = nn.LayerNorm(hidden_size) @@ -54,12 +57,13 @@ class RNN(nn.Module): self.layers = nn.ModuleList([RNNLayer(hidden_size=hidden_size, dropout=dropout) for i in range(num_layers)]) self.dropout = nn.Dropout(dropout) - def forward(self, x_text): - embed = self.dropout(self.embed(x_text)) + def forward(self, x): + embed = self.dropout(self.embed(x)) activations = self.embed_to_rnn(F.gelu(embed)) for layer in self.layers: activations = layer(activations) - return activations + pool = F.max_pool1d(activations.transpose(1, 2), activations.size(1)) + return pool.view(x.shape[0], -1) class CNNLayer(nn.Module): @@ -143,7 +147,8 @@ class Model(LightningModule): else: raise ValueError('invalid model type "%s"' % self.hparams.model) - return self.decision(F.gelu(self.dropout(output))) + decision = self.decision(F.gelu(self.dropout(output))) + return decision def training_step(self, batch, batch_idx): x, y = batch @@ -273,8 +278,8 @@ class Model(LightningModule): parser.add_argument('--loss', default='bce', type=str, help='choose loss function [f1, bce] (default=bce)') parser.add_argument('--augment_data', default=False, action='store_true', help='simulate missing abstract through augmentation (default=do not augment data)') parser.add_argument('--transfer', default=None, type=str, help='transfer weights from checkpoint (default=do not transfer)') - parser.add_argument('--model', default='bert', type=str, help='model type [rnn, bert] (default=bert)') - parser.add_argument('--bert_flavor', default='monologg/biobert_v1.1_pubmed', type=str, help='pretrained bert model (default=monologg/biobert_v1.1_pubmed') + parser.add_argument('--model', default='bert', type=str, help='model type [rnn, cnn, bert] (default=bert)') + parser.add_argument('--bert_flavor', default='monologg/biobert_v1.1_pubmed', type=str, help='pretrained bert model (default=monologg/biobert_v1.1_pubmed)') parser.add_argument('--rnn_embed_size', default=128, type=int, help='rnn embedding size (default=128)') parser.add_argument('--rnn_hidden_size', default=128, type=int, help='rnn hidden size (default=128)') parser.add_argument('--rnn_layers', default=1, type=int, help='rnn number of layers (default=1)')