Commit eccc6d81 authored by Benoit Favre

add script to generate experiments; fix a few things so that experiments run correctly

parent 14243bb9
@@ -43,34 +43,47 @@ usage: python trainer.py --name <name> --train_filename <path> [options]
optional arguments:
-h, --help show this help message and exit
-  --gpus <int> list of gpus to use (-1 = all in CUDA_VISIBLE_DEVICES)
-  --nodes <int> number of nodes for distributed training (see pytorch_lightning doc)
-  --name <str> experiment name
-  --fast_dev_run run one batch to check that training works
-  --train_filename <path> name of json file containing training/validation instances
-  --learning_rate <float> learning rate (default=2e-5)
-  --batch_size <int> size of batch (default=32)
-  --epochs <int> number of epochs (default=20)
-  --valid_size_percent <int> validation set size in % (default=10)
-  --max_len <int> max sequence length (default=256)
-  --bert_flavor <path> pretrained bert model (default=monologg/biobert_v1.1_pubmed
-  --selected_features <list> list of features to load from input (default=title abstract)
-  --dropout <float> dropout after bert
-  --loss <bce|f1> choose loss function [f1, bce] (default=f1)
+  --gpus GPUS ids of GPUs to use (use -1 for all available GPUs, defaults to CPU)
+  --nodes NODES number of computation nodes for distributed training (see lightning docs, defaults to 1)
+  --name NAME name of experiment (required)
+  --fast_dev_run run a single batch through the whole training loop for catching bugs
+  --seed SEED set global random seed (defaults to 123)
+  --stem STEM stem name of json files containing training/validation/test instances (<stem>.{train,valid,test})
+  --learning_rate LEARNING_RATE learning rate (default=2e-5)
+  --batch_size BATCH_SIZE size of batch (default=32)
+  --epochs EPOCHS number of epochs (default=20)
+  --valid_size_percent VALID_SIZE_PERCENT validation set size in % (default=10)
+  --max_len MAX_LEN max sequence length (default=256)
+  --selected_features SELECTED_FEATURES [SELECTED_FEATURES ...] list of features to load from input (default=title abstract)
+  --dropout DROPOUT dropout after bert
+  --loss LOSS choose loss function [f1, bce] (default=bce)
+  --augment_data simulate missing abstract through augmentation (default=do not augment data)
+  --transfer TRANSFER transfer weights from checkpoint (default=do not transfer)
+  --model MODEL model type [rnn, cnn, bert] (default=bert)
+  --bert_flavor BERT_FLAVOR pretrained bert model (default=monologg/biobert_v1.1_pubmed)
+  --rnn_embed_size RNN_EMBED_SIZE rnn embedding size (default=128)
+  --rnn_hidden_size RNN_HIDDEN_SIZE rnn hidden size (default=128)
+  --rnn_layers RNN_LAYERS rnn number of layers (default=1)
+  --cnn_embed_size CNN_EMBED_SIZE cnn embedding size (default=128)
+  --cnn_hidden_size CNN_HIDDEN_SIZE cnn hidden size (default=128)
+  --cnn_layers CNN_LAYERS cnn number of layers (default=1)
+  --cnn_kernel_size CNN_KERNEL_SIZE cnn kernel size (default=3)
+  --scheduler SCHEDULER learning rate schedule [warmup_linear] (default=fixed learning rate)
+  --scheduler_warmup SCHEDULER_WARMUP learning rate schedule warmup epochs (default=1)
```
Example training command line:
```
-python trainer.py --gpus=-1 --name test1 --train_filename ../scrappers/data/20200529/litcovid.json
+python trainer.py --gpus=-1 --name test1 --stem ../scrappers/data/20200615/folds/litcovid-0
```
-Logs are saved in `lightning_logs/`, best `val_loss` checkpoints in `checkpoints/`.
+Logs are saved in `logs/`; each experiment gets a `run.json` file with its hyperparameters and per-epoch metrics, plus two checkpoints: the best and the last.
+The best checkpoint is used for testing.
-pytorch-lightning provides a tensorboard logger. You can check it with
+The logger provides a simplified tensorboard-like facility. Run it with
```
-tensorboard --logdir lightning_logs
+python logger.py
```
Then point your browser to http://localhost:6006/.
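For quick inspection without the browser, a run's `run.json` can also be read directly. A minimal sketch, assuming the file's `metrics` field maps epochs to metric dictionaries (the layout the plotting code in this commit reads); the experiment name `test1` is illustrative:
```
import json

# Load the per-experiment log described above; 'test1' is a placeholder name.
with open('logs/test1/run.json') as fp:
    run = json.load(fp)

# Print val_loss per epoch; JSON object keys are strings, hence key=int.
for epoch in sorted(run['metrics'], key=int):
    print(epoch, run['metrics'][epoch].get('val_loss'))
```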
The new experiment-generation script enumerates the hyperparameter grid and prints one numbered training command per configuration:
+data_dir = '../scrappers/data/20200615'
+exp = 0
+for fold in range(5):
+    for dataset in ['bibliovid', 'litcovid']:
+        for loss in ['bce', 'f1']:
+            for learning_rate in [1e-3, 2e-5]:
+                for model in ['bert', 'rnn', 'cnn']:
+                    if model in ['rnn', 'cnn']:
+                        for layers in [1, 2, 4]:
+                            for hidden in [128, 256, 512]:
+                                if model == 'cnn':
+                                    for kernel in [2, 3, 5]:
+                                        print(f"{exp} python trainer.py --gpus -1 --name exp_{exp} --epochs 20 --scheduler warmup_linear --scheduler_warmup 2 --stem '{data_dir}/folds/{dataset}-{fold}' --loss {loss} --model {model} --learning_rate {learning_rate} --{model}_layers {layers} --{model}_embed_size 256 --{model}_hidden_size {hidden} --cnn_kernel_size {kernel}")
+                                        exp += 1
+                                elif model == 'rnn':
+                                    print(f"{exp} python trainer.py --gpus -1 --name exp_{exp} --epochs 20 --scheduler warmup_linear --scheduler_warmup 2 --stem '{data_dir}/folds/{dataset}-{fold}' --loss {loss} --model {model} --learning_rate {learning_rate} --{model}_layers {layers} --{model}_embed_size 256 --{model}_hidden_size {hidden}")
+                                    exp += 1
+                    elif model == 'bert':
+                        print(f"{exp} python trainer.py --gpus -1 --name exp_{exp} --epochs 20 --scheduler warmup_linear --scheduler_warmup 2 --stem '{data_dir}/folds/{dataset}-{fold}' --loss {loss} --model {model} --learning_rate {learning_rate}")
+                        exp += 1
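Because each emitted line begins with the experiment index, a scheduler (for instance a SLURM array job) can pick out and run exactly one line. A minimal sketch, assuming the script's output was redirected to `experiments.txt` (a hypothetical filename):
```
import subprocess
import sys

# Usage: python run_one.py <experiment index>; run_one.py is hypothetical.
n = int(sys.argv[1])
with open('experiments.txt') as fp:
    for line in fp:
        # Each line is "<index> <command>"; split off the index.
        exp, cmd = line.rstrip('\n').split(' ', 1)
        if int(exp) == n:
            subprocess.run(cmd, shell=True, check=True)
            break
```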
@@ -4,7 +4,7 @@ import sys
import collections
class Logger:
-    def __init__(self, name, checkpoint_metric='val_loss', logdir='logs'):
+    def __init__(self, name, checkpoint_metric='val_loss', logdir='logs', save_checkpoints=True):
        self.directory = os.path.join(logdir, name)
        os.makedirs(self.directory, exist_ok=True)
        self.metrics = collections.defaultdict(dict)
@@ -14,15 +14,18 @@ class Logger:
        self.best_checkpoint = os.path.join(self.directory, 'best_checkpoint')
        self.test_metrics = {}
        self.save_function = None
+        self.save_checkpoints = save_checkpoints

    def set_save_function(self, save_function):
        self.save_function = save_function

    def log_metrics(self, epoch, metrics):
        self.metrics[epoch].update(metrics)
+        if self.save_checkpoints:
            self.save_function(os.path.join(self.directory, 'last_checkpoint'))
        # keep the checkpoint with the lowest value of checkpoint_metric
        if self.checkpoint_metric in metrics and (self.best_loss is None or metrics[self.checkpoint_metric] < self.best_loss):
            self.best_loss = metrics[self.checkpoint_metric]
+            if self.save_checkpoints:
                self.save_function(os.path.join(self.directory, 'best_checkpoint'))
        self.save()
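From the training side, the expected use of this API is to register a checkpoint callback and then log metrics once per epoch. A minimal usage sketch; `save_model` and the metric values are placeholders, not code from this repository:
```
from logger import Logger

def save_model(path):
    pass  # placeholder: serialize the model weights to `path`

logger = Logger('exp_0', checkpoint_metric='val_loss')
logger.set_save_function(save_model)
for epoch in range(3):
    # Fake improving losses; the best checkpoint tracks the lowest val_loss.
    logger.log_metrics(epoch, {'train_loss': 0.5 - 0.1 * epoch,
                               'val_loss': 0.6 - 0.1 * epoch})
```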
@@ -73,7 +76,7 @@ if __name__ == '__main__':
    for row in logs['metrics'].values():
        metrics.update(row.keys())
-    buttons = '<div id="buttons">' + ' | '.join(['<a href="#" onclick="update(\'%s\')">%s</a>' % (metric, metric) for metric in sorted(metrics)]) + '</div>'
+    buttons = '<div id="buttons">' + ' | '.join(['<a href="#" onclick="update(\'%s\')">%s</a>' % (metric, metric) for metric in sorted(metrics)]) + ' | <input name="filter" type="text" id="filter" placeholder="filter"></div>'
    html = buttons + """<canvas id="canvas">
<script src="https://pageperso.lis-lab.fr/benoit.favre/files/autoplot.js"></script>
<script>
@@ -13,7 +13,8 @@ from transformers import AutoModel
import data
# based on https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
-def binary_f1_score_with_logits(y_pred, y_true, epsilon=1e-7):
+def binary_f1_score_with_logits(y_pred, y_true):
+    epsilon = 1e-7
    y_pred = torch.sigmoid(y_pred)
    y_true = y_true.float()
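The rest of this function is elided between hunks; per its comment it follows the referenced Kaggle kernel. For reference, the usual soft-F1 formulation looks like this; a sketch of the standard technique, not code copied from this repository:
```
import torch

def soft_f1_loss(y_pred, y_true, epsilon=1e-7):
    # Differentiable F1: probabilities stand in for hard 0/1 predictions.
    y_pred = torch.sigmoid(y_pred)
    y_true = y_true.float()
    tp = (y_true * y_pred).sum(dim=0)          # soft true positives per class
    fp = ((1 - y_true) * y_pred).sum(dim=0)    # soft false positives
    fn = (y_true * (1 - y_pred)).sum(dim=0)    # soft false negatives
    f1 = 2 * tp / (2 * tp + fp + fn + epsilon)
    return 1 - f1.mean()                       # minimize 1 - F1
```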
@@ -34,8 +35,10 @@ def binary_f1_score_with_logits(y_pred, y_true, epsilon=1e-7):
class RNNLayer(nn.Module):
    def __init__(self, hidden_size=128, dropout=0.3):
        super().__init__()
+        rnn_layers = 1
+        directions = 2
+        rnn_output = hidden_size * rnn_layers * directions
-        self.rnn = nn.GRU(hidden_size, hidden_size, bias=True, num_layers=1, bidirectional=True, batch_first=True)
+        self.rnn = nn.GRU(hidden_size, hidden_size, bias=True, num_layers=1, bidirectional=(directions == 2), batch_first=True)
+        self.dense = nn.Linear(rnn_output, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)
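Note that `nn.GRU` produces per-step outputs of size `hidden_size * directions` regardless of `num_layers`, so the `rnn_output` computation above is only correct while `rnn_layers` stays at 1.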
@@ -54,12 +57,13 @@ class RNN(nn.Module):
        self.layers = nn.ModuleList([RNNLayer(hidden_size=hidden_size, dropout=dropout) for i in range(num_layers)])
        self.dropout = nn.Dropout(dropout)

-    def forward(self, x_text):
-        embed = self.dropout(self.embed(x_text))
+    def forward(self, x):
+        embed = self.dropout(self.embed(x))
        activations = self.embed_to_rnn(F.gelu(embed))
        for layer in self.layers:
            activations = layer(activations)
-        return activations
+        pool = F.max_pool1d(activations.transpose(1, 2), activations.size(1))
+        return pool.view(x.shape[0], -1)
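The new pooling collapses the per-token activations into one fixed-size vector per document by taking the maximum over the time axis. A small shape walk-through with illustrative dimensions:
```
import torch
import torch.nn.functional as F

activations = torch.randn(4, 256, 128)            # (batch, time, hidden)
pool = F.max_pool1d(activations.transpose(1, 2),  # (batch, hidden, time)
                    activations.size(1))          # kernel spans all of time
print(pool.view(4, -1).shape)                     # torch.Size([4, 128])
```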
class CNNLayer(nn.Module):
@@ -143,7 +147,8 @@ class Model(LightningModule):
        else:
            raise ValueError('invalid model type "%s"' % self.hparams.model)
-        return self.decision(F.gelu(self.dropout(output)))
+        decision = self.decision(F.gelu(self.dropout(output)))
+        return decision

    def training_step(self, batch, batch_idx):
        x, y = batch
@@ -273,8 +278,8 @@ class Model(LightningModule):
parser.add_argument('--loss', default='bce', type=str, help='choose loss function [f1, bce] (default=bce)')
parser.add_argument('--augment_data', default=False, action='store_true', help='simulate missing abstract through augmentation (default=do not augment data)')
parser.add_argument('--transfer', default=None, type=str, help='transfer weights from checkpoint (default=do not transfer)')
-parser.add_argument('--model', default='bert', type=str, help='model type [rnn, bert] (default=bert)')
-parser.add_argument('--bert_flavor', default='monologg/biobert_v1.1_pubmed', type=str, help='pretrained bert model (default=monologg/biobert_v1.1_pubmed')
+parser.add_argument('--model', default='bert', type=str, help='model type [rnn, cnn, bert] (default=bert)')
+parser.add_argument('--bert_flavor', default='monologg/biobert_v1.1_pubmed', type=str, help='pretrained bert model (default=monologg/biobert_v1.1_pubmed)')
parser.add_argument('--rnn_embed_size', default=128, type=int, help='rnn embedding size (default=128)')
parser.add_argument('--rnn_hidden_size', default=128, type=int, help='rnn hidden size (default=128)')
parser.add_argument('--rnn_layers', default=1, type=int, help='rnn number of layers (default=1)')
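Put together with the new flags, a CNN run could look like the following; the data path mirrors the README example and is illustrative:
```
python trainer.py --gpus -1 --name cnn_test --stem ../scrappers/data/20200615/folds/litcovid-0 --model cnn --loss f1 --cnn_layers 2 --cnn_hidden_size 256 --cnn_kernel_size 3
```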