Commit 6607203e authored by Franck Dary

Added scripts to launch experiments from my thesis

parent f2ce7a80
# To untrack this file, do: git update-index --skip-worktree batches.py
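# T is used as a plain namespace: reset() sets the hyperparameters and feature
# targets, refresh() rebuilds the machine templates from them. Both are called
# through the class (T.reset(), T.refresh()) below.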
class T :
def reset() :
T.dropout = "InputDropout : 0.5"
T.mlp = "MLP : {3200 0.4 1600 0.4}"
T.loss = "Loss : crossentropy"
T.optim = "Optimizer : Adagrad {0.01 0.000001 0 0.0000000001}"
T.focusedBuffer = "0"
T.focusedStack = ""
T.contextualWindow = "-10 0"
T.contextualWindowSeq = "-10 10"
T.contextualTargets = "b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
T.contextualTargetsSeq = "b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
T.contextualTargetsFlat = "b.-3 b.-2 b.-1 b.0"
T.contextTargets = "b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
T.contextTargetsSeq = "b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
T.contextTargetsFlat = "b.-3 b.-2 b.-1 b.0"
T.rawInputLeft = "5"
T.rawInputRight = "10"
T.historyNb = "10"
T.historyMineNb = "2"
T.lstmOpt = "1 1 0.0 1"
T.lstmIn = "128"
T.lstmOut = "64"
T.stateNameSize = "64"
T.prefixSize = "5"
T.suffixSize = "5"
T.prefixBuffer = "0"
T.suffixBuffer = "0"
T.prefixStack = ""
T.suffixStack = ""
T.distThreshold = "15"
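# Rebuild the machine-description templates and the feature-module strings from
# the values set in reset(). The %%s left in each classifier block is filled
# later with the feature modules of the corresponding machine.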
def refresh() :
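# Incremental machine with a single classifier ("tokeparser") shared by all tasks:
# the strategy chains tokenizer -> tagger -> morpho -> lemmatizer -> parser ->
# segmenter word by word, the segmenter handing control back to the tokenizer.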
T.templateIncr1Class = """Name : Tokenizer, Tagger, Morpho, Lemmatizer, Parser and Segmenter Machine
Classifier : tokeparser
{
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Splitwords : data/splitwords.ts
Predictions : ID FORM UPOS FEATS LEMMA HEAD DEPREL EOS
Strategy
{
Block : End{cannotMove}
tokenizer tagger ENDWORD 0
tokenizer tagger SPLIT 0
tokenizer tokenizer * 0
tagger morpho * 0
morpho lemmatizer_rules * 0
lemmatizer_rules lemmatizer_case * 0
lemmatizer_case parser * 0
parser segmenter eager_SHIFT 0
parser segmenter eager_RIGHT_rel 0
parser parser * 0
segmenter tokenizer * 1
}
"""%(T.dropout, T.mlp, T.optim, T.loss)
T.templateSeq1Class = """Name : Tokenizer, Tagger, Morpho, Lemmatizer, Parser and Segmenter Machine
Classifier : tokeparser
{
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Splitwords : data/splitwords.ts
Predictions : ID FORM UPOS FEATS LEMMA HEAD DEPREL EOS
Strategy
{
Block : End{cannotMove}
tokenizer tokenizer ENDWORD 1
tokenizer tokenizer SPLIT 1
tokenizer tokenizer * 0
Block : End{cannotMove}
tagger tagger * 1
Block : End{cannotMove}
morpho morpho * 1
Block : End{cannotMove}
lemmatizer_rules lemmatizer_case * 0
lemmatizer_case lemmatizer_rules * 1
Block : End{cannotMove}
parser segmenter eager_SHIFT 0
parser segmenter eager_RIGHT_rel 0
parser parser * 0
segmenter parser * 1
}
"""%(T.dropout, T.mlp, T.optim, T.loss)
T.templateIncrNClass = """Name : Tokenizer, Tagger, Morpho, Lemmatizer, Parser and Segmenter Machine
Classifier : tokenizer
{
Transitions : {tokenizer,data/tokenizer.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Classifier : tagger
{
Transitions : {tagger,data/tagger.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Classifier : morpho
{
Transitions : {morpho,data/morpho_whole.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Classifier : lemmatizer
{
Transitions : {lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Classifier : parser
{
Transitions : {parser,data/parser_eager_rel_strict.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Classifier : segmenter
{
Transitions : {segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
%%s
%s
%s
End
%s
Type : classification
%s
}
Splitwords : data/splitwords.ts
Predictions : ID FORM UPOS FEATS LEMMA HEAD DEPREL EOS
Strategy
{
Block : End{cannotMove}
tokenizer tagger ENDWORD 0
tokenizer tagger SPLIT 0
tokenizer tokenizer * 0
tagger morpho * 0
morpho lemmatizer_rules * 0
lemmatizer_rules lemmatizer_case * 0
lemmatizer_case parser * 0
parser segmenter eager_SHIFT 0
parser segmenter eager_RIGHT_rel 0
parser parser * 0
segmenter tokenizer * 1
}
"""%(T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss)
w2v = "FORM,../../cc.fr.300.vec"
T.words = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{FORM,data/FORM.w2v}"%(T.contextualTargets,"FORM",T.lstmOpt,T.lstmIn,T.lstmOut)
T.wordsSeq = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{FORM,data/FORM.w2v}"%(T.contextualTargetsSeq,"FORM",T.lstmOpt,T.lstmIn,T.lstmOut)
T.wordsFlat = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{FORM,data/FORM.w2v}"%(T.contextualTargetsFlat,"FORM",T.lstmOpt,T.lstmIn,T.lstmOut)
T.wordsPretrained = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{%s}"%(T.contextualTargets,"FORM",T.lstmOpt,300,T.lstmOut, w2v)
T.wordsSeqPretrained = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{%s}"%(T.contextualTargetsSeq,"FORM",T.lstmOpt,300,T.lstmOut, w2v)
T.wordsFlatPretrained = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{%s}"%(T.contextualTargetsFlat,"FORM",T.lstmOpt,300,T.lstmOut, w2v)
T.context = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargets,"ID EOS UPOS FEATS DEPREL",T.lstmOpt,T.lstmIn,T.lstmOut)
T.contextSeq = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsSeq,"ID EOS UPOS FEATS DEPREL",T.lstmOpt,T.lstmIn,T.lstmOut)
T.contextID = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsFlat,"ID",T.lstmOpt,T.lstmIn,T.lstmOut)
T.contextTagger = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsFlat,"ID UPOS",T.lstmOpt,T.lstmIn,T.lstmOut)
T.contextMorpho = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsFlat,"ID UPOS FEATS",T.lstmOpt,T.lstmIn,T.lstmOut)
T.contextParser = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargets,"ID UPOS FEATS DEPREL",T.lstmOpt,T.lstmIn,T.lstmOut)
T.prefix = "Focused : Column{prefix%s:FORM} NbElem{%s} Buffer{%s} Stack{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.prefixSize,T.prefixSize,T.prefixBuffer,T.prefixStack,T.lstmOpt,T.lstmIn,T.lstmOut)
T.suffix = "Focused : Column{suffix%s:FORM} NbElem{%s} Buffer{%s} Stack{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.suffixSize,T.suffixSize,T.suffixBuffer,T.suffixStack,T.lstmOpt,T.lstmIn,T.lstmOut)
T.rawInput5_10 = "RawInput : Left{5} Right{10} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
T.rawInput5_5 = "RawInput : Left{5} Right{5} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
T.rawInput5_15 = "RawInput : Left{5} Right{15} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
T.rawInput5_2 = "RawInput : Left{5} Right{2} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
T.history = "History : NbElem{%s} LSTM{%s} In{%s} Out{%s}"%(T.historyNb,T.lstmOpt,T.lstmIn,T.lstmOut)
T.historyMine = "HistoryMine : NbElem{%s} LSTM{%s} In{%s} Out{%s}"%(T.historyMineNb,T.lstmOpt,T.lstmIn,T.lstmOut)
T.splitTrans = "SplitTrans : LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
T.stateName = "StateName : Out{%s}"%(T.stateNameSize)
T.dist = "Distance : FromBuffer{} FromStack{0 1 2} ToBuffer{0} ToStack{} Threshold{%s} LSTM{%s} In{%s} Out{%s}"%(T.distThreshold,T.lstmOpt,T.lstmIn,T.lstmOut)
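# Instantiate the templates: each entry of T.machines is an
# [experiment name, machine description] pair.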
T.machines = []
T.reset()
T.refresh()
T.machines.append(["tokeparser_incr_1class_window5_10", T.templateIncr1Class%"\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
T.machines.append(["tokeparser_seq_1class_window5_10", T.templateSeq1Class%"\n ".join([T.wordsSeq,T.contextSeq,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
T.machines.append(["tokeparser_incr_Nclass_sup_window5_10", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))])
T.machines.append(["tokeparser_incr_Nclass_sup_window5_5", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]))])
T.machines.append(["tokeparser_incr_Nclass_sup_window5_15", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]))])
T.machines.append(["tokeparser_incr_Nclass_sup_window5_2", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]))])
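# "nosup" variants: each classifier only reads the columns available at its own
# level (ID, then UPOS, then FEATS, ...) rather than the full shared column set.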
T.machines.append(["tokeparser_incr_Nclass_nosup_window5_10", T.templateIncrNClass%(
"\n ".join([T.wordsFlat,T.contextID,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tokenizer
"\n ".join([T.wordsFlat,T.contextTagger,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tagger
"\n ".join([T.wordsFlat,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Morpho
"\n ".join([T.wordsFlat,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Lemmatizer
"\n ".join([T.words,T.contextParser,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans,T.dist]), # Parser
"\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))]) # Segmenter
T.machines.append(["tokeparserPre_incr_1class_window5_10", T.templateIncr1Class%"\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
T.machines.append(["tokeparserPre_seq_1class_window5_10", T.templateSeq1Class%"\n ".join([T.wordsSeqPretrained,T.contextSeq,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_10", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))])
T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_5", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]))])
T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_15", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]))])
T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_2", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]))])
T.machines.append(["tokeparserPre_incr_Nclass_nosup_window5_10", T.templateIncrNClass%(
"\n ".join([T.wordsFlatPretrained,T.contextID,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tokenizer
"\n ".join([T.wordsFlatPretrained,T.contextTagger,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tagger
"\n ".join([T.wordsFlatPretrained,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Morpho
"\n ".join([T.wordsFlatPretrained,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Lemmatizer
"\n ".join([T.wordsPretrained,T.contextParser,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans,T.dist]), # Parser
"\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))]) # Segmenter
templatesExperiments = [
{
'mode' : 'txt',
'expName' : '%s'%machine[0],
'template' : 'templates/tokeparser_incr',
'arguments' : '-n 40 --lockPretrained --devScore --trainStrategy 0,ExtractGold,ResetParameters:4,ExtractDynamic,Save --machine \"%s\"'%machine[1],
'pretrained' : 'FORM',
'evalArguments' : ''
} for machine in T.machines
]
langs = [
"UD_French-GSD_0",
"UD_French-GSD_1",
"UD_French-GSD_2",
"UD_French-GSD_3",
"UD_French-GSD_4",
"UD_French-GSD_5",
"UD_French-GSD_6",
"UD_French-GSD_7",
"UD_French-GSD_8",
"UD_French-GSD_9",
]
repRange = [0]
# To untrack this file, do: git update-index --skip-worktree batches.py
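# Second experiment set: tagparser machines (incremental and sequential), with and
# without pretrained embeddings, evaluated on UD_English-EWT.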
tagExpPre = [
{
'mode' : 'tsv',
'expName' : '%s'%exp,
'template' : 'templates/'+exp,
'arguments' : '-n 20 --trainStrategy 0,ExtractGold,ResetParameters:4,ExtractDynamic,Save --devScore --lockPretrained',
'pretrained' : '',
'evalArguments' : ''
} for exp in ["tagparser_incr", "tagparser_seq"]
]
tagExpNoPre = [
{
'mode' : 'tsv',
'expName' : '%s'%exp,
'template' : 'templates/'+exp,
'arguments' : '-n 20 --trainStrategy 0,ExtractGold,ResetParameters:4,ExtractDynamic,Save --devScore',
'pretrained' : 'FORM',
'evalArguments' : ''
} for exp in ["tagparser_incr_nopretrained", "tagparser_seq_nopretrained"]
]
templatesExperiments = tagExpPre + tagExpNoPre
langs = [
"UD_English-EWT",
]
repRange = [0]
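## Mixed-effects analysis (R): relate the machines' entropy (ENT_*) and surprisal
## (SUR_*) measures to human reading times (FIRST_TIME, TOTAL_TIME).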
## Load the required libraries
library(data.table)
library(dplyr)        # data manipulation (ddply itself is in plyr)
library(lmerTest)     # p-values for mixed models
library(lme4)
library(sjmisc)       # str_contains
library(piecewiseSEM) # R squared
library(ggplot2)
setwd("/home/franck/oculometry/provoNoTok")
################################################################################################
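# Standardise the predictors: log-transformed frequency, word length, and every
# entropy (ENT_*) / surprisal (SUR_*) column are centred and scaled.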
rescaleVariables = function(dataset)
{
dataset$FREQ = scale(log(dataset$FREQ))
dataset$WD_LEN = scale(dataset$WD_LEN, center = TRUE, scale = TRUE)
for (colName in colnames(dataset))
{
if (colName != "SENT_ID" & (str_contains(colName, "ENT_") | str_contains(colName, "SUR_")))
{
dataset[colName] = scale(dataset[colName], center = TRUE, scale = TRUE)
}
}
return(dataset)
}
################################################################################################
################################################################################################
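# Pearson correlations between each task's mean entropy and the two baseline
# predictors (frequency and word length), restricted to fixated words (FIRST_TIME != 0).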
computeCorrelation = function(dataFile)
{
data = na.omit(read.table(dataFile, header=T, sep="\t", quote="", nrows=-1))
data = subset(data, FIRST_TIME != 0)
data = rescaleVariables(data)
print(paste("CORR ", dataFile, "FREQ"))
print(cor(data$ENT_CUR_MEAN_TAGGER, data$FREQ, method="pearson"))
print(cor(data$ENT_CUR_MEAN_MORPHO, data$FREQ, method="pearson"))
print(cor(data$ENT_CUR_MEAN_LEMMATIZER_RULES, data$FREQ, method="pearson"))
print(cor(data$ENT_CUR_MEAN_PARSER, data$FREQ, method="pearson"))
print(cor(data$ENT_CUR_MEAN_SEGMENTER, data$FREQ, method="pearson"))
print("")
print(paste("CORR ", dataFile, "WD_LEN"))
print(cor(data$ENT_CUR_MEAN_TAGGER, data$WD_LEN, method="pearson"))
print(cor(data$ENT_CUR_MEAN_MORPHO, data$WD_LEN, method="pearson"))
print(cor(data$ENT_CUR_MEAN_LEMMATIZER_RULES, data$WD_LEN, method="pearson"))
print(cor(data$ENT_CUR_MEAN_PARSER, data$WD_LEN, method="pearson"))
print(cor(data$ENT_CUR_MEAN_SEGMENTER, data$WD_LEN, method="pearson"))
print("")
}
################################################################################################
################################################################################################
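# For each measure, fit a mixed model adding that measure to the baseline
# predictors (FREQ + WD_LEN, random intercepts for TEXT_ID/SUBJECT) and append the
# summaries to <dataFile>.<targetCol>.analysis. A baseline-only model is also
# fitted but not printed.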
globalSummary = function(targetCol, dataFile)
{
data = na.omit(read.table(dataFile, header=T, sep="\t", quote="", nrows=-1))
data = subset(data, FIRST_TIME != 0)
data = rescaleVariables(data)
sink(paste(dataFile, targetCol, "analysis", sep="."), append=T)
vanillaModel = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + (1 | TEXT_ID/SUBJECT)",sep="")), REML=FALSE, data=data)
print("#SUMMARY")
for (ent in c("ENT", "SUR"))
for (meth in c("CUR"))
for (mod in c("ADD"))
for (level in c("TAGGER", "MORPHO", "LEMMATIZER_RULES", "SEGMENTER"))
{
measure = paste(ent,meth,mod,level,sep="_")
formulaStr = paste(targetCol," ~ FREQ + WD_LEN + ",measure," + (1 | TEXT_ID/SUBJECT)",sep="")
model = lmer(formula(formulaStr), REML=FALSE, data=data)
print(summary(model))
}
for (ent in c("ENT", "SUR"))
for (meth in c("CUR", "ATT", "TGT"))
for (mod in c("ADD", "MEAN", "MAX"))
for (level in c("PARSER"))
{
measure = paste(ent,meth,mod,level,sep="_")
formulaStr = paste(targetCol," ~ FREQ + WD_LEN + ",measure," + (1 | TEXT_ID/SUBJECT)",sep="")
model = lmer(formula(formulaStr), REML=FALSE, data=data)
print(summary(model))
}
sink()
print(paste("SUMMARY", targetCol, dataFile))
}
################################################################################################
################################################################################################
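# Likelihood-ratio comparison (anova) of the baseline model against models that add
# the five task-level measures, for every ENT/SUR x CUR/ATT/TGT x ADD/MEAN/MAX
# combination.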
globalAnova = function(targetCol, dataFile)
{
data = na.omit(read.table(dataFile, header=T, sep="\t", quote="", nrows=-1))
data = subset(data, FIRST_TIME != 0)
data = rescaleVariables(data)
sink(paste(dataFile, targetCol, "analysis", sep="."))
vanillaModel = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + (1 | TEXT_ID/SUBJECT)",sep="")), REML=FALSE, data=data)
EntCurAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_CUR_ADD_TAGGER + ENT_CUR_ADD_MORPHO + ENT_CUR_ADD_LEMMATIZER_RULES + ENT_CUR_ADD_PARSER + ENT_CUR_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntCurMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_CUR_MEAN_TAGGER + ENT_CUR_MEAN_MORPHO + ENT_CUR_MEAN_LEMMATIZER_RULES + ENT_CUR_MEAN_PARSER + ENT_CUR_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntCurMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_CUR_MAX_TAGGER + ENT_CUR_MAX_MORPHO + ENT_CUR_MAX_LEMMATIZER_RULES + ENT_CUR_MAX_PARSER + ENT_CUR_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntAttAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_ATT_ADD_TAGGER + ENT_ATT_ADD_MORPHO + ENT_ATT_ADD_LEMMATIZER_RULES + ENT_ATT_ADD_PARSER + ENT_ATT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntAttMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_ATT_MEAN_TAGGER + ENT_ATT_MEAN_MORPHO + ENT_ATT_MEAN_LEMMATIZER_RULES + ENT_ATT_MEAN_PARSER + ENT_ATT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntAttMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_ATT_MAX_TAGGER + ENT_ATT_MAX_MORPHO + ENT_ATT_MAX_LEMMATIZER_RULES + ENT_ATT_MAX_PARSER + ENT_ATT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntTgtAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_TGT_ADD_TAGGER + ENT_TGT_ADD_MORPHO + ENT_TGT_ADD_LEMMATIZER_RULES + ENT_TGT_ADD_PARSER + ENT_TGT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntTgtMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_TGT_MEAN_TAGGER + ENT_TGT_MEAN_MORPHO + ENT_TGT_MEAN_LEMMATIZER_RULES + ENT_TGT_MEAN_PARSER + ENT_TGT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
EntTgtMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_TGT_MAX_TAGGER + ENT_TGT_MAX_MORPHO + ENT_TGT_MAX_LEMMATIZER_RULES + ENT_TGT_MAX_PARSER + ENT_TGT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurCurAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_CUR_ADD_TAGGER + SUR_CUR_ADD_MORPHO + SUR_CUR_ADD_LEMMATIZER_RULES + SUR_CUR_ADD_PARSER + SUR_CUR_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurCurMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_CUR_MEAN_TAGGER + SUR_CUR_MEAN_MORPHO + SUR_CUR_MEAN_LEMMATIZER_RULES + SUR_CUR_MEAN_PARSER + SUR_CUR_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurCurMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_CUR_MAX_TAGGER + SUR_CUR_MAX_MORPHO + SUR_CUR_MAX_LEMMATIZER_RULES + SUR_CUR_MAX_PARSER + SUR_CUR_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurAttAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_ATT_ADD_TAGGER + SUR_ATT_ADD_MORPHO + SUR_ATT_ADD_LEMMATIZER_RULES + SUR_ATT_ADD_PARSER + SUR_ATT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurAttMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_ATT_MEAN_TAGGER + SUR_ATT_MEAN_MORPHO + SUR_ATT_MEAN_LEMMATIZER_RULES + SUR_ATT_MEAN_PARSER + SUR_ATT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurAttMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_ATT_MAX_TAGGER + SUR_ATT_MAX_MORPHO + SUR_ATT_MAX_LEMMATIZER_RULES + SUR_ATT_MAX_PARSER + SUR_ATT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurTgtAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_TGT_ADD_TAGGER + SUR_TGT_ADD_MORPHO + SUR_TGT_ADD_LEMMATIZER_RULES + SUR_TGT_ADD_PARSER + SUR_TGT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurTgtMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_TGT_MEAN_TAGGER + SUR_TGT_MEAN_MORPHO + SUR_TGT_MEAN_LEMMATIZER_RULES + SUR_TGT_MEAN_PARSER + SUR_TGT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
SurTgtMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_TGT_MAX_TAGGER + SUR_TGT_MAX_MORPHO + SUR_TGT_MAX_LEMMATIZER_RULES + SUR_TGT_MAX_PARSER + SUR_TGT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
print("#ANOVA")
print(anova(vanillaModel, EntCurAdd, EntCurMean, EntCurMax, EntAttAdd, EntAttMean, EntAttMax, EntTgtAdd, EntTgtMean, EntTgtMax, SurCurAdd, SurCurMean, SurCurMax, SurAttAdd, SurAttMean, SurAttMax, SurTgtAdd, SurTgtMean, SurTgtMax))
sink()
print(paste("ANOVA", targetCol, dataFile))
}
################################################################################################
################################################################################################
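# Run the analyses on first fixation and total reading times for the four machine
# variants (incremental / sequential, with / without pretrained embeddings).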
computeCorrelation("outputs/UD_English-EWT/incr_pretrained_times.tsv")
computeCorrelation("outputs/UD_English-EWT/incr_nopretrained_times.tsv")
globalAnova("FIRST_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
globalAnova("FIRST_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
globalAnova("FIRST_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
globalAnova("FIRST_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")
globalAnova("TOTAL_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
globalAnova("TOTAL_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
globalAnova("TOTAL_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
globalAnova("TOTAL_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")
globalSummary("FIRST_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
globalSummary("FIRST_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
globalSummary("FIRST_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
globalSummary("FIRST_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")
globalSummary("TOTAL_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
globalSummary("TOTAL_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
globalSummary("TOTAL_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
globalSummary("TOTAL_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")