From 6607203eb2188e6fee98dd10b2874b012caaa9df Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 6 Jul 2022 16:41:24 +0200
Subject: [PATCH] Added scripts to launch experiments from my thesis

---
 UD_any/thesis/chapter2/experimentBatch.py | 298 ++++++++++++++++++++++
 UD_any/thesis/chapter3/machinesBatch.py   |  32 +++
 UD_any/thesis/chapter3/useEntropy.r       | 167 ++++++++++++
 3 files changed, 497 insertions(+)
 create mode 100644 UD_any/thesis/chapter2/experimentBatch.py
 create mode 100644 UD_any/thesis/chapter3/machinesBatch.py
 create mode 100644 UD_any/thesis/chapter3/useEntropy.r

diff --git a/UD_any/thesis/chapter2/experimentBatch.py b/UD_any/thesis/chapter2/experimentBatch.py
new file mode 100644
index 0000000..a5ba176
--- /dev/null
+++ b/UD_any/thesis/chapter2/experimentBatch.py
@@ -0,0 +1,298 @@
+# To untrack this file, run: git update-index --skip-worktree batches.py
+
+class T :
+    def reset() :
+        T.dropout = "InputDropout : 0.5"
+        T.mlp = "MLP : {3200 0.4 1600 0.4}"
+        T.loss = "Loss : crossentropy"
+        T.optim = "Optimizer : Adagrad {0.01 0.000001 0 0.0000000001}"
+
+        T.focusedBuffer = "0"
+        T.focusedStack = ""
+        T.contextualWindow = "-10 0"
+        T.contextualWindowSeq = "-10 10"
+        T.contextualTargets = "b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
+        T.contextualTargetsSeq = "b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
+        T.contextualTargetsFlat = "b.-3 b.-2 b.-1 b.0"
+        T.contextTargets = "b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
+        T.contextTargetsSeq = "b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1"
+        T.contextTargetsFlat = "b.-3 b.-2 b.-1 b.0"
+        T.rawInputLeft = "5"
+        T.rawInputRight = "10"
+        T.historyNb = "10"
+        T.historyMineNb = "2"
+
+        T.lstmOpt = "1 1 0.0 1"
+        T.lstmIn = "128"
+        T.lstmOut = "64"
+        T.stateNameSize = "64"
+
+        T.prefixSize = "5"
+        T.suffixSize = "5"
+
+        T.prefixBuffer = "0"
+        T.suffixBuffer = "0"
+        T.prefixStack = ""
+        T.suffixStack = ""
+
+        T.distThreshold = "15"
+
+    def refresh() :
+        T.templateIncr1Class = """Name : Tokenizer, Tagger, Morpho, Lemmatizer, Parser and Segmenter Machine
+Classifier : tokeparser
+{
+  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Splitwords : data/splitwords.ts
+Predictions : ID FORM UPOS FEATS LEMMA HEAD DEPREL EOS
+Strategy
+{
+  Block : End{cannotMove}
+  tokenizer tagger ENDWORD 0
+  tokenizer tagger SPLIT 0
+  tokenizer tokenizer * 0
+  tagger morpho * 0
+  morpho lemmatizer_rules * 0
+  lemmatizer_rules lemmatizer_case * 0
+  lemmatizer_case parser * 0
+  parser segmenter eager_SHIFT 0
+  parser segmenter eager_RIGHT_rel 0
+  parser parser * 0
+  segmenter tokenizer * 1
+}
+"""%(T.dropout, T.mlp, T.optim, T.loss)
+
+        T.templateSeq1Class = """Name : Tokenizer, Tagger, Morpho, Lemmatizer, Parser and Segmenter Machine
+Classifier : tokeparser
+{
+  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Splitwords : data/splitwords.ts
+Predictions : ID FORM UPOS FEATS LEMMA HEAD DEPREL EOS
+Strategy
+{
+  Block : End{cannotMove}
+  tokenizer tokenizer ENDWORD 1
+  tokenizer tokenizer SPLIT 1
+  tokenizer tokenizer * 0
+  Block : End{cannotMove}
+  tagger tagger * 1
+  Block : End{cannotMove}
+  morpho morpho * 1
+  Block : End{cannotMove}
+  lemmatizer_rules lemmatizer_case * 0
+  lemmatizer_case lemmatizer_rules * 1
+  Block : End{cannotMove}
+  parser segmenter eager_SHIFT 0
+  parser segmenter eager_RIGHT_rel 0
+  parser parser * 0
+  segmenter parser * 1
+}
+"""%(T.dropout, T.mlp, T.optim, T.loss)
+
+        T.templateIncrNClass = """Name : Tokenizer, Tagger, Morpho, Lemmatizer, Parser and Segmenter Machine
+Classifier : tokenizer
+{
+  Transitions : {tokenizer,data/tokenizer.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Classifier : tagger
+{
+  Transitions : {tagger,data/tagger.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Classifier : morpho
+{
+  Transitions : {morpho,data/morpho_whole.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Classifier : lemmatizer
+{
+  Transitions : {lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Classifier : parser
+{
+  Transitions : {parser,data/parser_eager_rel_strict.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Classifier : segmenter
+{
+  Transitions : {segmenter,data/segmenter.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  %%s
+  %s
+  %s
+  End
+  %s
+  Type : classification
+  %s
+}
+Splitwords : data/splitwords.ts
+Predictions : ID FORM UPOS FEATS LEMMA HEAD DEPREL EOS
+Strategy
+{
+  Block : End{cannotMove}
+  tokenizer tagger ENDWORD 0
+  tokenizer tagger SPLIT 0
+  tokenizer tokenizer * 0
+  tagger morpho * 0
+  morpho lemmatizer_rules * 0
+  lemmatizer_rules lemmatizer_case * 0
+  lemmatizer_case parser * 0
+  parser segmenter eager_SHIFT 0
+  parser segmenter eager_RIGHT_rel 0
+  parser parser * 0
+  segmenter tokenizer * 1
+}
+"""%(T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss, T.dropout, T.mlp, T.optim, T.loss)
+
+        w2v = "FORM,../../cc.fr.300.vec"
+        T.words = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{FORM,data/FORM.w2v}"%(T.contextualTargets,"FORM",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.wordsSeq = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{FORM,data/FORM.w2v}"%(T.contextualTargetsSeq,"FORM",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.wordsFlat = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{FORM,data/FORM.w2v}"%(T.contextualTargetsFlat,"FORM",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.wordsPretrained = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{%s}"%(T.contextualTargets,"FORM",T.lstmOpt,300,T.lstmOut, w2v)
+        T.wordsSeqPretrained = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{%s}"%(T.contextualTargetsSeq,"FORM",T.lstmOpt,300,T.lstmOut, w2v)
+        T.wordsFlatPretrained = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{%s}"%(T.contextualTargetsFlat,"FORM",T.lstmOpt,300,T.lstmOut, w2v)
+        T.context = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargets,"ID EOS UPOS FEATS DEPREL",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.contextSeq = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsSeq,"ID EOS UPOS FEATS DEPREL",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.contextID = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsFlat,"ID",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.contextTagger = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsFlat,"ID UPOS",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.contextMorpho = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargetsFlat,"ID UPOS FEATS",T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.contextParser = "Context : Targets{%s} Columns{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.contextTargets,"ID UPOS FEATS DEPREL",T.lstmOpt,T.lstmIn,T.lstmOut)
+
+        T.prefix = "Focused : Column{prefix%s:FORM} NbElem{%s} Buffer{%s} Stack{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.prefixSize,T.prefixSize,T.prefixBuffer,T.prefixStack,T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.suffix = "Focused : Column{suffix%s:FORM} NbElem{%s} Buffer{%s} Stack{%s} LSTM{%s} In{%s} Out{%s} w2v{}"%(T.suffixSize,T.suffixSize,T.suffixBuffer,T.suffixStack,T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.rawInput5_10 = "RawInput : Left{5} Right{10} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.rawInput5_5 = "RawInput : Left{5} Right{5} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.rawInput5_15 = "RawInput : Left{5} Right{15} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.rawInput5_2 = "RawInput : Left{5} Right{2} LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.history = "History : NbElem{%s} LSTM{%s} In{%s} Out{%s}"%(T.historyNb,T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.historyMine = "HistoryMine : NbElem{%s} LSTM{%s} In{%s} Out{%s}"%(T.historyMineNb,T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.splitTrans = "SplitTrans : LSTM{%s} In{%s} Out{%s}"%(T.lstmOpt,T.lstmIn,T.lstmOut)
+        T.stateName = "StateName : Out{%s}"%(T.stateNameSize)
+        T.dist = "Distance : FromBuffer{} FromStack{0 1 2} ToBuffer{0} ToStack{} Threshold{%s} LSTM{%s} In{%s} Out{%s}"%(T.distThreshold,T.lstmOpt,T.lstmIn,T.lstmOut)
+
+T.machines = []
+T.reset()
+T.refresh()
+
+T.machines.append(["tokeparser_incr_1class_window5_10", T.templateIncr1Class%"\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
+T.machines.append(["tokeparser_seq_1class_window5_10", T.templateSeq1Class%"\n ".join([T.wordsSeq,T.contextSeq,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
+
+T.machines.append(["tokeparser_incr_Nclass_sup_window5_10", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))])
+T.machines.append(["tokeparser_incr_Nclass_sup_window5_5", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]))])
+T.machines.append(["tokeparser_incr_Nclass_sup_window5_15", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]))])
+T.machines.append(["tokeparser_incr_Nclass_sup_window5_2", T.templateIncrNClass%("\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]))])
+
+T.machines.append(["tokeparser_incr_Nclass_nosup_window5_10", T.templateIncrNClass%(
+"\n ".join([T.wordsFlat,T.contextID,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tokenizer
+"\n ".join([T.wordsFlat,T.contextTagger,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tagger
+"\n ".join([T.wordsFlat,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Morpho
+"\n ".join([T.wordsFlat,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Lemmatizer
+"\n ".join([T.words,T.contextParser,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans,T.dist]), # Parser
+"\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))]) # Segmenter
+
+
+T.machines.append(["tokeparserPre_incr_1class_window5_10", T.templateIncr1Class%"\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
+T.machines.append(["tokeparserPre_seq_1class_window5_10", T.templateSeq1Class%"\n ".join([T.wordsSeqPretrained,T.contextSeq,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.stateName,T.dist])])
+
+T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_10", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))])
+T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_5", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_5,T.history,T.splitTrans,T.dist]))])
+T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_15", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_15,T.history,T.splitTrans,T.dist]))])
+T.machines.append(["tokeparserPre_incr_Nclass_sup_window5_2", T.templateIncrNClass%("\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]), "\n ".join([T.words,T.context,T.prefix,T.suffix,T.rawInput5_2,T.history,T.splitTrans,T.dist]))])
+
+T.machines.append(["tokeparserPre_incr_Nclass_nosup_window5_10", T.templateIncrNClass%(
+"\n ".join([T.wordsFlatPretrained,T.contextID,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tokenizer
+"\n ".join([T.wordsFlatPretrained,T.contextTagger,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Tagger
+"\n ".join([T.wordsFlatPretrained,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Morpho
+"\n ".join([T.wordsFlatPretrained,T.contextMorpho,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans]), # Lemmatizer
+"\n ".join([T.wordsPretrained,T.contextParser,T.prefix,T.suffix,T.rawInput5_10,T.historyMine,T.splitTrans,T.dist]), # Parser
+"\n ".join([T.wordsPretrained,T.context,T.prefix,T.suffix,T.rawInput5_10,T.history,T.splitTrans,T.dist]))]) # Segmenter
+
+
+templatesExperiments = [
+    {
+        'mode' : 'txt',
+        'expName' : '%s'%machine[0],
+        'template' : 'templates/tokeparser_incr',
+        'arguments' : '-n 40 --lockPretrained --devScore --trainStrategy 0,ExtractGold,ResetParameters:4,ExtractDynamic,Save --machine \"%s\"'%machine[1],
+        'pretrained' : 'FORM',
+        'evalArguments' : ''
+    } for machine in T.machines
+]
+
+langs = [
+    "UD_French-GSD_0",
+    "UD_French-GSD_1",
+    "UD_French-GSD_2",
+    "UD_French-GSD_3",
+    "UD_French-GSD_4",
+    "UD_French-GSD_5",
+    "UD_French-GSD_6",
+    "UD_French-GSD_7",
+    "UD_French-GSD_8",
+    "UD_French-GSD_9",
+]
+
+repRange = [0]
+
diff --git a/UD_any/thesis/chapter3/machinesBatch.py b/UD_any/thesis/chapter3/machinesBatch.py
new file mode 100644
index 0000000..7928d95
--- /dev/null
+++ b/UD_any/thesis/chapter3/machinesBatch.py
@@ -0,0 +1,32 @@
+# To untrack this file, run: git update-index --skip-worktree batches.py
+
+tagExpPre = [
+    {
+        'mode' : 'tsv',
+        'expName' : '%s'%exp,
+        'template' : 'templates/'+exp,
+        'arguments' : '-n 20 --trainStrategy 0,ExtractGold,ResetParameters:4,ExtractDynamic,Save --devScore --lockPretrained',
+        'pretrained' : '',
+        'evalArguments' : ''
+    } for (exp) in [("tagparser_incr"), ("tagparser_seq")]
+]
+
+tagExpNoPre = [
+    {
+        'mode' : 'tsv',
+        'expName' : '%s'%exp,
+        'template' : 'templates/'+exp,
+        'arguments' : '-n 20 --trainStrategy 0,ExtractGold,ResetParameters:4,ExtractDynamic,Save --devScore',
+        'pretrained' : 'FORM',
+        'evalArguments' : ''
+    } for (exp) in [("tagparser_incr_nopretrained"), ("tagparser_seq_nopretrained")]
+]
+
+templatesExperiments = tagExpPre + tagExpNoPre
+
+langs = [
+    "UD_English-EWT",
+]
+
+repRange = [0]
+
diff --git a/UD_any/thesis/chapter3/useEntropy.r b/UD_any/thesis/chapter3/useEntropy.r
new file mode 100644
index 0000000..ec39b02
--- /dev/null
+++ b/UD_any/thesis/chapter3/useEntropy.r
@@ -0,0 +1,167 @@
+## Opening useful libraries
+
+library(data.table)
+library(dplyr)        # data manipulation
+library(lmerTest)     # pvalue
+library(lme4)
+library(sjmisc)       # str_contains
+library(piecewiseSEM) # R squared
+library(ggplot2)
+
+setwd("/home/franck/oculometry/provoNoTok")
+
+################################################################################################
+rescaleVariables = function(dataset)
+{
+  dataset$FREQ = scale(log(dataset$FREQ))
+  dataset$WD_LEN = scale(dataset$WD_LEN, center = TRUE, scale = TRUE)
+
+  for (colName in colnames(dataset))
+  {
+    if (colName != "SENT_ID" & (str_contains(colName, "ENT_") | str_contains(colName, "SUR_")))
+    {
+      dataset[colName] = scale(dataset[colName], center = TRUE, scale = TRUE)
+    }
+  }
+
+  return(dataset)
+}
+################################################################################################
+
+################################################################################################
+computeCorrelation = function(dataFile)
+{
+  data = na.omit(read.table(dataFile, header=T, sep="\t", quote="", nrows=-1))
+  data = subset(data, FIRST_TIME != 0)
+  data = rescaleVariables(data)
+
+  print(paste("CORR ", dataFile, "FREQ"))
+  print(cor(data$ENT_CUR_MEAN_TAGGER, data$FREQ, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_MORPHO, data$FREQ, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_LEMMATIZER_RULES, data$FREQ, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_PARSER, data$FREQ, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_SEGMENTER, data$FREQ, method="pearson"))
+  print("")
+
+  print(paste("CORR ", dataFile, "WD_LEN"))
+  print(cor(data$ENT_CUR_MEAN_TAGGER, data$WD_LEN, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_MORPHO, data$WD_LEN, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_LEMMATIZER_RULES, data$WD_LEN, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_PARSER, data$WD_LEN, method="pearson"))
+  print(cor(data$ENT_CUR_MEAN_SEGMENTER, data$WD_LEN, method="pearson"))
+  print("")
+}
+################################################################################################
+
+
+################################################################################################
+globalSummary = function(targetCol, dataFile)
+{
+  data = na.omit(read.table(dataFile, header=T, sep="\t", quote="", nrows=-1))
+  data = subset(data, FIRST_TIME != 0)
+  data = rescaleVariables(data)
+
+  sink(paste(dataFile, targetCol, "analysis", sep="."), append=T)
+
+  vanillaModel = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + (1 | TEXT_ID/SUBJECT)",sep="")), REML=FALSE, data=data)
+
+  print("#SUMMARY")
+
+  for (ent in c("ENT", "SUR"))
+  for (meth in c("CUR"))
+  for (mod in c("ADD"))
+  for (level in c("TAGGER", "MORPHO", "LEMMATIZER_RULES", "SEGMENTER"))
+  {
+    measure = paste(ent,meth,mod,level,sep="_")
+    formulaStr = paste(targetCol," ~ FREQ + WD_LEN + ",measure," + (1 | TEXT_ID/SUBJECT)",sep="")
+    model = lmer(formula(formulaStr), REML=FALSE, data=data)
+    print(summary(model))
+  }
+
+  for (ent in c("ENT", "SUR"))
+  for (meth in c("CUR", "ATT", "TGT"))
+  for (mod in c("ADD", "MEAN", "MAX"))
+  for (level in c("PARSER"))
+  {
+    measure = paste(ent,meth,mod,level,sep="_")
+    formulaStr = paste(targetCol," ~ FREQ + WD_LEN + ",measure," + (1 | TEXT_ID/SUBJECT)",sep="")
+    model = lmer(formula(formulaStr), REML=FALSE, data=data)
+    print(summary(model))
+  }
+
+  sink()
+  print(paste("SUMMARY", targetCol, dataFile))
+}
+################################################################################################
+
+################################################################################################
+globalAnova = function(targetCol, dataFile)
+{
+  data = na.omit(read.table(dataFile, header=T, sep="\t", quote="", nrows=-1))
+  data = subset(data, FIRST_TIME != 0)
+  data = rescaleVariables(data)
+
+  sink(paste(dataFile, targetCol, "analysis", sep="."))
+
+  vanillaModel = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + (1 | TEXT_ID/SUBJECT)",sep="")), REML=FALSE, data=data)
+
+  EntCurAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_CUR_ADD_TAGGER + ENT_CUR_ADD_MORPHO + ENT_CUR_ADD_LEMMATIZER_RULES + ENT_CUR_ADD_PARSER + ENT_CUR_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  EntCurMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_CUR_MEAN_TAGGER + ENT_CUR_MEAN_MORPHO + ENT_CUR_MEAN_LEMMATIZER_RULES + ENT_CUR_MEAN_PARSER + ENT_CUR_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  EntCurMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_CUR_MAX_TAGGER + ENT_CUR_MAX_MORPHO + ENT_CUR_MAX_LEMMATIZER_RULES + ENT_CUR_MAX_PARSER + ENT_CUR_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+
+  EntAttAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_ATT_ADD_TAGGER + ENT_ATT_ADD_MORPHO + ENT_ATT_ADD_LEMMATIZER_RULES + ENT_ATT_ADD_PARSER + ENT_ATT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  EntAttMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_ATT_MEAN_TAGGER + ENT_ATT_MEAN_MORPHO + ENT_ATT_MEAN_LEMMATIZER_RULES + ENT_ATT_MEAN_PARSER + ENT_ATT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  EntAttMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_ATT_MAX_TAGGER + ENT_ATT_MAX_MORPHO + ENT_ATT_MAX_LEMMATIZER_RULES + ENT_ATT_MAX_PARSER + ENT_ATT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+
+  EntTgtAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_TGT_ADD_TAGGER + ENT_TGT_ADD_MORPHO + ENT_TGT_ADD_LEMMATIZER_RULES + ENT_TGT_ADD_PARSER + ENT_TGT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  EntTgtMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_TGT_MEAN_TAGGER + ENT_TGT_MEAN_MORPHO + ENT_TGT_MEAN_LEMMATIZER_RULES + ENT_TGT_MEAN_PARSER + ENT_TGT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  EntTgtMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + ENT_TGT_MAX_TAGGER + ENT_TGT_MAX_MORPHO + ENT_TGT_MAX_LEMMATIZER_RULES + ENT_TGT_MAX_PARSER + ENT_TGT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+
+  SurCurAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_CUR_ADD_TAGGER + SUR_CUR_ADD_MORPHO + SUR_CUR_ADD_LEMMATIZER_RULES + SUR_CUR_ADD_PARSER + SUR_CUR_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  SurCurMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_CUR_MEAN_TAGGER + SUR_CUR_MEAN_MORPHO + SUR_CUR_MEAN_LEMMATIZER_RULES + SUR_CUR_MEAN_PARSER + SUR_CUR_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  SurCurMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_CUR_MAX_TAGGER + SUR_CUR_MAX_MORPHO + SUR_CUR_MAX_LEMMATIZER_RULES + SUR_CUR_MAX_PARSER + SUR_CUR_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+
+  SurAttAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_ATT_ADD_TAGGER + SUR_ATT_ADD_MORPHO + SUR_ATT_ADD_LEMMATIZER_RULES + SUR_ATT_ADD_PARSER + SUR_ATT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  SurAttMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_ATT_MEAN_TAGGER + SUR_ATT_MEAN_MORPHO + SUR_ATT_MEAN_LEMMATIZER_RULES + SUR_ATT_MEAN_PARSER + SUR_ATT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  SurAttMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_ATT_MAX_TAGGER + SUR_ATT_MAX_MORPHO + SUR_ATT_MAX_LEMMATIZER_RULES + SUR_ATT_MAX_PARSER + SUR_ATT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+
+  SurTgtAdd = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_TGT_ADD_TAGGER + SUR_TGT_ADD_MORPHO + SUR_TGT_ADD_LEMMATIZER_RULES + SUR_TGT_ADD_PARSER + SUR_TGT_ADD_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  SurTgtMean = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_TGT_MEAN_TAGGER + SUR_TGT_MEAN_MORPHO + SUR_TGT_MEAN_LEMMATIZER_RULES + SUR_TGT_MEAN_PARSER + SUR_TGT_MEAN_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+  SurTgtMax = lmer(formula(paste(targetCol," ~ FREQ + WD_LEN + SUR_TGT_MAX_TAGGER + SUR_TGT_MAX_MORPHO + SUR_TGT_MAX_LEMMATIZER_RULES + SUR_TGT_MAX_PARSER + SUR_TGT_MAX_SEGMENTER + (1 | TEXT_ID/SUBJECT)",sep="")) , REML=FALSE, data=data)
+
+  print("#ANOVA")
+  print(anova(vanillaModel, EntCurAdd, EntCurMean, EntCurMax, EntAttAdd, EntAttMean, EntAttMax, EntTgtAdd, EntTgtMean, EntTgtMax, SurCurAdd, SurCurMean, SurCurMax, SurAttAdd, SurAttMean, SurAttMax, SurTgtAdd, SurTgtMean, SurTgtMax))
+
+  sink()
+  print(paste("ANOVA", targetCol, dataFile))
+}
+################################################################################################
+
+################################################################################################
+computeCorrelation("outputs/UD_English-EWT/incr_pretrained_times.tsv")
+computeCorrelation("outputs/UD_English-EWT/incr_nopretrained_times.tsv")
+
+
+globalAnova("FIRST_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
+globalAnova("FIRST_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
+globalAnova("FIRST_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
+globalAnova("FIRST_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")
+
+
+globalAnova("TOTAL_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
+globalAnova("TOTAL_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
+globalAnova("TOTAL_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
+globalAnova("TOTAL_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")
+
+
+globalSummary("FIRST_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
+globalSummary("FIRST_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
+globalSummary("FIRST_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
+globalSummary("FIRST_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")
+
+
+globalSummary("TOTAL_TIME","outputs/UD_English-EWT/incr_nopretrained_times.tsv")
+globalSummary("TOTAL_TIME","outputs/UD_English-EWT/incr_pretrained_times.tsv")
+globalSummary("TOTAL_TIME","outputs/UD_English-EWT/seq_nopretrained_times.tsv")
+globalSummary("TOTAL_TIME","outputs/UD_English-EWT/seq_pretrained_times.tsv")
+
-- 
GitLab