diff --git a/00-install-deps.sh b/00-install-deps.sh index 6a612c1300c1170d2fdeabb396191f58a97edba4..d2c5e170340395c12f63d683d3ed00de3287b66d 100755 --- a/00-install-deps.sh +++ b/00-install-deps.sh @@ -5,8 +5,11 @@ rm -rf deps/semafor #git clone https://github.com/sammthomson/semafor.git deps/semafor -wget https://semafor-semantic-parser.googlecode.com/files/SEMAFOR-2.1.tgz -O SEMAFOR-2.1.tgz -tar -C deps -xf SEMAFOR-2.1.tgz +wget https://semafor-semantic-parser.googlecode.com/files/SEMAFOR-2.1.tgz -O deps/SEMAFOR-2.1.tgz +tar -C deps -xf deps/SEMAFOR-2.1.tgz javac -cp ${classpath} $SEMAFOR_HOME/edu/cmu/cs/lti/ark/fn/{identification,parsing}/*.java +# make sure macaon is installed +which maca_crf_barebones_decoder +which maca_graph_parser diff --git a/01-parse-data.sh b/01-parse-data.sh index 1fc2fb238bf572ea56248c922c23e6d5ee7eb3da..2238b74609130ebf301601096866dbe151b3839d 100755 --- a/01-parse-data.sh +++ b/01-parse-data.sh @@ -2,7 +2,15 @@ . config.sh -python2 scripts/fulltext_to_elements.py $framenet/fulltext/*.xml > $fefile.frame.elements -python2 scripts/fulltext_to_conll07.py $framenet/fulltext/*.xml > $fefile.conll07 -scripts/run-parser.sh $fefile.conll07 -python2 scripts/conll07_to_tags.py < $fefile.conll07.parsed > $fefile.all.lemma.tags +# convert framenet examples to the format understood by semafor (documented in docs/data-format-readme.txt) +/usr/bin/env python scripts/fulltext_to_elements.py $framenet/fulltext/*.xml > $fefile.frame.elements + +# convert framenet examples to the format understood by macaon +/usr/bin/env python scripts/fulltext_to_conll07.py $framenet/fulltext/*.xml > $fefile.conll07 + +# run macaon tagger+parser +maca_crf_barebones_decoder --conll07 $tagger_model $tagger_dictionary < $fefile.conll07 > $fefile.conll07.tagged +maca_graph_parser -M e -C en -c $fefile.conll07.tagged -m $parser_model.bin -a $parser_model.alpha -d $parser_model.dep_count -x -v 1 -z $fefile.conll07.parsed + +# convert macaon output to the format understood 
by semafor (documented in docs/data-format-readme.txt) +/usr/bin/env python scripts/conll07_to_tags.py < $fefile.conll07.parsed > $fefile.all.lemma.tags diff --git a/README b/README index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6995b5cf91675a67259b7e73767f3a7083ada463 100644 --- a/README +++ b/README @@ -0,0 +1,22 @@ +Instructions for retraining semafor. +==================================== + +Benoit Favre <benoit.favre@lif.univ-mrs.fr> +inspired by http://www.ark.cs.cmu.edu/SEMAFOR/training/ and https://github.com/sammthomson/semafor/tree/master/training + + +1) First, edit the config.sh file in order to reflect your setup. + +2) Make sure you have macaon installed and available in your PATH, or change the 01-parse-data.sh script to use your own dependency parser + +3) Run the following commands in this order: + +00-install-deps.sh +01-parse-data.sh +02-create-more-data-structures.sh +03-train-frame-identification.sh +04-train-argument-identification.sh + +4) Brew lots of coffee because those commands take a really long time (~24h, 15G of disk) + +The models will be ready in data/idmodel.dat and data/argmodel.dat diff --git a/config.sh b/config.sh index 405579a4ac5a968e32a64dc40aec8d34302a11bf..652f77a73ef677a2c8cc79355b593b061027139d 100644 --- a/config.sh +++ b/config.sh @@ -1,13 +1,31 @@ #!/bin/bash +# make bash exit as soon as a command (or any stage of a pipeline) fails or an unset variable is used set -u -e -o pipefail +# === to be modified === + +# location of framenet data in its original xml format +framenet=/storage/raid1/corpora/fndata-1.5 + +# location of javac/java +jhome=/usr/bin/ + +# location of macaon common/config files +export MACAON_DIR=~benoit.favre/work/macaon/maca_data + +# macaon parser model +parser_model=$MACAON_DIR/en/maca_graph_parser/maca_graph_parser_model1_en + +# macaon tagger model +tagger_model=$MACAON_DIR/en/maca_crf_tagger/crf_tagger_model_en.bin +tagger_dictionary=$MACAON_DIR/en/maca_crf_tagger/crf_tagger_wordtag_lexicon_en.bin + + +# === keep the following as is === SEMAFOR_HOME=$PWD/deps/semafor-semantic-parser 
classpath=${SEMAFOR_HOME}/lib/semafor-deps.jar:${SEMAFOR_HOME} datadir=$PWD/data -framenet=/storage/raid1/corpora/fndata-1.5 luxmldir=$framenet/lu -jhome=/usr/bin/ - fefile=${datadir}/framenet