diff --git a/00-install-deps.sh b/00-install-deps.sh index d2c5e170340395c12f63d683d3ed00de3287b66d..bf6f6fdeaa7f36d2c9b45f99d4482122923bcd93 100755 --- a/00-install-deps.sh +++ b/00-install-deps.sh @@ -4,12 +4,19 @@ rm -rf deps/semafor -#git clone https://github.com/sammthomson/semafor.git deps/semafor +# download semafor wget https://semafor-semantic-parser.googlecode.com/files/SEMAFOR-2.1.tgz -O deps/SEMAFOR-2.1.tgz tar -C deps -xf deps/SEMAFOR-2.1.tgz +# alternate source (note that some programs work differently): +#git clone https://github.com/sammthomson/semafor.git deps/semafor + +# compile required classes javac -cp ${classpath} $SEMAFOR_HOME/edu/cmu/cs/lti/ark/fn/{identification,parsing}/*.java # make sure macaon is installed which maca_crf_barebones_decoder which maca_graph_parser + +# make sure python-nltk is installed +python -c 'import nltk' diff --git a/04-train-argument-identification.sh b/04-train-argument-identification.sh index c418ec2c07a34796c0f538ca6e4f104e836e8800..4f05d5519d50a87e30cd0769d18df0ad2e8293a8 100755 --- a/04-train-argument-identification.sh +++ b/04-train-argument-identification.sh @@ -2,6 +2,7 @@ . config.sh +# bring in needed files mkdir -p lrdata ln -sf ${SEMAFOR_HOME}/file_properties.xml . ln -sf ${SEMAFOR_HOME}/dict . 
@@ -9,25 +10,25 @@ ln -sf ${SEMAFOR_HOME}/stopwords.txt lrdata mkdir -p ${datadir}/scan # step 1: Alphabet Creation -#$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.CreateAlphabet \ -# ${fefile}.frame.elements \ -# ${fefile}.all.lemma.tags \ -# ${datadir}/scan/cv.train.events.bin \ -# ${datadir}/scan/parser.conf.unlabeled \ -# ${datadir}/scan/cv.train.sentences.frame.elements.spans \ -# true \ -# false \ -# 1 \ -# null \ -# ${datadir}/framenet.frame.element.map +$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.CreateAlphabet \ + ${fefile}.frame.elements \ + ${fefile}.all.lemma.tags \ + ${datadir}/scan/cv.train.events.bin \ + ${datadir}/scan/parser.conf.unlabeled \ + ${datadir}/scan/cv.train.sentences.frame.elements.spans \ + true \ + false \ + 1 \ + null \ + ${datadir}/framenet.frame.element.map # step 2: Caching Feature Vectors -#$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.FrameFeaturesCache \ -# eventsfile:${datadir}/scan/cv.train.events.bin \ -# spansfile:${datadir}/scan/cv.train.sentences.frame.elements.spans \ -# train-framefile:${fefile}.frame.elements \ -# localfeaturescache:${datadir}/scan/featurecache.jobj +$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.FrameFeaturesCache \ + eventsfile:${datadir}/scan/cv.train.events.bin \ + spansfile:${datadir}/scan/cv.train.sentences.frame.elements.spans \ + train-framefile:${fefile}.frame.elements \ + localfeaturescache:${datadir}/scan/featurecache.jobj # step 3: training $jhome/java -classpath ${classpath} -Xms8000m -Xmx8000m edu.cmu.cs.lti.ark.fn.parsing.TrainingBatchMain \ diff --git a/README b/README index 6995b5cf91675a67259b7e73767f3a7083ada463..c139c39e3aee435654087c69386121c4766c5602 100644 --- a/README +++ b/README @@ -5,9 +5,14 @@ Benoit Favre <benoit.favre@lif.univ-mrs.fr> inspired by http://www.ark.cs.cmu.edu/SEMAFOR/training/ and 
https://github.com/sammthomson/semafor/tree/master/training +0) Brew lots of coffee because retraining semafor is really long (>24h, 15G of disc) + 1) First, edit the config.sh file in order to reflect your setup. -2) Make sure you have macaon installed and available in path, or change the 01-parse-data.sh script to use your own dependency parser +2) Make sure you have the dependencies installed: + - java 1.6 + - macaon, or change the 01-parse-data.sh script to use your own dependency parser + - nltk with wordnet 3) Run commands in that order @@ -17,6 +22,6 @@ inspired by http://www.ark.cs.cmu.edu/SEMAFOR/training/ and https://github.com/s 03-train-frame-identification.sh 04-train-argument-identification.sh -4) Brew lots of coffee because those commands are really long (~24h, 15G of disc) +4) Enjoy! Models will be ready in data/idmodel.dat and data/argmodel.dat diff --git a/scripts/conll07_to_tags.py b/scripts/conll07_to_tags.py index 85cfd2f004e650613f36b97d7910b78e0fd8b924..4f01f574581d58c0786c4a6374e059a3199c5bff 100644 --- a/scripts/conll07_to_tags.py +++ b/scripts/conll07_to_tags.py @@ -8,7 +8,19 @@ def process_sentence(words): output.extend([x[7] for x in words]) # dependency labels output.extend([x[6] for x in words]) # dependency parent output.extend(['O' for x in words]) # dummy slot for NEs - output.extend([wn.morphy(x[1].decode('utf-8')) for x in words]) # lemma from wordnet + lemmas = [wn.morphy(x[1].lower().decode('utf-8')) for x in words] # lemma from wordnet + mapping = {'NN': 'n', 'NNS': 'n', 'JJ': 'a', 'JJS': 'a', 'JJR': 'a', 'RB': 'r', 'RBR': 'r', 'RBS': 'r', 'VB': 'v', 'VBD': 'v', 'VBN': 'v', 'VBG': 'v', 'VBP': 'v', 'VBZ': 'v'} + for tokens in words: + word = tokens[1].lower() + tag = mapping[tokens[3]] if tokens[3] in mapping else '' + if tokens[3] in mapping: + lemma = wn.morphy(word.decode('utf-8'), mapping[tokens[3]]) + else: + lemma = wn.morphy(word.decode('utf-8')) + if lemma != None: + output.append(lemma) + else: + output.append(word) print
'\t'.join([str(x) for x in output]) lines = []