diff --git a/00-install-deps.sh b/00-install-deps.sh index 2d8a28fc0c22e504acd1fd24759cf07d38a5ca86..6a612c1300c1170d2fdeabb396191f58a97edba4 100755 --- a/00-install-deps.sh +++ b/00-install-deps.sh @@ -8,5 +8,5 @@ rm -rf deps/semafor wget https://semafor-semantic-parser.googlecode.com/files/SEMAFOR-2.1.tgz -O SEMAFOR-2.1.tgz tar -C deps -xf SEMAFOR-2.1.tgz -javac -cp ${classpath} $SEMAFOR_HOME/edu/cmu/cs/lti/ark/fn/identification/*.java +javac -cp ${classpath} $SEMAFOR_HOME/edu/cmu/cs/lti/ark/fn/{identification,parsing}/*.java diff --git a/03-train-frame-identification.sh b/03-train-frame-identification.sh index 77853b9ba800f30c5ecd06ba7193ed5098708210..5b0771b9be0fe7a9fb81f391fc1a422fd0c0f8f4 100755 --- a/03-train-frame-identification.sh +++ b/03-train-frame-identification.sh @@ -27,6 +27,7 @@ $jhome/java -classpath ${classpath} -Xms8G -Xms8G \ ${datadir} \ ${datadir}/alphabet.dat +rm -rf ${datadir}/log # step 3: creating feature events $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \ edu.cmu.cs.lti.ark.fn.identification.CreateEventsUnsupported \ @@ -42,6 +43,7 @@ $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \ endindex:${end} \ numthreads:4 +rm -rf ${datadir}/log # step 4: traning the frame identification model mkdir -p ${datadir}/models_0.0 $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \ @@ -56,8 +58,9 @@ $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \ numthreads:8 # step 5: convert alphabet files +model=`ls ${datadir}/models_0.0/idmodel.dat_* | sort -n -k3 -t_ | tail -1` $jhome/java -classpath ${classpath} -Xms8G -Xms8G edu.cmu.cs.lti.ark.fn.identification.ConvertAlphabetFile \ ${datadir}/alphabet.dat \ - ${datadir}/models_0.0/idmodel.dat \ + $model \ ${datadir}/idmodel.dat diff --git a/04-train-argument-identification.sh b/04-train-argument-identification.sh index c49788a0af66defbc0235a6899b1202d4848ac45..c418ec2c07a34796c0f538ca6e4f104e836e8800 100755 --- a/04-train-argument-identification.sh +++ b/04-train-argument-identification.sh @@ -2,32 +2,36 @@ . config.sh -mkdir ${datadir}/scan +mkdir -p lrdata +ln -sf ${SEMAFOR_HOME}/file_properties.xml . +ln -sf ${SEMAFOR_HOME}/dict . +ln -sf ${SEMAFOR_HOME}/stopwords.txt lrdata +mkdir -p ${datadir}/scan # step 1: Alphabet Creation -$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.CreateAlphabet \ - ${fefile}.frame.elements \ - ${fefile}.all.lemma.tags \ - ${datadir}/scan/cv.train.events.bin \ - ${datadir}/scan/parser.conf.unlabeled \ - ${datadir}/scan/cv.train.sentences.frame.elements.spans \ - true \ - false \ - 1 \ - null \ - ${datadir}/framenet.frame.element.map +#$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.CreateAlphabet \ +# ${fefile}.frame.elements \ +# ${fefile}.all.lemma.tags \ +# ${datadir}/scan/cv.train.events.bin \ +# ${datadir}/scan/parser.conf.unlabeled \ +# ${datadir}/scan/cv.train.sentences.frame.elements.spans \ +# true \ +# false \ +# 1 \ +# null \ +# ${datadir}/framenet.frame.element.map # step 2: Caching Feature Vectors -$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.FrameFeaturesCache \ - eventsfile:${datadir}/scan/cv.train.events.bin \ - spansfile:${datadir}/scan/cv.train.sentences.frame.elements.spans \ - train-framefile:${fefile}.frame.elements \ - localfeaturescache:${datadir}/scan/featurecache.jobj +#$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.FrameFeaturesCache \ +# eventsfile:${datadir}/scan/cv.train.events.bin \ +# spansfile:${datadir}/scan/cv.train.sentences.frame.elements.spans \ +# train-framefile:${fefile}.frame.elements \ +# localfeaturescache:${datadir}/scan/featurecache.jobj # step 3: training $jhome/java -classpath ${classpath} -Xms8000m -Xmx8000m edu.cmu.cs.lti.ark.fn.parsing.TrainingBatchMain \ - model:${datadir}/argmodel.dat \ + model:${datadir}/argmodel.dat \ alphabetfile:${datadir}/scan/parser.conf.unlabeled \ localfeaturescache:${datadir}/scan/featurecache.jobj \ train-framefile:${fefile}.frame.elements \ diff --git a/scripts/fulltext_to_elements.py b/scripts/fulltext_to_elements.py index 00723472ac543bdf57d92f02d8fce4c39c5daa30..0790d6e8807e0e0c936ed756091d5fbab994f30a 100644 --- a/scripts/fulltext_to_elements.py +++ b/scripts/fulltext_to_elements.py @@ -60,6 +60,10 @@ def process_fulltext_xml(filename): continue target_start, target_end = int(target_label_node.attrib['start']), int(target_label_node.attrib['end']) target = text[target_start: target_end + 1] + while index[target_start] == -1: + target_start += 1 + while index[target_end] == -1: + target_end -=1 target_token_number = '_'.join([str(x) for x in range(index[target_start], index[target_end] + 1)]) #print ' ', frame_name, lexical_unit_name, target, target_token_number @@ -68,6 +72,10 @@ def process_fulltext_xml(filename): frame_element = frame_element_node.attrib['name'] frame_element_start, frame_element_end = int(frame_element_node.attrib['start']), int(frame_element_node.attrib['end']) #print ' ', frame_element, index[frame_element_start], index[frame_element_end], text[frame_element_start: frame_element_end] + while index[frame_element_start] == -1: + frame_element_start += 1 + while index[frame_element_end] == -1: + frame_element_end -= 1 frame_elements.append(frame_element) frame_elements.append('%d:%d' % (index[frame_element_start], index[frame_element_end]))