From b2f5a650eb11ae109eb05c1a217f6f5e3799a2a3 Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@gmail.com>
Date: Thu, 31 Jul 2014 15:32:26 +0200
Subject: [PATCH] fix paths, add step 4

---
 00-install-deps.sh                  |  6 +++--
 02-create-more-data-structures.sh   |  4 +++
 03-train-frame-identification.sh    | 12 ++++++---
 04-train-argument-identification.sh | 38 +++++++++++++++++++++++++++++
 config.sh                           |  6 ++---
 5 files changed, 58 insertions(+), 8 deletions(-)
 create mode 100755 04-train-argument-identification.sh

diff --git a/00-install-deps.sh b/00-install-deps.sh
index cfac195..2d8a28f 100755
--- a/00-install-deps.sh
+++ b/00-install-deps.sh
@@ -4,7 +4,9 @@
 
 rm -rf deps/semafor 
 
-git clone https://github.com/sammthomson/semafor.git deps/semafor
+#git clone https://github.com/sammthomson/semafor.git deps/semafor
+wget https://semafor-semantic-parser.googlecode.com/files/SEMAFOR-2.1.tgz -O SEMAFOR-2.1.tgz
+mkdir -p deps && tar -C deps -xf SEMAFOR-2.1.tgz
 
-javac -cp ${classpath} deps/semafor/edu/cmu/cs/lti/ark/fn/identification/*.java
+javac -cp ${classpath} $SEMAFOR_HOME/edu/cmu/cs/lti/ark/fn/identification/*.java
 
diff --git a/02-create-more-data-structures.sh b/02-create-more-data-structures.sh
index 59550e6..75c39e2 100755
--- a/02-create-more-data-structures.sh
+++ b/02-create-more-data-structures.sh
@@ -2,6 +2,8 @@
 
 . config.sh
 
+pushd $SEMAFOR_HOME
+
 ${jhome}/java -classpath ${classpath}:. -Xms2g -Xmx2g edu.cmu.cs.lti.ark.fn.identification.RequiredDataCreation \
     stopwords-file:${SEMAFOR_HOME}/stopwords.txt \
     wordnet-configfile:${SEMAFOR_HOME}/file_properties.xml \
@@ -14,3 +16,5 @@ ${jhome}/java -classpath ${classpath}:. -Xms2g -Xmx2g edu.cmu.cs.lti.ark.fn.iden
     revisedmapfile:${datadir}/revisedrelmap.ser \
     lemmacachefile:${datadir}/hvlemmas.ser \
     fnidreqdatafile:${datadir}/reqData.jobj
+
+popd
diff --git a/03-train-frame-identification.sh b/03-train-frame-identification.sh
index 60e21c7..77853b9 100755
--- a/03-train-frame-identification.sh
+++ b/03-train-frame-identification.sh
@@ -22,7 +22,10 @@ $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \
     numthreads:4
 
 # step 2: combine alphabets?
-$jhome/java -classpath ${classpath} -Xms8G -Xms8G edu.cmu.cs.lti.ark.fn.identification.CombineAlphabets ${datadir} ${datadir}/alphabet.dat
+$jhome/java -classpath ${classpath} -Xms8G -Xmx8G \
+    edu.cmu.cs.lti.ark.fn.identification.CombineAlphabets \
+    ${datadir} \
+    ${datadir}/alphabet.dat
 
 # step 3: creating feature events
 $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \
@@ -40,7 +43,7 @@ $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \
     numthreads:4
 
 # step 4: traning the frame identification model
-mkdir ${datadir}/models_0.0
+mkdir -p ${datadir}/models_0.0
 $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \
     edu.cmu.cs.lti.ark.fn.identification.TrainBatchModelDerThreaded \
     alphabetfile:${datadir}/alphabet.dat \
@@ -53,5 +56,8 @@ $jhome/java -classpath ${classpath} -Xms8G -Xmx8G \
     numthreads:8
 
 # step 5: convert alphabet files
-$jhome/java -classpath ${classpath} -Xms8G -Xms8G edu.cmu.cs.lti.ark.fn.identification.ConvertAlphabetFile ${datadir}/alphabet.dat ${datadir}/models_0.0/idmodel.dat ${datadir}/idmodel.dat
+$jhome/java -classpath ${classpath} -Xms8G -Xmx8G edu.cmu.cs.lti.ark.fn.identification.ConvertAlphabetFile \
+    ${datadir}/alphabet.dat \
+    ${datadir}/models_0.0/idmodel.dat \
+    ${datadir}/idmodel.dat
 
diff --git a/04-train-argument-identification.sh b/04-train-argument-identification.sh
new file mode 100755
index 0000000..c49788a
--- /dev/null
+++ b/04-train-argument-identification.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+. config.sh
+
+# -p: config.sh enables "set -e", so a plain mkdir would abort a re-run when the directory already exists
+mkdir -p ${datadir}/scan
+
+# step 1: Alphabet Creation
+$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.CreateAlphabet \
+    ${fefile}.frame.elements \
+    ${fefile}.all.lemma.tags \
+    ${datadir}/scan/cv.train.events.bin \
+    ${datadir}/scan/parser.conf.unlabeled \
+    ${datadir}/scan/cv.train.sentences.frame.elements.spans \
+    true \
+    false \
+    1 \
+    null \
+    ${datadir}/framenet.frame.element.map
+
+# step 2: Caching Feature Vectors
+$jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.FrameFeaturesCache \
+    eventsfile:${datadir}/scan/cv.train.events.bin \
+    spansfile:${datadir}/scan/cv.train.sentences.frame.elements.spans \
+    train-framefile:${fefile}.frame.elements \
+    localfeaturescache:${datadir}/scan/featurecache.jobj
+
+# step 3: training
+$jhome/java -classpath ${classpath} -Xms8000m -Xmx8000m edu.cmu.cs.lti.ark.fn.parsing.TrainingBatchMain \
+    model:${datadir}/argmodel.dat \
+    alphabetfile:${datadir}/scan/parser.conf.unlabeled \
+    localfeaturescache:${datadir}/scan/featurecache.jobj \
+    train-framefile:${fefile}.frame.elements \
+    regularization:reg \
+    lambda:0.1 \
+    numthreads:4 \
+    binaryoverlapfactor:false
+
diff --git a/config.sh b/config.sh
index 341d1d8..405579a 100644
--- a/config.sh
+++ b/config.sh
@@ -2,9 +2,9 @@
 
 set -u -e -o pipefail
 
-SEMAFOR_HOME=$PWD/deps/semafor
-classpath=${SEMAFOR_HOME}/lib/semafor-deps.jar
-datadir=data/training
+SEMAFOR_HOME=$PWD/deps/semafor-semantic-parser
+classpath=${SEMAFOR_HOME}/lib/semafor-deps.jar:${SEMAFOR_HOME}
+datadir=$PWD/data
 framenet=/storage/raid1/corpora/fndata-1.5
 luxmldir=$framenet/lu
 jhome=/usr/bin/
-- 
GitLab