diff --git a/00-install-deps.sh b/00-install-deps.sh new file mode 100755 index 0000000000000000000000000000000000000000..cfac19544efd5c4293461d8cd5ecbb524eb137ec --- /dev/null +++ b/00-install-deps.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +. config.sh + +rm -rf deps/semafor + +git clone https://github.com/sammthomson/semafor.git deps/semafor + +javac -cp ${classpath} deps/semafor/edu/cmu/cs/lti/ark/fn/identification/*.java + diff --git a/01-parse-data.sh b/01-parse-data.sh new file mode 100755 index 0000000000000000000000000000000000000000..1fc2fb238bf572ea56248c922c23e6d5ee7eb3da --- /dev/null +++ b/01-parse-data.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +. config.sh + +python2 scripts/fulltext_to_elements.py $framenet/fulltext/*.xml > $fefile.frame.elements +python2 scripts/fulltext_to_conll07.py $framenet/fulltext/*.xml > $fefile.conll07 +scripts/run-parser.sh $fefile.conll07 +python2 scripts/conll07_to_tags.py < $fefile.conll07.parsed > $fefile.all.lemma.tags diff --git a/02-create-more-data-structures.sh b/02-create-more-data-structures.sh new file mode 100755 index 0000000000000000000000000000000000000000..59550e6c6e1959760ba321ce915478c2680cb22b --- /dev/null +++ b/02-create-more-data-structures.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +. config.sh + +${jhome}/java -classpath ${classpath}:. 
-Xms2g -Xmx2g edu.cmu.cs.lti.ark.fn.identification.RequiredDataCreation \ + stopwords-file:${SEMAFOR_HOME}/stopwords.txt \ + wordnet-configfile:${SEMAFOR_HOME}/file_properties.xml \ + framenet-mapfile:${datadir}/framenet.original.map \ + luxmldir:${luxmldir} \ + allrelatedwordsfile:${datadir}/allrelatedwords.ser \ + hvcorrespondencefile:${datadir}/hvmap.ser \ + wnrelatedwordsforwordsfile:${datadir}/wnallrelwords.ser \ + wnmapfile:${datadir}/wnMap.ser \ + revisedmapfile:${datadir}/revisedrelmap.ser \ + lemmacachefile:${datadir}/hvlemmas.ser \ + fnidreqdatafile:${datadir}/reqData.jobj diff --git a/03-train-frame-identification.sh b/03-train-frame-identification.sh new file mode 100755 index 0000000000000000000000000000000000000000..60e21c727739817fceb863901fe38711e4430fc4 --- /dev/null +++ b/03-train-frame-identification.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +. config.sh + +mkdir -p ${datadir}/events +rm -rf ${datadir}/log +end=`cat ${fefile}.frame.elements | wc -l` + +# step 1: alphabet creation +$jhome/java -classpath ${classpath} -Xms8G -Xmx8G \ + edu.cmu.cs.lti.ark.fn.identification.AlphabetCreationThreaded \ + train-fefile:${fefile}.frame.elements \ + train-parsefile:${fefile}.all.lemma.tags \ + stopwords-file:${SEMAFOR_HOME}/stopwords.txt \ + wordnet-configfile:${SEMAFOR_HOME}/file_properties.xml \ + fnidreqdatafile:${datadir}/reqData.jobj \ + logoutputfile:${datadir}/log \ + model:${datadir}/alphabet.dat \ + eventsfile:${datadir}/events \ + startindex:0 \ + endindex:${end} \ + numthreads:4 + +# step 2: combine alphabets? 
+$jhome/java -classpath ${classpath} -Xms8G -Xmx8G edu.cmu.cs.lti.ark.fn.identification.CombineAlphabets ${datadir} ${datadir}/alphabet.dat + +# step 3: creating feature events +$jhome/java -classpath ${classpath} -Xms8G -Xmx8G \ + edu.cmu.cs.lti.ark.fn.identification.CreateEventsUnsupported \ + train-fefile:${fefile}.frame.elements \ + train-parsefile:${fefile}.all.lemma.tags \ + stopwords-file:${SEMAFOR_HOME}/stopwords.txt \ + wordnet-configfile:${SEMAFOR_HOME}/file_properties.xml \ + fnidreqdatafile:${datadir}/reqData.jobj \ + logoutputfile:${datadir}/log \ + model:${datadir}/alphabet.dat \ + eventsfile:${datadir}/events \ + startindex:0 \ + endindex:${end} \ + numthreads:4 + +# step 4: training the frame identification model +mkdir ${datadir}/models_0.0 +$jhome/java -classpath ${classpath} -Xms8G -Xmx8G \ + edu.cmu.cs.lti.ark.fn.identification.TrainBatchModelDerThreaded \ + alphabetfile:${datadir}/alphabet.dat \ + eventsfile:${datadir}/events \ + model:${datadir}/models_0.0/idmodel.dat \ + regularization:reg \ + lambda:0.0 \ + restartfile:null \ + logoutputfile:${datadir}/log \ + numthreads:8 + +# step 5: convert alphabet files +$jhome/java -classpath ${classpath} -Xms8G -Xmx8G edu.cmu.cs.lti.ark.fn.identification.ConvertAlphabetFile ${datadir}/alphabet.dat ${datadir}/models_0.0/idmodel.dat ${datadir}/idmodel.dat + diff --git a/config.sh b/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..341d1d8b369cfc3e35fc3f57d011f1285d7a7faf --- /dev/null +++ b/config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -u -e -o pipefail + +SEMAFOR_HOME=$PWD/deps/semafor +classpath=${SEMAFOR_HOME}/lib/semafor-deps.jar +datadir=data/training +framenet=/storage/raid1/corpora/fndata-1.5 +luxmldir=$framenet/lu +jhome=/usr/bin/ + +fefile=${datadir}/framenet + diff --git a/data/framenet.frame.element.map b/data/framenet.frame.element.map new file mode 100644 index 0000000000000000000000000000000000000000..7a1fa0a376dae47c456b3ef1c811212fffdc87e5 Binary files 
/dev/null and b/data/framenet.frame.element.map differ diff --git a/data/framenet.original.map b/data/framenet.original.map new file mode 100644 index 0000000000000000000000000000000000000000..bf2cd06178c25ad141807fca2ffd5b7dedc5e4df Binary files /dev/null and b/data/framenet.original.map differ diff --git a/docs/data-format-readme.txt b/docs/data-format-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..55136972c3f374220c84917921764123ae894dec --- /dev/null +++ b/docs/data-format-readme.txt @@ -0,0 +1,32 @@ +This is a README about the full text annotations used for training SEMAFOR 2.0 +on FrameNet 1.5 full text annotations +Dipanjan Das +dipanjan@cs.cmu.edu +2/18/2012 +============================================================================== + +1) Of interest are the *.tokenized files which I have automatically tokenized using the Penn Treebank conventions. + +2) Parsed files are *.all.lemma.tags. The format these files follow is: + i) Each line contains a sentence with annotations. + ii) All tokens per line are tab separated. + iii) The first token is the number of words in the sentence (n). + iv) After that come n words. + v) Then come n POS tags. + vi) The third series of n tokens correspond to dependency tree labels for each word's syntactic parent. + vii) The fourth series of n tokens correspond to the index of each syntactic parent (0 is the dummy word, 1 is the first word, and so on). + viii) The fifth series of n tokens are '0'-s. These were there for providing the capability of using NE tags, but right now we don't use them. + ix) The final series of n tokens are lemmas for each word, computed using WordNet. + +3) The full text annotations in FrameNet are in *.frame.elements. The format of these files is: + i) Each line corresponds to one predicate-argument structure; again tokens are tab separated. + ii) The first token counts the number of roles and the frame. E.g., if there are 2 roles, this number will be 3. 
+ iii) The second token is the frame. + iv) The third token is the lexical unit. + v) The fourth token is the token number of the actual target in the sentence (token numbers start with 0). + If the target has multiple words, the token numbers will be series of numbers separated by _. + vi) The fifth token is the actual form of the target in the sentence. + vii) The sixth token is the sentence number in the corresponding *.all.lemma.tags file in which this predicate-argument + structure appears. Again sentence numbers start from 0. + viii) After that come role and span pairs. If the span contains more than one word, the span is denoted by start:end, + where start is the index of the first word in the span, and end is the last one. The word indices again start from 0. diff --git a/docs/semafor-training-readme.txt b/docs/semafor-training-readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..14a2ee3fcd5943b7a38b127a7478152150a64720 --- /dev/null +++ b/docs/semafor-training-readme.txt @@ -0,0 +1,160 @@ +This is a README for training on the FrameNet 1.5 full text annotations +Dipanjan Das +dipanjan@cs.cmu.edu +2/18/2012 +======================================================================= + +Training models for frame-semantic parsing with SEMAFOR is still a very laborious and +clunky set of steps. Your kind patience is required to train the models :-) + +0) Run svn checkout https://semafor-semantic-parser.googlecode.com/svn/trunk/ semafor-semantic-parser. Compile necessary files as you go. + + +1) The first step is to create some data structures which are used to train and test the frame identification and argument identification models (please refer to our NAACL 2010 paper to understand these two steps). The first step is to create two maps -- I name these framenet.original.map and framenet.frame.element.map + + i) The first map is of type THashMap<String, THashSet<String>>. 
It maps a frame to a set of disambiguated predicates + (words along with part of speech tags, but in the style of FrameNet). + ii) The second map is of type THashMap<String,THashSet<String>>, which maps each frame to a set of frame element names. + In other words, this data structure is necessary for the argument identification model to know what + the frame elements are for each frame. + +My versions of these two maps are present in this directory (these are just serialized Java objects). Use the semafor-deps.jar file in lib/ directory of the googlecode repository to get the right version of GNU trove, and read (deserialize) these two maps. After that print the keys, and the corresponding values to see exactly what is stored in these maps. After that, you will need to create your own versions of these two maps for your domain, in exactly the same format as these maps. + +If you want existing code in SEMAFOR to create these maps, you could use the method writeSerializedObject(Object object, String outFile) in edu/cmu/cs/lti/ark/util/SerializedObjects.java to write serialize those maps. So creating your own maps will be easy. You could also read the maps using that class. + + + +2) The next step creates some more data structures used for the training and inference procedure. You will find a class called: edu/cmu/cs/lti/ark/fn/identification/RequiredDataCreation.java. 
Compile it and run the following: + +classpath=${SEMAFOR_HOME}/lib/semafor-deps.jar +datadir=<some directory where you store the data structures, including the two previous maps> +luxmldir=<the directory that contains all the lexical unit xmls for FrameNet 1.5; you can also add your own xmls to this directory; for format information, take a look at the lu/ directory under the FrameNet release> +jhome=<java bin home> + +${jhome}/java -classpath ${classpath} -Xms2g -Xmx2g edu.cmu.cs.lti.ark.fn.identification.RequiredDataCreation \ + stopwords-file:${SEMAFOR_HOME}/stopwords.txt \ + wordnet-configfile:${SEMAFOR_HOME}/file_properties.xml \ + framenet-mapfile:${datadir}/framenet.original.map \ + luxmldir:${luxmldir} \ + allrelatedwordsfile:${datadir}/allrelatedwords.ser \ + hvcorrespondencefile:${datadir}/hvmap.ser \ + wnrelatedwordsforwordsfile:${datadir}/wnallrelwords.ser \ + wnmapfile:${datadir}/wnMap.ser \ + revisedmapfile:${datadir}/revisedrelmap.ser \ + lemmacachefile:${datadir}/hvlemmas.ser \ + fnidreqdatafile:${datadir}/reqData.jobj + + + +3) This step corresponds to training the frame identification model. I will be using the flags declared above. 
+ +# a directory where training events will be stored for efficiency +mkdir ${datadir}/events +fefile=${datadir}/cv.train.sentences.frame.elements +end=`wc -l ${fefile}` +end=`expr ${end% *}` + + + i) + # step 1: alphabet creation + $jhome/java -classpath ${classpath} -Xms8000m -Xmx8000m \ + edu.cmu.cs.lti.ark.fn.identification.AlphabetCreationThreaded \ + train-fefile:${datadir}/cv.train.sentences.frame.elements \ + train-parsefile:${datadir}/cv.train.sentences.all.lemma.tags \ + stopwords-file:${SEMAFOR_HOME}/stopwords.txt \ + wordnet-configfile:${SEMAFOR_HOME}/file_properties.xml \ + fnidreqdatafile:${datadir}/reqData.jobj \ + logoutputfile:${datadir}/log \ + model:${datadir}/alphabet.dat \ + eventsfile:${datadir}/events \ + startindex:0 \ + endindex:${end} \ + numthreads:4 + + + ii) In this step, we are combining the different alphabets each thread above creates. Run edu.cmu.cs.lti.ark.fn.identification.CombineAlphabets with two + arguments - the full path to the directory that contains the alphabet files, and the full path to the name of a combined alphabet file. After that's done, you will have a single alphabet file, with which you will run the following step. + + iii) Creating feature events for each datapoint. 
+ + # step 3: creating feature events + $jhome/java -classpath ${classpath} -Xms8000m -Xmx8000m \ + edu.cmu.cs.lti.ark.fn.identification.CreateEventsUnsupported \ + train-fefile:${datadir}/cv.train.sentences.frame.elements \ + train-parsefile:${datadir}/cv.train.sentences.all.lemma.tags \ + stopwords-file:${SEMAFOR_HOME}/stopwords.txt \ + wordnet-configfile:${SEMAFOR_HOME}/file_properties.xml \ + fnidreqdatafile:${datadir}/reqData.jobj \ + logoutputfile:${datadir}/log \ + model:${datadir}/alphabet.dat \ + eventsfile:${datadir}/events \ + startindex:0 \ + endindex:${end} \ + numthreads:4 + + + + iv) After the above step is done in the ${datadir}/events directory, you will find tons of serialized objects, which are the feature vectors for + each data point. After this events creation is done, you can run training by running: + + # step 4: training the frame identification model + + mkdir ${datadir}/models_0.0 + $jhome/java -classpath ${classpath} -Xms8g -Xmx8g \ + edu.cmu.cs.lti.ark.fn.identification.TrainBatchModelDerThreaded \ + alphabetfile:${datadir}/alphabet.dat \ + eventsfile:${datadir}/events \ + model:${datadir}/models_0.0/idmodel.dat \ + regularization:reg \ + lambda:0.0 \ + restartfile:null \ + logoutputfile:${datadir}/log \ + numthreads:8 + + +The training procedure will run for a long period of time. Line search in L-BFGS may fail at the end, but that does not mean training failed. In models_0.0, there will be models produced every few iterations. If line search failed, take the last model. + +We do not use the format of the model file produced by the above procedure, and convert it to a more usable version. To do that, +use edu.cmu.cs.lti.ark.fn.identification.ConvertAlphabetFile to convert the produced models to the new format. +The class takes three arguments: the alphabet file path, the model file path, and the output model file path. + + +4) This step corresponds to training the argument identification model. 
+ + + mkdir ${datadir}/scan + + # step 1: Alphabet Creation + + $jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.CreateAlphabet \ + ${datadir}/cv.train.sentences.frame.elements \ + ${datadir}/cv.train.sentences.all.lemma.tags \ + ${datadir}/scan/cv.train.events.bin \ + ${datadir}/scan/parser.conf.unlabeled \ + ${datadir}/scan/cv.train.sentences.frame.elements.spans \ + true \ + false \ + 1 \ + null \ + ${datadir}/framenet.frame.element.map + + + # step 2: Caching Feature Vectors + $jhome/java -classpath ${classpath} -Xms4000m -Xmx4000m edu.cmu.cs.lti.ark.fn.parsing.FrameFeaturesCache \ + eventsfile:${datadir}/scan/cv.train.events.bin \ + spansfile:${datadir}/scan/cv.train.sentences.frame.elements.spans \ + train-framefile:${datadir}/cv.train.sentences.frame.elements \ + localfeaturescache:${datadir}/scan/featurecache.jobj + + # step 3: training + $jhome/java -classpath ${classpath} -Xms8000m -Xmx8000m edu.cmu.cs.lti.ark.fn.parsing.TrainingBatchMain \ + model:${datadir}/argmodel.dat \ + alphabetfile:${datadir}/scan/parser.conf.unlabeled \ + localfeaturescache:${datadir}/scan/featurecache.jobj \ + train-framefile:${datadir}/cv.train.sentences.frame.elements \ + regularization:reg \ + lambda:0.1 \ + numthreads:4 \ + binaryoverlapfactor:false + +You may tune lambda on a development set to get the best results. 
\ No newline at end of file diff --git a/scripts/conll07_to_tags.py b/scripts/conll07_to_tags.py new file mode 100644 index 0000000000000000000000000000000000000000..85cfd2f004e650613f36b97d7910b78e0fd8b924 --- /dev/null +++ b/scripts/conll07_to_tags.py @@ -0,0 +1,21 @@ +import sys +from nltk.corpus import wordnet as wn + +def process_sentence(words): + output = [len(words)] + output.extend([x[1] for x in words]) # word forms + output.extend([x[3] for x in words]) # tags + output.extend([x[7] for x in words]) # dependency labels + output.extend([x[6] for x in words]) # dependency parent + output.extend(['O' for x in words]) # dummy slot for NEs + output.extend([wn.morphy(x[1].decode('utf-8')) for x in words]) # lemma from wordnet + print '\t'.join([str(x) for x in output]) + +lines = [] +for line in sys.stdin: + line = line.strip() + if line == '': + process_sentence(lines) + lines = [] + else: + lines.append(line.split()) diff --git a/scripts/fulltext_to_conll07.py b/scripts/fulltext_to_conll07.py new file mode 100644 index 0000000000000000000000000000000000000000..8220dbb49cf23d38a071f3f131952cbd08461c3b --- /dev/null +++ b/scripts/fulltext_to_conll07.py @@ -0,0 +1,43 @@ +import sys, re +import xml.etree.ElementTree as ET +from fulltext_to_elements import find, find_first + +ns = '{http://framenet.icsi.berkeley.edu}' + +# find a namespace-prefixed xml element, optionally filtered by attrib name/value +def find(element, path, attrib = {}): + path = re.sub(r'/([^/.])', '/' + ns + r'\1', path) + output = [] + for node in element.findall(path): + skip = False + for name, value in attrib.items(): + if name not in node.attrib or (value != None and node.attrib[name] != value): + skip = True + break + if not skip: + output.append(node) + return output + +def find_first(element, path, attrib = {}): + output = find(element, path, attrib) + return output[0] if len(output) > 0 else None + +sentence_id = 0 + +def process_fulltext_xml(filename): + global sentence_id + fp = 
open(filename) + root = ET.parse(fp).getroot() + fp.close() + + for sentence in find(root, './/sentence'): + text = find_first(sentence, './text').text + for word_id, word in enumerate(text.strip().split()): + print '\t'.join([str(word_id + 1), word] + ['_'] * 10) + print + sentence_id += 1 + +if __name__ == '__main__': + for filename in sorted(sys.argv[1:]): + print >>sys.stderr, filename + process_fulltext_xml(filename) diff --git a/scripts/fulltext_to_elements.py b/scripts/fulltext_to_elements.py new file mode 100644 index 0000000000000000000000000000000000000000..00723472ac543bdf57d92f02d8fce4c39c5daa30 --- /dev/null +++ b/scripts/fulltext_to_elements.py @@ -0,0 +1,82 @@ +import sys, re +import xml.etree.ElementTree as ET + +ns = '{http://framenet.icsi.berkeley.edu}' + +# find a namespace-prefixed xml element, optionally filtered by attrib name/value +def find(element, path, attrib = {}): + path = re.sub(r'/([^/.])', '/' + ns + r'\1', path) + output = [] + for node in element.findall(path): + skip = False + for name, value in attrib.items(): + if name not in node.attrib or (value != None and node.attrib[name] != value): + skip = True + break + if not skip: + output.append(node) + return output + +def find_first(element, path, attrib = {}): + output = find(element, path, attrib) + return output[0] if len(output) > 0 else None + +def token_index(text): + index = [] + token = 0 + in_separator = False + for character in text: + if character != ' ': + if in_separator: + in_separator = False + token += 1 + index.append(token) + else: + index.append(-1) + in_separator = True + return index + + +sentence_id = 0 + +def process_fulltext_xml(filename): + global sentence_id + fp = open(filename) + root = ET.parse(fp).getroot() + fp.close() + + for sentence in find(root, './/sentence'): + text = find_first(sentence, './text').text + index = token_index(text) + #print text + tokens = text.strip().split() + for annotation in find(sentence, 
".//annotationSet[@status='MANUAL']"): + if 'luName' in annotation.attrib: + lexical_unit_name = annotation.attrib['luName'] + frame_name = annotation.attrib['frameName'] + + target_label_node = find_first(annotation, ".//label", {'name': 'Target'}) + if target_label_node == None: + continue + target_start, target_end = int(target_label_node.attrib['start']), int(target_label_node.attrib['end']) + target = text[target_start: target_end + 1] + target_token_number = '_'.join([str(x) for x in range(index[target_start], index[target_end] + 1)]) + + #print ' ', frame_name, lexical_unit_name, target, target_token_number + frame_elements = [] + for frame_element_node in find(annotation, "./layer[@name='FE']/label[@start]"): + frame_element = frame_element_node.attrib['name'] + frame_element_start, frame_element_end = int(frame_element_node.attrib['start']), int(frame_element_node.attrib['end']) + #print ' ', frame_element, index[frame_element_start], index[frame_element_end], text[frame_element_start: frame_element_end] + frame_elements.append(frame_element) + frame_elements.append('%d:%d' % (index[frame_element_start], index[frame_element_end])) + + output = [len(frame_elements) / 2 + 1, frame_name, lexical_unit_name, target_token_number, target, sentence_id] + output.extend(frame_elements) + print '\t'.join([str(x) for x in output]) + sentence_id += 1 + +if __name__ == '__main__': + for filename in sorted(sys.argv[1:]): + print >>sys.stderr, filename + process_fulltext_xml(filename) diff --git a/scripts/run-parser.sh b/scripts/run-parser.sh new file mode 100755 index 0000000000000000000000000000000000000000..8d6b3cc8bdf8bff7deee9ee1573e5c6817f5422f --- /dev/null +++ b/scripts/run-parser.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +MACAON_DIR=$HOME/work/macaon/macaon-for-uw_20140710/models/common/ +MODEL=$MACAON_DIR/../parser/ptb-order1/maca_graph_parser_model1_en +TAGGER=$HOME/work/macaon/git-maca_data/en/maca_crf_tagger/ + +input=$1 + +maca_crf_barebones_decoder --conll07 
$TAGGER/crf_tagger_model_en.bin $TAGGER/crf_tagger_wordtag_lexicon_en.bin < $input > $input.tagged +maca_graph_parser -M e -C en -c $input.parsed -m $MODEL.bin -a $MODEL.alpha -d $MODEL.dep_count -x -v 1 -z $input.parsed