Something went wrong on our end
-
Franck Dary authored
Makefile 1.44 KiB
include ../config
# Directory holding the shared helper scripts, relative to this Makefile.
SCRIPTS := ../../../../scripts
# Converter script handed to getRawText.py as its first argument.
CONLL2TXT := $(SCRIPTS)/conllu_to_text.pl

# Corpus files, looked up once at parse time. Simple (:=) assignment is used so
# that `find` is not re-executed every time one of these lists is expanded.
# CORPUS is not defined in this file; presumably it comes from ../config.
TRAIN_FILES := $(shell find $(CORPUS) -type f -name '*train*.conllu')
DEV_FILES := $(shell find $(CORPUS) -type f -name '*dev*.conllu')
TEST_FILES := $(shell find $(CORPUS) -type f -name '*test*.conllu')

# This part is for lemmatizer rules and exceptions computation.
# NOTE(review): THRESHOLD is not referenced by any rule visible in this file.
THRESHOLD := 10
FPLM_FILENAME := fplm
# Top-level target: builds tokenizer.ts and segmenter.ts, runs the `texts`,
# `transitions` and `pretrain` rules, then removes the scratch files
# (col_*.txt and the concatenated all_no_test.conllu).
# Because all_no_test.conllu is deleted here, it is recreated on the next run,
# which makes every rule depending on it run again.
# Declared phony so that a stray file named `all` cannot disable the rule.
.PHONY: all
all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
	rm -f col_*.txt
	rm -f all_no_test.conllu
# Concatenation of every train and dev .conllu file (test files are left out).
# The source files are declared as prerequisites so the concatenation is
# refreshed when any of them changes.
all_no_test.conllu: $(TRAIN_FILES) $(DEV_FILES)
	@# With an empty file list `cat` would block reading stdin; fail loudly instead.
	$(if $(strip $^),,$(error no train/dev .conllu files found under CORPUS='$(CORPUS)'))
	@# The variables are used instead of $^ to keep the original order and duplicates.
	cat $(TRAIN_FILES) $(DEV_FILES) > $@
# Produces three files:
#   splitwords.ts   - stdout of conllu2splits.py, every line prefixed with
#                     "<tokenizer> "
#   ambiguities.txt - stderr of conllu2splits.py
#   tokenizer.ts    - the fixed lines ENDWORD, SPLIT 0..7, ADDCHARTOWORD and
#                     IGNORECHAR, every line prefixed with "<tokenizer> "
# splitwords.ts and ambiguities.txt are side outputs that make does not track.
# The target itself is written last and in a single command, so a failure of
# the helper script or of sed cannot leave a half-built tokenizer.ts that make
# would consider up to date.
# NOTE(review): `sed -i -e` as used here relies on GNU sed semantics.
tokenizer.ts: all_no_test.conllu
	$(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt
	sed -i -e 's/^/<tokenizer> /' splitwords.ts
	printf '<tokenizer> %s\n' \
		'ENDWORD' \
		$(foreach n,0 1 2 3 4 5 6 7,'SPLIT $(n)') \
		'ADDCHARTOWORD' \
		'IGNORECHAR' > $@
# Writes the two fixed lines "EOS b.0" and "NOTHING", each prefixed with
# "<segmenter> ". The file is produced by a single printf rather than being
# assembled with echo and then edited in place, so it is never left in an
# intermediate, unprefixed state and no `sed -i` is needed.
segmenter.ts:
	printf '<segmenter> %s\n' 'EOS b.0' 'NOTHING' > $@
# Runs the local getTransitionSets.py script on the concatenated train+dev
# corpus (the first and only prerequisite).
# NOTE(review): the files written by the script are not visible from this
# Makefile; the target is deliberately not declared phony in case the script
# creates something named `transitions` - confirm.
transitions: all_no_test.conllu
	./getTransitionSets.py $<
# Runs the local getRawText.py script, passing it the converter script
# followed by every train, dev and test .conllu file.
# NOTE(review): presumably this writes one raw-text file next to each corpus
# file (the `pretrain` rule later looks for '*train*.txt' under CORPUS) - confirm.
texts:
	./getRawText.py $(CONLL2TXT) \
		$(TRAIN_FILES) \
		$(DEV_FILES) \
		$(TEST_FILES)
# Runs the local pretrainEmbeddings.py script on every '*train*.txt' file found
# under CORPUS, followed by the literal argument 64.
# The $(shell find ...) call is kept inside the recipe on purpose: recipes are
# expanded only when they are about to run, i.e. after the `texts`
# prerequisite has been made, so files created by that rule are picked up.
# NOTE(review): 64 is presumably the embedding size - confirm against the script.
pretrain: texts
	./pretrainEmbeddings.py \
		$(shell find $(CORPUS) -type f -name '*train*.txt') \
		64
# Writes the stdout of conllu2fplm.py, run on the concatenated train+dev
# corpus, to the file named by FPLM_FILENAME.
# This target is not a prerequisite of `all`; it has to be requested explicitly.
# If the script fails, the partially written target is removed (and the
# script's exit status is propagated) so that a truncated file is never
# mistaken for an up-to-date one on the next run.
$(FPLM_FILENAME): all_no_test.conllu
	$(SCRIPTS)/conllu2fplm.py $< > $@ || { status=$$?; rm -f $@; exit $$status; }
# Removes every *.txt, *.conll* and *.ts file in this directory, plus the file
# named by FPLM_FILENAME.
# Declared phony so that a stray file named `clean` cannot disable the rule.
# The leading '-' keeps the original best-effort behaviour: a failure of rm
# (e.g. on an unremovable entry) does not make `make clean` fail.
.PHONY: clean
clean:
	-rm -f *.txt *.conll* *.ts $(FPLM_FILENAME)