Skip to content
Snippets Groups Projects
Select Git revision
  • b74a8e24b594528dfc789ff62b457b4914f18f7e
  • master default protected
2 results

Makefile

Blame
  • Makefile 1.49 KiB
    # Relative path to the shared helper scripts, and the CoNLL-U to text
    # converter that the `texts` rule hands to getRawText.py.
    SCRIPTS=../../../../scripts
    CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
    
    # Corpus files found below the current directory. ":=" runs each `find`
    # once, when the Makefile is parsed, instead of on every expansion.
    TRAIN_FILES:=$(shell find . -type f -name '*train*.conllu')
    DEV_FILES:=$(shell find . -type f -name '*dev*.conllu')
    # all_no_test.conllu is generated by this Makefile and also matches
    # '*test*.conllu'; exclude it so it is never picked up as a test corpus.
    TEST_FILES:=$(shell find . -type f -name '*test*.conllu' ! -name 'all_no_test.conllu')
    
    # This part is for the computation of lemmatizer rules and exceptions.
    # NOTE(review): THRESHOLD is not referenced by any rule visible here.
    THRESHOLD=10
    FPLM_FILENAME=fplm
    
    # Build every artefact, then delete the temporary train+dev concatenation.
    # Declared phony so that a stray file named "all" can never make this
    # target look up to date.
    .PHONY: all
    all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
    	$(RM) all_no_test.conllu
    
    # Concatenation of every train and dev corpus (test corpora are left out).
    # The corpora are listed as prerequisites so the file is rebuilt when one
    # of them changes. The first recipe line aborts when no corpus was found,
    # because `cat` without file arguments would wait on stdin. The output is
    # written to a temporary file and renamed, so a failed `cat` never leaves
    # a truncated target behind.
    all_no_test.conllu: $(TRAIN_FILES) $(DEV_FILES)
    	$(if $(strip $(TRAIN_FILES) $(DEV_FILES)),,$(error no train/dev .conllu files found))
    	cat $(TRAIN_FILES) $(DEV_FILES) > $@.tmp || { rm -f $@.tmp; exit 1; }
    	mv -f $@.tmp $@
    
    # Transition set of the tokenizer.
    # Step 1: conllu2splits.py reads the train+dev concatenation; its stdout
    #         goes to splitwords.ts and its stderr to ambiguities.txt.
    # Step 2: every line of splitwords.ts is prefixed with "<tokenizer> ".
    # Step 3: the fixed transitions (ENDWORD, SPLIT 0-7, ADDCHARTOWORD 1-6,
    #         IGNORECHAR) are written to the target with the same prefix.
    # The target is written last and in a single command, so a failure of
    # conllu2splits.py can no longer leave a partial tokenizer.ts that looks
    # up to date.
    tokenizer.ts: all_no_test.conllu
    	$(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt
    	sed -i -e 's/^/<tokenizer> /' splitwords.ts
    	{ echo "ENDWORD"; \
    	  for n in 0 1 2 3 4 5 6 7; do echo "SPLIT $$n"; done; \
    	  for n in 1 2 3 4 5 6; do echo "ADDCHARTOWORD $$n"; done; \
    	  echo "IGNORECHAR"; \
    	} | sed -e 's/^/<tokenizer> /' > $@
    
    # Transition set of the segmenter: two fixed entries, "EOS b.0" and
    # "NOTHING", each then prefixed in place with "<segmenter> ".
    segmenter.ts:
    	printf '%s\n' "EOS b.0" "NOTHING" > $@
    	sed -i -e 's/^/<segmenter> /' $@
     
    # Runs getTransitionSets.py on the train+dev concatenation, which is the
    # only prerequisite of this rule.
    # NOTE(review): what the script writes is not visible here; the target is
    # deliberately not marked phony in case it creates "transitions" -- confirm.
    transitions: all_no_test.conllu
    	./getTransitionSets.py $^
    
    # Runs getRawText.py with the path of the CoNLL-U to text converter
    # followed by every train, dev and test corpus.
    # NOTE(review): the files it produces are not visible here -- confirm
    # against the script.
    texts:
    	./getRawText.py $(CONLL2TXT) \
    		$(TRAIN_FILES) \
    		$(DEV_FILES) \
    		$(TEST_FILES)
    
    # Runs pretrainEmbeddings.py on the train corpora with the arguments
    # "64" and "pretrained.w2v".
    # NOTE(review): presumably the embedding size and the output file --
    # confirm against the script.
    pretrain:
    	./pretrainEmbeddings.py \
    		$(TRAIN_FILES) \
    		64 pretrained.w2v
    
    # Captures the stdout of conllu2fplm.py, run on the train+dev
    # concatenation, into the file named by FPLM_FILENAME.
    # The output goes to a temporary file that is renamed on success, so a
    # failing script cannot leave a truncated target that looks up to date.
    $(FPLM_FILENAME): all_no_test.conllu
    	$(SCRIPTS)/conllu2fplm.py $< > $@.tmp || { rm -f $@.tmp; exit 1; }
    	mv -f $@.tmp $@
    
    # Remove the files generated by this Makefile: every *.ts transition set
    # (including splitwords.ts), ambiguities.txt, the FPLM file and the
    # train+dev concatenation, which can be left behind by an interrupted
    # build. $(RM) is `rm -f`, which already ignores missing files, so no "-"
    # prefix is needed. Declared phony so a file named "clean" cannot mask it.
    .PHONY: clean
    clean:
    	$(RM) *.ts ambiguities.txt $(FPLM_FILENAME) all_no_test.conllu