include ../config

SCRIPTS=../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
MCD=conllu.mcd
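
# Gather all train/dev/test CoNLL-U files from the treebank tree rooted at $(UD_ROOT)
# (presumably set in ../config).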
TRAIN_FILES=$(shell find $(UD_ROOT) -type f -name '*train*.conllu')
DEV_FILES=$(shell find $(UD_ROOT) -type f -name '*dev*.conllu')
TEST_FILES=$(shell find $(UD_ROOT) -type f -name '*test*.conllu')

# This part computes the lemmatizer rules and exceptions
THRESHOLD=10
FPLM_FILENAME=fplm
RULES_FILENAME=lemmatizer_rules.ts
EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
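
# Default target: build the tokenizer and segmenter transition sets, the raw texts,
# the per-column value lists and the lemmatizer resources, then drop the intermediate
# column files and training corpus.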
all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME)
	rm col_*.txt
	rm all_no_test.conllu
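
# Training corpus: concatenation of every *train*.conllu file (dev and test are left out).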
all_no_test.conllu:
	cat $(TRAIN_FILES) > $@
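
# Tokenizer transition set: fixed actions plus whatever conllu2splits.py derives from
# the corpus (presumably multi-word token splits); its stderr is kept in ambiguities.txt
# and every line is prefixed with the <tokenizer> state.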
tokenizer.ts: all_no_test.conllu $(MCD)
	echo "ENDWORD" > $@
	echo "IGNORECHAR" >> $@
	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
	echo "ADDCHARTOWORD" >> $@
	sed -i -e 's/^/<tokenizer> /' $@
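
# Minimal transition set for the sentence segmenter.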
segmenter.ts:
	echo "EOS b.0" > $@
	echo "REWRITE b.0 EOS _" >> $@
columns: all_no_test.conllu $(MCD)
	for number in 1 2 3 4 5 6 7 8 9 10 ; do \
		cat all_no_test.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \
	done
	./getTransitionSets.py $(MCD) col_*.txt
	cat tagger.ts parser.ts > taggerparser.ts
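
# Regenerate raw text versions of all treebank files via the conllu_to_text.pl script.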
texts:
	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
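
# fplm lexicon extracted from the training corpus (input to the lemmatizer rules below).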
$(FPLM_FILENAME): all_no_test.conllu $(MCD)
	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
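
# Compute the lemmatizer exceptions (and rules) with macaon_compute_l_rules;
# -t $(THRESHOLD) is presumably a frequency cut-off, and the tmp.txt output is
# discarded. Also writes a small case-handling transition set (lemmatizer_case.ts).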
$(RULES_FILENAME): $(FPLM_FILENAME)
	macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD)
	rm tmp.txt
	echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts
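
# Remove every generated file; the leading '-' lets make ignore missing files.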
clean:
	- rm *.txt
	- rm *.conll*
	- rm *.ts
	- rm $(RULES_FILENAME)
	- rm $(EXCEPTIONS_FPLM_FILENAME)
	- rm $(FPLM_FILENAME)