# Build tokenizer/segmenter/lemmatizer transition sets and raw texts from a
# CoNLL-U corpus. $(CORPUS) is expected to come from ../config.
include ../config

.DELETE_ON_ERROR:          # don't leave half-written targets looking up to date

SCRIPTS := ../../../../scripts
CONLL2TXT := $(SCRIPTS)/conllu_to_text.pl
MCD := conllu.mcd

# ':=' so `find` runs once at parse time, not on every expansion.
TRAIN_FILES := $(shell find $(CORPUS) -type f -name '*train*.conllu')
DEV_FILES := $(shell find $(CORPUS) -type f -name '*dev*.conllu')
TEST_FILES := $(shell find $(CORPUS) -type f -name '*test*.conllu')

# This part is for lemmatizer rules and exceptions computation.
THRESHOLD := 10
FPLM_FILENAME := fplm
RULES_FILENAME := lemmatizer_rules.ts
EXCEPTIONS_FPLM_FILENAME := maca_trans_lemmatizer_exceptions.fplm

.PHONY: all columns texts clean

# Build everything, then drop the intermediate column files and the merged
# corpus (they are only needed while the other targets are being produced).
all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME)
	rm -f col_*.txt
	rm -f all_no_test.conllu

# Concatenation of train+dev (test is deliberately excluded).
all_no_test.conllu:
	cat $(TRAIN_FILES) $(DEV_FILES) > $@

# Tokenizer transition set: fixed action inventory plus corpus-derived SPLIT
# rules (written to splitwords.ts; ambiguous splits logged to ambiguities.txt).
tokenizer.ts: all_no_test.conllu $(MCD)
	echo "ENDWORD" > $@
	$(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt
	echo "SPLIT 0" >> $@
	echo "SPLIT 1" >> $@
	echo "SPLIT 2" >> $@
	echo "SPLIT 3" >> $@
	echo "SPLIT 4" >> $@
	echo "SPLIT 5" >> $@
	echo "SPLIT 6" >> $@
	echo "SPLIT 7" >> $@
	echo "ADDCHARTOWORD" >> $@
	echo "IGNORECHAR" >> $@
	sed -i -e 's/^/<tokenizer> /' $@
	sed -i -e 's/^/<tokenizer> /' splitwords.ts

# Minimal segmenter transition set (two fixed actions).
segmenter.ts:
	echo "EOS b.0" > $@
	echo "REWRITE b.0 EOS _" >> $@

# Extract the unique values of each of the 10 CoNLL-U columns (comments
# stripped), then derive tagger/parser transition sets from them.
columns: all_no_test.conllu $(MCD)
	for number in 1 2 3 4 5 6 7 8 9 10 ; do \
		sed '/^#/ d' all_no_test.conllu | cut -f$$number | sort --unique > col_$$number.txt ; \
	done
	./getTransitionSets.py $(MCD) col_*.txt
	cat tagger.ts parser.ts > taggerparser.ts

# Regenerate raw text versions of every corpus file.
texts:
	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)

# form/POS/lemma triples used by the lemmatizer-rule extractor.
$(FPLM_FILENAME): all_no_test.conllu $(MCD)
	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@

# Lemmatizer rules + exception list; THRESHOLD controls rule pruning.
# printf instead of `echo -e`: under a POSIX /bin/sh (e.g. dash) `echo -e`
# would write a literal "-e" into lemmatizer_case.ts.
$(RULES_FILENAME): $(FPLM_FILENAME)
	macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD)
	rm -f tmp.txt
	printf 'Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA\n' > lemmatizer_case.ts

# Remove every generated file ($(RM) = rm -f, so missing files never fail).
clean:
	$(RM) *.txt
	$(RM) *.conll*
	$(RM) *.ts
	$(RM) $(RULES_FILENAME)
	$(RM) $(EXCEPTIONS_FPLM_FILENAME)
	$(RM) $(FPLM_FILENAME)