Skip to content
Snippets Groups Projects
Commit c8ed2c54 authored by Franck Dary
Browse files

Improved lemmatizer. Changed the generation of fplm. Not using any info of...

Improved lemmatizer. Changed the generation of fplm. No information from the test dataset is used during training.
parent 0a856c60
No related branches found
No related tags found
No related merge requests found
......@@ -12,29 +12,30 @@ FP_FILENAME=fP
RULES_FILENAME=lemmatizer_rules.as
EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
all: tokenizer.as texts all.conllu columns $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME)
all: tokenizer.as texts all_no_test.conllu columns $(FPLM_FILENAME) $(FP_FILENAME) $(RULES_FILENAME)
rm col_*\.txt
rm all.conllu
rm all_no_test.conllu
all.conllu:
cat $(UD_ROOT)*/*\.conllu > $@
all_no_test.conllu:
cat $(UD_ROOT)*/*train*\.conllu > $@
cat $(UD_ROOT)*/*dev*\.conllu >> $@
tokenizer.as: all.conllu $(MCD)
tokenizer.as: all_no_test.conllu $(MCD)
echo "Default : IGNORECHAR" > $@
$(TOOLS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
echo "ENDWORD" >> $@
echo "ADDCHARTOWORD" >> $@
columns: all.conllu $(MCD)
columns: all_no_test.conllu $(MCD)
for number in 1 2 3 4 5 6 7 8 9 10 ; do \
cat all.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \
cat all_no_test.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \
done
./getActionSets.py $(MCD) col_*\.txt
texts:
./getRawText.py $(CONLL2TXT) $(UD_ROOT)*/*\.conllu
$(FPLM_FILENAME): all.conllu $(MCD)
$(FPLM_FILENAME): all_no_test.conllu $(MCD)
$(TOOLS)/conllu2fplm.py $< $(MCD) > $@
$(FP_FILENAME): $(FPLM_FILENAME)
......@@ -44,6 +45,7 @@ $(RULES_FILENAME): $(FPLM_FILENAME)
macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt $(STRICT) -t $(THRESHOLD)
cat tmp.txt | sed s/^/RULE\ LEMMA\ ON\ FORM\ /g | sed s/RULE\ LEMMA\ ON\ FORM\ @@$$/Default\ :\ \ RULE\ LEMMA\ ON\ FORM\ @@/g > $@
rm tmp.txt
echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.as
clean:
- rm *\.txt
......
......@@ -4,3 +4,8 @@ Lemmatizer_Rules_form 30 Embeddings
Lemmatizer_Rules_letters 10 Embeddings
Lemmatizer_Rules_pos 30 Embeddings
Lemmatizer_Rules_morpho 30 Embeddings
#############################################
Lemmatizer_Case_form 30 Embeddings
Lemmatizer_Case_letters 10 Embeddings
Lemmatizer_Case_pos 30 Embeddings
Lemmatizer_Case_morpho 30 Embeddings
Name : Lemmatizer_Case
Type : Prediction
Oracle : lemma_case
Feature Model : lemmatizer_case.fm
Action Set : data/lemmatizer_case.as
Topology : (100,RELU,0.1)
Dynamic : yes
# Features classiques
b.0#POS
b.0#XPOS
b.0#MORPHO
b.0#FORM
# Suffixes
b.0#FORM.PART.-4.-4
b.0#FORM.PART.-3.-3
b.0#FORM.PART.-2.-2
b.0#FORM.PART.-1.-1
b.0#FORM.PART.0.0
b.0#FORM.PART.1.1
b.0#FORM.PART.2.2
b.0#FORM.PART.3.3
......@@ -3,13 +3,17 @@ Dicts : lemmatizer.dicts
%CLASSIFIERS
lemmatizer_lookup lemmatizer_lookup.cla
lemmatizer_rules lemmatizer_rules.cla
lemmatizer_case lemmatizer_case.cla
strategy strategy.cla
%STATES
strategy strategy
lemmatizer_lookup lemmatizer_lookup
lemmatizer_rules lemmatizer_rules
lemmatizer_case lemmatizer_case
%TRANSITIONS
lemmatizer_lookup strategy *
lemmatizer_rules strategy *
lemmatizer_case strategy *
strategy lemmatizer_lookup MOVE lemmatizer_lookup
strategy lemmatizer_rules MOVE lemmatizer_rules
strategy lemmatizer_case MOVE lemmatizer_case
......@@ -20,6 +20,11 @@ def readMCD(mcdFilename) :
return mcd
def sameLineWithoutLemma(l1, l2) :
    """Tell whether two tab-separated entries match once two fields are ignored.

    Each entry is split on tab characters. The third-from-last and
    second-from-last fields are left out of the comparison; every field
    before them, plus the final field, must be equal in both entries.
    NOTE(review): the name suggests the lemma is one of the two ignored
    fields -- confirm against the column order used to build the entries.

    l1, l2 -- the two entry strings to compare.
    Returns True when the compared parts are identical, False otherwise.
    """
    def comparableParts(entry) :
        # Keep the leading fields and the last one; skip the two in between.
        fields = entry.split('\t')
        return (fields[:-3], fields[-1])

    return comparableParts(l1) == comparableParts(l2)
if __name__ == "__main__" :
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
......@@ -30,7 +35,7 @@ if __name__ == "__main__" :
conllMCD = readMCD(sys.argv[2])
conllMCDr = {v: k for k, v in conllMCD.items()}
entriesSet = set()
entriesCount = {}
entriesList = []
for line in open(sys.argv[1], "r", encoding="utf8") :
......@@ -48,11 +53,25 @@ if __name__ == "__main__" :
entry = entry + columns[int(conllMCDr[col])] + '\t'
entry = entry[:-1]
if entry not in entriesSet :
entriesSet.add(entry)
if entry not in entriesCount :
entriesCount[entry] = 1
else :
entriesCount[entry] = 1+entriesCount[entry]
for entry in entriesCount :
entriesList.append(entry)
entriesList.sort()
for entry in entriesList :
print(entry)
i = 0
while i < len(entriesList) :
maxCount = 0
maxIndex = 0
j = i
while j < len(entriesList) and sameLineWithoutLemma(entriesList[i], entriesList[j]) :
if entriesCount[entriesList[j]] > maxCount :
maxCount = entriesCount[entriesList[j]]
maxIndex = j
j = j+1
print("%s"%(entriesList[maxIndex]))
i = j
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment