diff --git a/en/data/morpho-lexicon/add_to_fplm.txt b/en/data/morpho-lexicon/add_to_fplm.txt index b25eb3a2ab5680fcd5c1d7fb81b52d255cd823e4..323817769461b03457aefa8b2b3e0a44f058e3f6 100644 --- a/en/data/morpho-lexicon/add_to_fplm.txt +++ b/en/data/morpho-lexicon/add_to_fplm.txt @@ -1,305 +1,310 @@ n't RB not ##### -according to IN $lemma ##### +in the middle of IN in the middle of ##### +in the middle of JJ in the middle of ##### +in the middle of NN in the middle of ##### +in the middle of RB in the middle of ##### +in the middle of RP in the middle of ##### +according to IN according to ##### according to JJ according to ##### according to NN according to ##### according to RB according to ##### according to RP according to ##### -ahead of IN $lemma ##### +ahead of IN ahead of ##### ahead of JJ ahead of ##### ahead of NN ahead of ##### ahead of RB ahead of ##### ahead of RP ahead of ##### -along with IN $lemma ##### +along with IN along with ##### along with JJ along with ##### along with NN along with ##### along with RB along with ##### along with RP along with ##### -apart from IN $lemma ##### +apart from IN apart from ##### apart from JJ apart from ##### apart from NN apart from ##### apart from RB apart from ##### apart from RP apart from ##### -as for IN $lemma ##### +as for IN as for ##### as for JJ as for ##### as for NN as for ##### as for RB as for ##### as for RP as for ##### -as well as IN $lemma ##### +as well as IN as well as ##### as well as JJ as well as ##### as well as NN as well as ##### as well as RB as well as ##### as well as RP as well as ##### -aside from IN $lemma ##### +aside from IN aside from ##### aside from JJ aside from ##### aside from NN aside from ##### aside from RB aside from ##### aside from RP aside from ##### -at the bottom of IN $lemma ##### +at the bottom of IN at the bottom of ##### at the bottom of JJ at the bottom of ##### at the bottom of NN at the bottom of ##### at the bottom of RB at the bottom of ##### at the bottom of RP at the bottom of ##### -at the top of IN $lemma ##### +at the top of IN at the top of ##### at the top of JJ at the top of ##### at the top of NN at the top of ##### at the top of RB at the top of ##### at the top of RP at the top of ##### -away from IN $lemma ##### +away from IN away from ##### away from JJ away from ##### away from NN away from ##### away from RB away from ##### away from RP away from ##### -because of IN $lemma ##### +because of IN because of ##### because of JJ because of ##### because of NN because of ##### because of RB because of ##### because of RP because of ##### -but for IN $lemma ##### +but for IN but for ##### but for JJ but for ##### but for NN but for ##### but for RB but for ##### but for RP but for ##### -by means of IN $lemma ##### +by means of IN by means of ##### by means of JJ by means of ##### by means of NN by means of ##### by means of RB by means of ##### by means of RP by means of ##### -by virtue of IN $lemma ##### +by virtue of IN by virtue of ##### by virtue of JJ by virtue of ##### by virtue of NN by virtue of ##### by virtue of RB by virtue of ##### by virtue of RP by virtue of ##### -by way of IN $lemma ##### +by way of IN by way of ##### by way of JJ by way of ##### by way of NN by way of ##### by way of RB by way of ##### by way of RP by way of ##### -close to IN $lemma ##### +close to IN close to ##### close to JJ close to ##### close to NN close to ##### close to RB close to ##### close to RP close to ##### -contrary to IN $lemma ##### +contrary to IN contrary to ##### contrary to JJ contrary to ##### contrary to NN contrary to ##### contrary to RB contrary to ##### contrary to RP contrary to ##### -due to IN $lemma ##### +due to IN due to ##### due to JJ due to ##### due to NN due to ##### due to RB due to ##### due to RP due to ##### -except for IN $lemma ##### +except for IN except for ##### except for JJ except for ##### except for NN except for ##### except for RB except for ##### except for RP except for ##### -far from IN $lemma ##### +far from IN far from ##### far from JJ far from ##### far from NN far from ##### far from RB far from ##### far from RP far from ##### -for lack of IN $lemma ##### +for lack of IN for lack of ##### for lack of JJ for lack of ##### for lack of NN for lack of ##### for lack of RB for lack of ##### for lack of RP for lack of ##### -in accordance with IN $lemma ##### +in accordance with IN in accordance with ##### in accordance with JJ in accordance with ##### in accordance with NN in accordance with ##### in accordance with RB in accordance with ##### in accordance with RP in accordance with ##### -in addition to IN $lemma ##### +in addition to IN in addition to ##### in addition to JJ in addition to ##### in addition to NN in addition to ##### in addition to RB in addition to ##### in addition to RP in addition to ##### -in back of IN $lemma ##### +in back of IN in back of ##### in back of JJ in back of ##### in back of NN in back of ##### in back of RB in back of ##### in back of RP in back of ##### -in between IN $lemma ##### +in between IN in between ##### in between JJ in between ##### in between NN in between ##### in between RB in between ##### in between RP in between ##### -in the case of IN $lemma ##### +in the case of IN in the case of ##### in the case of JJ in the case of ##### in the case of NN in the case of ##### in the case of RB in the case of ##### in the case of RP in the case of ##### -in case of IN $lemma ##### +in case of IN in case of ##### in case of JJ in case of ##### in case of NN in case of ##### in case of RB in case of ##### in case of RP in case of ##### -in charge of IN $lemma ##### +in charge of IN in charge of ##### in charge of JJ in charge of ##### in charge of NN in charge of ##### in charge of RB in charge of ##### in charge of RP in charge of ##### -in exchange for IN $lemma ##### +in exchange for IN in exchange for ##### in exchange for JJ in exchange for ##### in exchange for NN in exchange for ##### in exchange for RB in exchange for ##### in exchange for RP in exchange for ##### -in front of IN $lemma ##### +in front of IN in front of ##### in front of JJ in front of ##### in front of NN in front of ##### in front of RB in front of ##### in front of RP in front of ##### -in light of IN $lemma ##### +in light of IN in light of ##### in light of JJ in light of ##### in light of NN in light of ##### in light of RB in light of ##### in light of RP in light of ##### -in line with IN $lemma ##### +in line with IN in line with ##### in line with JJ in line with ##### in line with NN in line with ##### in line with RB in line with ##### in line with RP in line with ##### -in place of IN $lemma ##### +in place of IN in place of ##### in place of JJ in place of ##### in place of NN in place of ##### in place of RB in place of ##### in place of RP in place of ##### -in process of IN $lemma ##### +in process of IN in process of ##### in process of JJ in process of ##### in process of NN in process of ##### in process of RB in process of ##### in process of RP in process of ##### -in the process of IN $lemma ##### +in the process of IN in the process of ##### in the process of JJ in the process of ##### in the process of NN in the process of ##### in the process of RB in the process of ##### in the process of RP in the process of ##### -in regard to IN $lemma ##### +in regard to IN in regard to ##### in regard to JJ in regard to ##### in regard to NN in regard to ##### in regard to RB in regard to ##### in regard to RP in regard to ##### -inside of IN $lemma ##### +inside of IN inside of ##### inside of JJ inside of ##### inside of NN inside of ##### inside of RB inside of ##### inside of RP inside of ##### -in spite of IN $lemma ##### +in spite of IN in spite of ##### in spite of JJ in spite of ##### in spite of NN in spite of ##### in spite of RB in spite of ##### in spite of RP in spite of ##### -instead of IN $lemma ##### +instead of IN instead of ##### instead of JJ instead of ##### instead of NN instead of ##### instead of RB instead of ##### instead of RP instead of ##### -in view of IN $lemma ##### +in view of IN in view of ##### in view of JJ in view of ##### in view of NN in view of ##### in view of RB in view of ##### in view of RP in view of ##### -near to IN $lemma ##### +near to IN near to ##### near to JJ near to ##### near to NN near to ##### near to RB near to ##### near to RP near to ##### -next to IN $lemma ##### +next to IN next to ##### next to JJ next to ##### next to NN next to ##### next to RB next to ##### next to RP next to ##### -on account of IN $lemma ##### +on account of IN on account of ##### on account of JJ on account of ##### on account of NN on account of ##### on account of RB on account of ##### on account of RP on account of ##### -on behalf of IN $lemma ##### +on behalf of IN on behalf of ##### on behalf of JJ on behalf of ##### on behalf of NN on behalf of ##### on behalf of RB on behalf of ##### on behalf of RP on behalf of ##### -on top of IN $lemma ##### +on top of IN on top of ##### on top of JJ on top of ##### on top of NN on top of ##### on top of RB on top of ##### on top of RP on top of ##### -on the top of IN $lemma ##### +on the top of IN on the top of ##### on the top of JJ on the top of ##### on the top of NN on the top of ##### on the top of RB on the top of ##### on the top of RP on the top of ##### -on the bottom of IN $lemma ##### +on the bottom of IN on the bottom of ##### on the bottom of JJ on the bottom of ##### on the bottom of NN on the bottom of ##### on the bottom of RB on the bottom of ##### on the bottom of RP on the bottom of ##### -out of IN $lemma ##### +out of IN out of ##### out of JJ out of ##### out of NN out of ##### out of RB out of ##### out of RP out of ##### -outside of IN $lemma ##### +outside of IN outside of ##### outside of JJ outside of ##### outside of NN outside of ##### outside of RB outside of ##### outside of RP outside of ##### -owing to IN $lemma ##### +owing to IN owing to ##### owing to JJ owing to ##### owing to NN owing to ##### owing to RB owing to ##### owing to RP owing to ##### -prior to IN $lemma ##### +prior to IN prior to ##### prior to JJ prior to ##### prior to NN prior to ##### prior to RB prior to ##### prior to RP prior to ##### -subsequent to IN $lemma ##### +subsequent to IN subsequent to ##### subsequent to JJ subsequent to ##### subsequent to NN subsequent to ##### subsequent to RB subsequent to ##### subsequent to RP subsequent to ##### -such as IN $lemma ##### +such as IN such as ##### such as JJ such as ##### such as NN such as ##### such as RB such as ##### such as RP such as ##### -thanks to IN $lemma ##### +thanks to IN thanks to ##### thanks to JJ thanks to ##### thanks to NN thanks to ##### thanks to RB thanks to ##### thanks to RP thanks to ##### -to the right of IN $lemma ##### +to the right of IN to the right of ##### to the right of JJ to the right of ##### to the right of NN to the right of ##### to the right of RB to the right of ##### to the right of RP to the right of ##### -to the left of IN $lemma ##### +to the left of IN to the left of ##### to the left of JJ to the right of ##### to the left of NN to the right of ##### to the left of RB to the right of ##### to the left of RP to the right of ##### -together with IN $lemma ##### +together with IN together with ##### together with JJ together with ##### together with NN together with ##### together with RB together with ##### together with RP together with ##### -up against IN $lemma ##### +up against IN up against ##### up against JJ up against ##### up against NN up against ##### up against RB up against ##### up against RP up against ##### -up to IN $lemma ##### +up to IN up to ##### up to JJ up to ##### up to NN up to ##### up to RB up to ##### up to RP up to ##### -up until IN $lemma ##### +up until IN up until ##### up until JJ up until ##### up until NN up until ##### up until RB up until ##### up until RP up until ##### -with respect to IN $lemma ##### +with respect to IN with respect to ##### with respect to JJ with respect to ##### with respect to NN with respect to ##### with respect to RB with respect to ##### diff --git a/tools/Makefile b/tools/Makefile index b917c29d9dd7e068ea307a15dcb90c08250e92ff..76cb3e8f5e1db5619c809f00dc135ee7330d163e 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -9,7 +9,7 @@ LIBS = -lm OBJ = conll_lib.o orfeo.o util.o hash_str.o ftb_lib.o -ALL= ftb_tokenize_point ftb_remove_punct ftb2datcha decoda2orfeo ftb2orfeo process_decoda_tsv conll07_renumber_tokens ftb2fr_stream conll2mcf +ALL= ftb_tokenize_point ftb_remove_punct ftb2datcha decoda2orfeo ftb2orfeo process_decoda_tsv conll07_renumber_tokens ftb2fr_stream conll2mcf conllu2mcf all: $(ALL) @@ -17,6 +17,9 @@ all: $(ALL) conll2mcf : conll2mcf.c $(OBJ) $(CC) $(CFLAGS) -o $@ $< $(OBJ) $(LIBS) +conllu2mcf : conllu2mcf.c $(OBJ) + $(CC) $(CFLAGS) -o $@ $< $(OBJ) $(LIBS) + ftb2fr_stream : ftb2fr_stream.c $(OBJ) $(CC) $(CFLAGS) -o $@ $< $(OBJ) $(LIBS) @@ -61,3 +64,4 @@ clean: - rm -f process_decoda_tsv - rm -f conll07_renumber_tokens - rm -f conll2mcf + - rm -f conllu2mcf diff --git a/tools/conll2fplm.pl b/tools/conll2fplm.pl index ff6f76be03efa82aecc91934fbf0ff8617d2e714..545886a2e5060e57284631ad9fe4a3dd33290391 100755 --- a/tools/conll2fplm.pl +++ b/tools/conll2fplm.pl @@ -25,10 +25,11 @@ while($arg){ #print "use_coarse_pos = $use_coarse_pos conll = $conll\n"; -#open(my $CONLL, "<", $conll) -# or die "Can't open < $conll: $!"; +open(my $CONLL, "<", $conll) + or die "Can't open < $conll: $!"; -while(<>){ +while(<$CONLL>){ + if(!/^#/){ ($index, $form, $lemma, $cpos, $pos, $morpho, $gov, $label) = split; if($use_coarse_pos){ $fplm = $form . "\t" . $cpos . "\t" . $lemma . "\t" . $morpho; @@ -37,6 +38,7 @@ while(<>){ $fplm = $form . "\t" . $pos . "\t" . $lemma . "\t" . $morpho; } $hash{$fplm} = 1; + } } close($CONLL); diff --git a/tools/conll_lib.c b/tools/conll_lib.c index fd05b6cb91b093b60877635a9f42e1d245a71b3e..141b5ed73e3db61185d3910e19e2fa5438f5ecb8 100644 --- a/tools/conll_lib.c +++ b/tools/conll_lib.c @@ -151,6 +151,26 @@ int parse_line(FILE *f, sentence *s) return 0; } + /* specific to conll_u */ + + /* ignore comments */ + if(buff[0] == '#'){ + return 0; + } + + { + /* ignore amalgams */ + int i; + for(i=0; (buff[i] != '\t') && (i < MAX_LINE_LENGTH); i++) + if(buff[i] == '-') return 0; + + } + + + + /* end of specific to conll_u */ + + s->words[s->l] = w = malloc(sizeof(word)); w->daughters_nb = 0; s->l++;