#!/usr/bin/perl
#
# conll_keep_most_frequent_morpho_tags.pl
#
# Usage: conll_keep_most_frequent_morpho_tags.pl CONLL_FILE THRESHOLD
#
# Reads the tab-separated file CONLL_FILE twice:
#   1. the first pass counts how often each value of the 6th column
#      (the morphological tag) occurs;
#   2. the second pass prints every token line to STDOUT, replacing the
#      morphological tag with "_" when it occurs fewer than THRESHOLD times.
#
# Output lines contain the first eight input columns (index, form, lemma,
# cpos, pos, morpho, gov, label) followed by two "_" columns.  Blank lines
# are copied through as blank lines.  Lines starting with "#" are ignored in
# both passes and are NOT copied to the output.

use strict;
use warnings;

use Scalar::Util qw(looks_like_number);

# Zero-based position of the morphological tag among the columns.
use constant MORPHO_COLUMN => 5;

# Number of input columns that are copied to the output.
use constant KEPT_COLUMNS => 8;

my ($conll, $threshold) = @ARGV;

die "Usage: $0 CONLL_FILE THRESHOLD\n"
    unless defined $conll
    && defined $threshold
    && looks_like_number($threshold);

my $count_of = count_morpho_tags($conll);
print_filtered($conll, $count_of, $threshold);

# Split one terminator-free line into exactly KEPT_COLUMNS fields.
# Missing columns become empty strings so they print as empty fields
# without triggering "uninitialized" warnings.
sub split_columns {
    my ($line) = @_;

    my @fields = split /\t/, $line;
    $#fields = KEPT_COLUMNS - 1;    # truncate or pad to the kept width

    return map { defined $_ ? $_ : '' } @fields;
}

# First pass: return a hash ref mapping each morphological tag found in
# $path to its number of occurrences.  Blank and comment lines are skipped.
sub count_morpho_tags {
    my ($path) = @_;

    open(my $in, "<", $path)
        or die "Can't open < $path: $!";

    my %count_of;
    while (my $line = <$in>) {
        # Strip LF or CRLF; unlike chop this never eats a data character
        # when the last line of the file has no terminator.
        $line =~ s/\r?\n\z//;
        next if $line eq '' || $line =~ /^#/;

        my @fields = split_columns($line);
        $count_of{ $fields[MORPHO_COLUMN] }++;
    }

    close($in);
    return \%count_of;
}

# Second pass: print the filtered version of $path to STDOUT, using the
# tag counts in $count_of and the minimum frequency $min_count.
sub print_filtered {
    my ($path, $count_of, $min_count) = @_;

    open(my $in, "<", $path)
        or die "Can't open < $path: $!";

    while (my $line = <$in>) {
        $line =~ s/\r?\n\z//;

        if ($line eq '') {
            print "\n";
            next;
        }

        # Comment lines are dropped from the output.
        next if $line =~ /^#/;

        my @fields = split_columns($line);
        my $seen = $count_of->{ $fields[MORPHO_COLUMN] } || 0;
        $fields[MORPHO_COLUMN] = "_" if $seen < $min_count;

        print join("\t", @fields, "_", "_"), "\n";
    }

    close($in);
    return;
}