#!/usr/bin/perl
#
# conll_keep_most_frequent_morpho_tags.pl
#
# Provenance: added as tools/conll_keep_most_frequent_morpho_tags.pl in commit
# 9c65f41399dd8cb6b0c42d09c2bbcd6978d738d7 by Alexis Nasr
# <alexis.nasr@lif.univ-mrs.fr>, Fri, 21 Apr 2017 11:10:08 +0200
# ("added conll_keep_most_frequent_morpho_tags.pl").
#
# Usage: conll_keep_most_frequent_morpho_tags.pl CONLL_FILE [THRESHOLD]
#
# Reads a tab-separated CoNLL file twice:
#   1. counts how many token lines carry each value of the 6th column
#      (the morphological tag);
#   2. re-prints the file on standard output, replacing the morphological tag
#      by "_" whenever its count is strictly below THRESHOLD.
#
# Output format: the first eight input columns (index, form, lemma, cpos, pos,
# morpho, gov, label) followed by two "_" columns.  Blank lines are kept as
# blank lines; lines starting with "#" are not printed.  When THRESHOLD is
# omitted it behaves as 0, i.e. no tag is replaced.

use strict;
use warnings;

use Scalar::Util qw(looks_like_number);

# Number of leading columns read from every token line.
my $NB_FIELDS = 8;

# Zero-based position of the morphological tag among those columns.
my $MORPHO_COLUMN = 5;

# Splits one already-chomped token line on tabs and returns exactly
# $NB_FIELDS values.  Missing columns come back as empty strings, which is
# what the output line shows for them anyway, and avoids "uninitialized"
# warnings on short lines.
sub split_token_line {
    my ($line) = @_;

    my @fields = split /\t/, $line;
    push @fields, '' while @fields < $NB_FIELDS;

    return @fields[ 0 .. $NB_FIELDS - 1 ];
}

# First pass: returns a reference to a hash mapping each morphological tag to
# the number of token lines of the file $path that carry it.  Blank lines and
# lines starting with "#" are not counted.
# Dies if $path cannot be opened.
sub count_morpho_tags {
    my ($path) = @_;

    open( my $in, '<', $path )
        or die "Can't open < $path: $!";

    my %count_of;
    while ( my $line = <$in> ) {

        # Strip the line ending here exactly as the second pass does, so the
        # tag used as counting key is the same string that is looked up later
        # (matters when the tag is the last column of the line).
        chomp $line;
        next if $line eq '' || $line =~ /^#/;

        my @fields = split_token_line($line);
        $count_of{ $fields[$MORPHO_COLUMN] }++;
    }
    close($in);

    return \%count_of;
}

# Second pass: prints the file $path to the filehandle $out, replacing by "_"
# every morphological tag whose count in %$count_of is below $threshold.
# Dies if $path cannot be opened.
sub print_filtered_conll {
    my ( $path, $count_of, $threshold, $out ) = @_;

    open( my $in, '<', $path )
        or die "Can't open < $path: $!";

    while ( my $line = <$in> ) {

        # chomp, not chop: a last line without trailing newline must not lose
        # the final character of its last field.
        chomp $line;

        if ( $line eq '' ) {
            print {$out} "\n";
            next;
        }

        # Comment lines are dropped from the output.
        next if $line =~ /^#/;

        my @fields = split_token_line($line);
        my $count  = $count_of->{ $fields[$MORPHO_COLUMN] } || 0;
        $fields[$MORPHO_COLUMN] = '_' if $count < $threshold;

        print {$out} join( "\t", @fields, '_', '_' ), "\n";
    }
    close($in);

    return;
}

# Entry point: validates the command-line arguments and runs both passes.
sub main {
    my ( $conll, $threshold ) = @_;

    my $usage = "Usage: $0 CONLL_FILE [THRESHOLD]\n";

    # Without this check, a three-argument open on an undefined file name
    # opens an anonymous temporary file and the script silently prints
    # nothing.
    die $usage unless defined $conll;

    # An omitted threshold compares as 0: no tag is ever replaced.
    $threshold = 0 unless defined $threshold;
    die "THRESHOLD must be a number, got '$threshold'\n$usage"
        unless looks_like_number($threshold);

    my $count_of = count_morpho_tags($conll);
    print_filtered_conll( $conll, $count_of, $threshold, \*STDOUT );

    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main(@ARGV) unless caller;

1;