From 9c65f41399dd8cb6b0c42d09c2bbcd6978d738d7 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Fri, 21 Apr 2017 11:10:08 +0200
Subject: [PATCH] added conll_keep_most_frequent_morpho_tags.pl

---
 tools/conll_keep_most_frequent_morpho_tags.pl | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100755 tools/conll_keep_most_frequent_morpho_tags.pl

diff --git a/tools/conll_keep_most_frequent_morpho_tags.pl b/tools/conll_keep_most_frequent_morpho_tags.pl
new file mode 100755
index 0000000..b485368
--- /dev/null
+++ b/tools/conll_keep_most_frequent_morpho_tags.pl
@@ -0,0 +1,67 @@
+#!/usr/bin/perl
+#
+# Usage: conll_keep_most_frequent_morpho_tags.pl CONLL_FILE [THRESHOLD]
+#
+# Reads the tab-separated file CONLL_FILE twice. The first pass counts how
+# often each value of the 6th column (the morpho tag) occurs. The second
+# pass prints every token line to STDOUT with its morpho tag replaced by
+# "_" when that tag occurs fewer than THRESHOLD times in the file.
+#
+# Only the first eight columns of a token line are kept; the two columns
+# appended after them are always "_". Blank lines are printed as blank
+# lines and lines starting with "#" are not copied to the output.
+# THRESHOLD defaults to 0, which keeps every tag.
+
+use strict;
+use warnings;
+
+my $conll     = shift;
+my $threshold = shift;
+defined $conll
+    or die "Usage: $0 CONLL_FILE [THRESHOLD]\n";
+$threshold = 0 unless defined $threshold;
+
+# Returns exactly eight columns for one input line. The line terminator is
+# removed with a substitution rather than chop so that a final line without
+# a newline does not lose its last character; missing columns become "".
+sub token_columns {
+    my ($line) = @_;
+    $line =~ s/\r?\n\z//;
+    my @columns = split /\t/, $line;
+    push @columns, '' while @columns < 8;
+    return @columns[0 .. 7];
+}
+
+# True when the line carries a token, i.e. is neither blank nor a comment.
+sub is_token_line {
+    my ($line) = @_;
+    return $line !~ /^#/ && $line !~ /^\r?$/;
+}
+
+# First pass: count the occurrences of every morpho tag.
+my %morpho_count;
+open(my $count_fh, "<", $conll)
+    or die "Can't open < $conll: $!";
+while (my $line = <$count_fh>) {
+    next unless is_token_line($line);
+    my $morpho = (token_columns($line))[5];
+    $morpho_count{$morpho}++;
+}
+close($count_fh);
+
+# Second pass: print the file with the rare morpho tags replaced by "_".
+open(my $filter_fh, "<", $conll)
+    or die "Can't open < $conll: $!";
+while (my $line = <$filter_fh>) {
+    if ($line =~ /^\r?$/) {
+        print "\n";    # sentence separator
+        next;
+    }
+    next if $line =~ /^#/;    # comment lines are not copied to the output
+    my @columns = token_columns($line);
+    if (($morpho_count{ $columns[5] } || 0) < $threshold) {
+        $columns[5] = "_";
+    }
+    print join("\t", @columns, "_", "_"), "\n";
+}
+close($filter_fh);
-- 
GitLab