From f4bf37cf9c4ff0968ec324f7e49d22f08f5c8475 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Fri, 10 Sep 2021 14:39:31 +0200
Subject: [PATCH] =?UTF-8?q?Passe=20par=20conllu=20et=20supprim=C3=A9=20phr?=
 =?UTF-8?q?ases=20avec=20plusieurs=20racines?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fr_orpheo/data/Makefile                  | 52 ++++----------------
 fr_orpheo/data/add_time_and_speaker.perl | 61 -----------------------
 fr_orpheo/data/fplm_add                  | 26 ----------
 fr_orpheo/data/mcf.mcd                   | 14 ------
 fr_orpheo/data/prepareOrfeoData.py       | 11 ++++-
 fr_orpheo/data/purge_sfplm.pl            | 62 ------------------------
 fr_orpheo/data/rmBlankLines.py           | 16 ++++++
 fr_orpheo/data/wpmlgfs.mcd               | 13 -----
 8 files changed, 34 insertions(+), 221 deletions(-)
 delete mode 100755 fr_orpheo/data/add_time_and_speaker.perl
 delete mode 100644 fr_orpheo/data/fplm_add
 delete mode 100644 fr_orpheo/data/mcf.mcd
 delete mode 100755 fr_orpheo/data/purge_sfplm.pl
 create mode 100755 fr_orpheo/data/rmBlankLines.py
 delete mode 100644 fr_orpheo/data/wpmlgfs.mcd

diff --git a/fr_orpheo/data/Makefile b/fr_orpheo/data/Makefile
index a3cd9cf..3d02d76 100644
--- a/fr_orpheo/data/Makefile
+++ b/fr_orpheo/data/Makefile
@@ -1,50 +1,29 @@
-TOOLS=../../tools
 ORFEO_DIR=../../data/fr_orpheo
-MCD=wpmlgfs.mcd
-CONLLUMCD=conllu.mcd
-MCFMCD=mcf.mcd
+# Directories holding the external helper scripts. Both default to the
+# previously hard-coded locations under the home directory and can be
+# overridden, e.g. `make conllu MACAON_SCRIPTS=/path/to/scripts`.
+MACAON_SCRIPTS?=~/macaon_data/scripts
+OCULOMETRY_SCRIPTS?=~/oculometry/scripts
 
-#This part is for lemmatizer rules and excpetions computation
-THRESHOLD=10
-STRICT=-s
-FPLM_FILENAME=fplm
-RULES_FILENAME=maca_trans_lemmatizer_rules.txt
-EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
-FPLM_FILENAME=fplm
-FP_FILENAME=fP
-
-compile: fplm fP $(RULES_FILENAME)
-
-train.mcf : 
+# Neither target is a file created by its recipe, so declare them phony to
+# keep a stray file named `conllu` or `clean` from disabling the rule.
+.PHONY: conllu clean
+
+# Regenerates train.conllu and test.conllu with prepareOrfeoData.py, collapses
+# repeated blank lines, replaces each file with the stdout of
+# conlluCheckProblems.py (stderr is kept in pbTrain / pbTest), then runs
+# splitTrainDevTest.py on train.conllu (presumably carving out a 10% dev set).
+conllu : 
 	./prepareOrfeoData.py $(ORFEO_DIR)/mcf/ $(ORFEO_DIR)/meta_data/
-	$(TOOLS)/conllu2mcf.py train.conll $(CONLLUMCD) train.mcf $(MCFMCD)
-	$(TOOLS)/conllu2mcf.py test.conll $(CONLLUMCD) test.mcf $(MCFMCD)
-	cat train.mcf | ./add_time_and_speaker.perl > tmpTrain
-	cat test.mcf | ./add_time_and_speaker.perl > tmpTest
-	mv tmpTest test.mcf
-	$(TOOLS)/mcfShuffleAndMakeDev.py tmpTrain $(MCD) 0.1 train.mcf dev.mcf
-	rm tmpTrain
-
-$(RULES_FILENAME): $(FPLM_FILENAME)
-	macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r $(RULES_FILENAME) $(STRICT) -t $(THRESHOLD)
-
-$(FPLM_FILENAME): train.mcf
-	cat train.mcf dev.mcf test.mcf > all.mcf
-	$(TOOLS)/mcf2fplm.py all.mcf $(MCD) > tmp
-	./purge_sfplm.pl $(ORFEO_DIR)/sfplm/ | sort | uniq > tmp2
-	cat tmp tmp2 fplm_add | sort | uniq | sed '/^$$/d' > $@
-	rm all.mcf
-	rm tmp
-	rm tmp2
-
-$(FP_FILENAME): $(FPLM_FILENAME)
-	$(TOOLS)/fplm2fP.py $< > $@
+	./rmBlankLines.py train.conllu test.conllu
+	$(MACAON_SCRIPTS)/conlluCheckProblems.py train.conllu > train 2> pbTrain
+	$(MACAON_SCRIPTS)/conlluCheckProblems.py test.conllu > test 2> pbTest
+	mv train train.conllu
+	mv test test.conllu
+	$(OCULOMETRY_SCRIPTS)/splitTrainDevTest.py --dev 0.1 --test 0.0 train.conllu
 
 clean:
-	- rm $(RULES_FILENAME)
-	- rm $(EXCEPTIONS_FPLM_FILENAME)
-	- rm fP
-	- rm fplm
-	- rm *\.mcf
 	- rm *\.conll*
+	- rm pbTrain
+	- rm pbTest
 
diff --git a/fr_orpheo/data/add_time_and_speaker.perl b/fr_orpheo/data/add_time_and_speaker.perl
deleted file mode 100755
index 4d13eb7..0000000
--- a/fr_orpheo/data/add_time_and_speaker.perl
+++ /dev/null
@@ -1,61 +0,0 @@
-#! /usr/bin/perl
-
-#bonjour	INT	bonjour	-1	dm	0	5.45	5.59	appelant
-
-$first = 1;
-while(<>){
-    chop;
-    ($filename, $word, $pos, $lemma, $head, $label, $start, $end, $speaker, $nbLocuteurs, $milieu, $type, $secteur, $eos) = split /\t/; 
-    if($first){
-	$first = 0;
-	$intervalle_int = 0;
-	$change_speaker = 0;
-    }
-    else{
-#	print "end prec = $end_prec start = 
-	$intervalle = $start - $end_prec;
-
-#	if($intervalle < 0){ $intervalle_int = 0;}
-#	elsif($intervalle < 0.005){ $intervalle_int = 0;}
-#	elsif($intervalle < 0.065){ $intervalle_int = 1;}
-#	elsif($intervalle < 0.255){ $intervalle_int = 2;}
-#	elsif($intervalle < 1.625){ $intervalle_int = 3;}
-#	else {$intevalle_int = 4;}
-
-#	print("intervalle = $intervalle intervalle_int = $intervalle_int\n");
-	$intervalle_int = int($intervalle * 10);
-##alexis 23/01/18
-#	if($intervalle_int < 0){
-#	    $intervalle_int = 0;
-#	}
-##end	
-	if($intervalle_int < -10){
-	    $intervalle_int = -10;
-	}
-	if($intervalle_int > 10){
-	    $intervalle_int = 10;
-	}
-
-
-	
-	if($speaker ne $speaker_prec){
-	    $change_speaker = 1;
-	}
-	else{
-	    $change_speaker = 0;
-	}
-#    print "$word_prec\t$intervalle_int\t$change_speaker\t$pos_prec\t$lemma_prec\t$head_prec\t$label_prec\t$eos_prec\n";
-    }
-#    print "$word\t$pos\t$lemma\t$head\t$label\t$eos\t$intervalle_int\t$change_speaker\n";
-    print "$filename\t$word\t$intervalle_int\t$change_speaker\t$pos\t$lemma\t$head\t$label\t$nbLocuteurs\t$milieu\t$type\t$secteur\t$eos\n";
-
-    $word_prec = $word;
-    $pos_prec = $pos;
-    $lemma_prec = $lemma; 
-    $head_prec = $head;
-    $label_prec = $label;
-    $eos_prec = $eos;
-    $start_prec = $start; 
-    $end_prec = $end;
-    $speaker_prec = $speaker;
-}
diff --git a/fr_orpheo/data/fplm_add b/fr_orpheo/data/fplm_add
deleted file mode 100644
index 245f328..0000000
--- a/fr_orpheo/data/fplm_add
+++ /dev/null
@@ -1,26 +0,0 @@
-des	prep	de	#####
-des	det	un	#####
-de	det	un	#####
-aux	prep	à	#####
-au	prep	à	#####
-du	prep	de	#####
-M	titre	M	#####
-m	titre	M	#####
-Mr	titre	M	#####
-mr	titre	M	#####
-MM	titre	M	#####
-mm	titre	M	#####
-Mme	titre	M	#####
-mme	titre	M	#####
-Mmes	titre	M	#####
-mmes	titre	M	#####
-Mlle	titre	M	#####
-mlle	titre	M	#####
-Mlles	titre	M	#####
-mlles	titre	M	#####
-Dr	titre	docteur	#####
-Drs	titre	docteur	#####
-Pr	titre	professeur	#####
-Prs	titre	professeur	#####
-Mgr	titre	monseigneur	#####
-mgr	titre	monseigneur	#####
diff --git a/fr_orpheo/data/mcf.mcd b/fr_orpheo/data/mcf.mcd
deleted file mode 100644
index f40b392..0000000
--- a/fr_orpheo/data/mcf.mcd
+++ /dev/null
@@ -1,14 +0,0 @@
-0 FILENAME
-1 FORM
-2 POS
-3 LEMMA
-4 GOV
-5 LABEL
-6 TIME1
-7 TIME2
-8 SPKR
-9 NBLOCUTEURS
-10 MILIEU
-11 TYPE
-12 SECTEUR
-13 EOS
diff --git a/fr_orpheo/data/prepareOrfeoData.py b/fr_orpheo/data/prepareOrfeoData.py
index 658fc01..1438732 100755
--- a/fr_orpheo/data/prepareOrfeoData.py
+++ b/fr_orpheo/data/prepareOrfeoData.py
@@ -3,6 +3,11 @@
 import sys
 import os
 
+def mcd() :
+  """Return the '# global.columns' header line printed at the top of each output file."""
+  columnNames = ("FILE", "ID", "FORM", "LEMMA", "POS", "UPOS", "FEATS", "HEAD", "DEPREL", "NONE1", "NONE2", "TIME1", "TIME2", "SPEAKER", "NBLOCS", "TYPE", "MILIEU")
+  return "# global.columns = " + " ".join(columnNames)
+
 def printUsageAndExit() :
   print("USAGE : %s rawMcfDirectory metaDataDirectory"%(sys.argv[0]), file=sys.stderr)
   exit(1)
@@ -55,8 +58,9 @@ def treatDirectory(mcfs, metadatas) :
             metas[name] = {}
           metas[name][corresp] = target
 
-  output = open("train.conll", "w")
+  output = open("train.conllu", "w")
 
+  print(mcd(), file=output)
   for mcf in trains :
     featsForFile = list.copy(features)
     name = mcf.split(".")[0]
@@ -77,14 +81,16 @@ def treatDirectory(mcfs, metadatas) :
     for line in open(mcfs+mcf, "r") :
       clean = line.strip()
       if len(line) <= 2 :
+        print(file=output)
         continue
       completeLine = clean
       for feat in featsForFile :
         completeLine += "\t" + feat
       print(completeLine,file=output)
 
-  output = open("test.conll", "w")
+  output = open("test.conllu", "w")
 
+  print(mcd(), file=output)
   for mcf in tests :
     featsForFile = list.copy(features)
     name = mcf.split(".")[0]
@@ -105,6 +111,7 @@ def treatDirectory(mcfs, metadatas) :
     for line in open(mcfs+mcf, "r") :
       clean = line.strip()
       if len(line) <= 2 :
+        print(file=output)
         continue
       completeLine = clean
       for feat in featsForFile :
diff --git a/fr_orpheo/data/purge_sfplm.pl b/fr_orpheo/data/purge_sfplm.pl
deleted file mode 100755
index 92e2f65..0000000
--- a/fr_orpheo/data/purge_sfplm.pl
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/perl
-
-my $dir= shift;
-
-my $statut;
-my $reste;
-
-
-my @fichiers =(
-"$dir/ADJ.sfplm",
-"$dir/ADV.sfplm",
-"$dir/CLN.sfplm",
-"$dir/CSU.sfplm",
-"$dir/INT.sfplm",
-"$dir/PCT.sfplm",
-"$dir/PRO.sfplm",
-"$dir/VNF.sfplm",
-"$dir/VPR.sfplm",
-"$dir/ADN.sfplm",
-"$dir/CLI.sfplm",
-"$dir/CLS.sfplm",
-"$dir/COO.sfplm",
-"$dir/DET.sfplm",
-"$dir/NOM.sfplm",
-"$dir/PRE.sfplm",
-"$dir/PRQ.sfplm",
-"$dir/VPP.sfplm",
-"$dir/VRB.sfplm"
-);
-
-
-foreach $fichier_sfplm (@fichiers){
-    $fichier_fplm = $fichier_sfplm;
-#    $fichier_fplm =~ s/sfplm/fplm/;
-    print STDERR "processing $fichier_sfplm\n";
-#    print "fichier sfplm = $fichier_sfplm\n";
-#    print "fichier fplm = $fichier_fplm\n";
-#    open(FPLM,">$fichier_fplm");
-    open(SFPLM,"<$fichier_sfplm");
-    while(<SFPLM>){
-	s/
//g;
-	s/ +$//;
-	s/\t+$//;
-	s/\t +/\t/g;
-	s/ +\t/\t/g;
-	s/\t+/\t/g;
-	/([^\t]*)\t(.*)/;
-	$statut = $1;
-	$reste = $2;
-	if(($statut eq "N") ||($statut eq "A")){
-#	    print FPLM "$reste\n";
-#	    if((!$reste=~ /\tde\t/)
-#	       && (!$reste=~ /\tdu\t/)
-#	       && (!$reste=~ /\td\'\t/)
-#	       && (!$reste=~ /\tdes\t/)){
-	    print "$reste\n";
-#	    }
-	}
-    }
-#    close FPLM;
-    close SFPLM;
-}
diff --git a/fr_orpheo/data/rmBlankLines.py b/fr_orpheo/data/rmBlankLines.py
new file mode 100755
index 0000000..641bfe1
--- /dev/null
+++ b/fr_orpheo/data/rmBlankLines.py
@@ -0,0 +1,22 @@
+#! /usr/bin/env python3
+
+# Rewrites each file given on the command line in place: every line is
+# stripped of surrounding whitespace and runs of consecutive empty lines are
+# collapsed into a single empty line.
+
+import sys
+
+for filename in sys.argv[1:] :
+  lines = []
+  # Read the whole file and close it before it is reopened for writing.
+  with open(filename, "r") as src :
+    for line in src :
+      line = line.strip()
+      # Skip an empty line that directly follows another empty line.
+      if len(line) == 0 and len(lines) > 0 and len(lines[-1]) == 0 :
+        continue
+      lines.append(line)
+
+  with open(filename, "w") as out :
+    for line in lines :
+      print(line, file=out)
diff --git a/fr_orpheo/data/wpmlgfs.mcd b/fr_orpheo/data/wpmlgfs.mcd
deleted file mode 100644
index c39ec0f..0000000
--- a/fr_orpheo/data/wpmlgfs.mcd
+++ /dev/null
@@ -1,13 +0,0 @@
-0 FILENAME
-1 FORM
-2 SPKRCHANGE
-3 SILENCE
-4 POS
-5 LEMMA
-6 GOV
-7 LABEL
-8 NBLOCUTEURS
-9 MILIEU
-10 TYPE
-11 SECTEUR
-12 EOS
-- 
GitLab