From f4bf37cf9c4ff0968ec324f7e49d22f08f5c8475 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Fri, 10 Sep 2021 14:39:31 +0200 Subject: [PATCH] =?UTF-8?q?Passe=20par=20conllu=20et=20supprim=C3=A9=20phr?= =?UTF-8?q?ases=20avec=20plusieurs=20racines?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fr_orpheo/data/Makefile | 52 ++++---------------- fr_orpheo/data/add_time_and_speaker.perl | 61 ----------------------- fr_orpheo/data/fplm_add | 26 ---------- fr_orpheo/data/mcf.mcd | 14 ------ fr_orpheo/data/prepareOrfeoData.py | 11 ++++- fr_orpheo/data/purge_sfplm.pl | 62 ------------------------ fr_orpheo/data/rmBlankLines.py | 16 ++++++ fr_orpheo/data/wpmlgfs.mcd | 13 ----- 8 files changed, 34 insertions(+), 221 deletions(-) delete mode 100755 fr_orpheo/data/add_time_and_speaker.perl delete mode 100644 fr_orpheo/data/fplm_add delete mode 100644 fr_orpheo/data/mcf.mcd delete mode 100755 fr_orpheo/data/purge_sfplm.pl create mode 100755 fr_orpheo/data/rmBlankLines.py delete mode 100644 fr_orpheo/data/wpmlgfs.mcd diff --git a/fr_orpheo/data/Makefile b/fr_orpheo/data/Makefile index a3cd9cf..3d02d76 100644 --- a/fr_orpheo/data/Makefile +++ b/fr_orpheo/data/Makefile @@ -1,50 +1,16 @@ -TOOLS=../../tools ORFEO_DIR=../../data/fr_orpheo -MCD=wpmlgfs.mcd -CONLLUMCD=conllu.mcd -MCFMCD=mcf.mcd -#This part is for lemmatizer rules and excpetions computation -THRESHOLD=10 -STRICT=-s -FPLM_FILENAME=fplm -RULES_FILENAME=maca_trans_lemmatizer_rules.txt -EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm -FPLM_FILENAME=fplm -FP_FILENAME=fP - -compile: fplm fP $(RULES_FILENAME) - -train.mcf : +conllu : ./prepareOrfeoData.py $(ORFEO_DIR)/mcf/ $(ORFEO_DIR)/meta_data/ - $(TOOLS)/conllu2mcf.py train.conll $(CONLLUMCD) train.mcf $(MCFMCD) - $(TOOLS)/conllu2mcf.py test.conll $(CONLLUMCD) test.mcf $(MCFMCD) - cat train.mcf | ./add_time_and_speaker.perl > tmpTrain - cat test.mcf | ./add_time_and_speaker.perl > tmpTest - mv tmpTest test.mcf - $(TOOLS)/mcfShuffleAndMakeDev.py tmpTrain $(MCD) 0.1 train.mcf dev.mcf - rm tmpTrain - -$(RULES_FILENAME): $(FPLM_FILENAME) - macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r $(RULES_FILENAME) $(STRICT) -t $(THRESHOLD) - -$(FPLM_FILENAME): train.mcf - cat train.mcf dev.mcf test.mcf > all.mcf - $(TOOLS)/mcf2fplm.py all.mcf $(MCD) > tmp - ./purge_sfplm.pl $(ORFEO_DIR)/sfplm/ | sort | uniq > tmp2 - cat tmp tmp2 fplm_add | sort | uniq | sed '/^$$/d' > $@ - rm all.mcf - rm tmp - rm tmp2 - -$(FP_FILENAME): $(FPLM_FILENAME) - $(TOOLS)/fplm2fP.py $< > $@ + ./rmBlankLines.py train.conllu test.conllu + ~/macaon_data/scripts/conlluCheckProblems.py train.conllu > train 2> pbTrain + ~/macaon_data/scripts/conlluCheckProblems.py test.conllu > test 2> pbTest + mv train train.conllu + mv test test.conllu + ~/oculometry/scripts/splitTrainDevTest.py --dev 0.1 --test 0.0 train.conllu clean: - - rm $(RULES_FILENAME) - - rm $(EXCEPTIONS_FPLM_FILENAME) - - rm fP - - rm fplm - - rm *\.mcf - rm *\.conll* + - rm pbTrain + - rm pbTest diff --git a/fr_orpheo/data/add_time_and_speaker.perl b/fr_orpheo/data/add_time_and_speaker.perl deleted file mode 100755 index 4d13eb7..0000000 --- a/fr_orpheo/data/add_time_and_speaker.perl +++ /dev/null @@ -1,61 +0,0 @@ -#! /usr/bin/perl - -#bonjour INT bonjour -1 dm 0 5.45 5.59 appelant - -$first = 1; -while(<>){ - chop; - ($filename, $word, $pos, $lemma, $head, $label, $start, $end, $speaker, $nbLocuteurs, $milieu, $type, $secteur, $eos) = split /\t/; - if($first){ - $first = 0; - $intervalle_int = 0; - $change_speaker = 0; - } - else{ -# print "end prec = $end_prec start = - $intervalle = $start - $end_prec; - -# if($intervalle < 0){ $intervalle_int = 0;} -# elsif($intervalle < 0.005){ $intervalle_int = 0;} -# elsif($intervalle < 0.065){ $intervalle_int = 1;} -# elsif($intervalle < 0.255){ $intervalle_int = 2;} -# elsif($intervalle < 1.625){ $intervalle_int = 3;} -# else {$intevalle_int = 4;} - -# print("intervalle = $intervalle intervalle_int = $intervalle_int\n"); - $intervalle_int = int($intervalle * 10); -##alexis 23/01/18 -# if($intervalle_int < 0){ -# $intervalle_int = 0; -# } -##end - if($intervalle_int < -10){ - $intervalle_int = -10; - } - if($intervalle_int > 10){ - $intervalle_int = 10; - } - - - - if($speaker ne $speaker_prec){ - $change_speaker = 1; - } - else{ - $change_speaker = 0; - } -# print "$word_prec\t$intervalle_int\t$change_speaker\t$pos_prec\t$lemma_prec\t$head_prec\t$label_prec\t$eos_prec\n"; - } -# print "$word\t$pos\t$lemma\t$head\t$label\t$eos\t$intervalle_int\t$change_speaker\n"; - print "$filename\t$word\t$intervalle_int\t$change_speaker\t$pos\t$lemma\t$head\t$label\t$nbLocuteurs\t$milieu\t$type\t$secteur\t$eos\n"; - - $word_prec = $word; - $pos_prec = $pos; - $lemma_prec = $lemma; - $head_prec = $head; - $label_prec = $label; - $eos_prec = $eos; - $start_prec = $start; - $end_prec = $end; - $speaker_prec = $speaker; -} diff --git a/fr_orpheo/data/fplm_add b/fr_orpheo/data/fplm_add deleted file mode 100644 index 245f328..0000000 --- a/fr_orpheo/data/fplm_add +++ /dev/null @@ -1,26 +0,0 @@ -des prep de ##### -des det un ##### -de det un ##### -aux prep à ##### -au prep à ##### -du prep de ##### -M titre M ##### -m titre M ##### -Mr titre M ##### -mr titre M ##### -MM titre M ##### -mm titre M ##### -Mme titre M ##### -mme titre M ##### -Mmes titre M ##### -mmes titre M ##### -Mlle titre M ##### -mlle titre M ##### -Mlles titre M ##### -mlles titre M ##### -Dr titre docteur ##### -Drs titre docteur ##### -Pr titre professeur ##### -Prs titre professeur ##### -Mgr titre monseigneur ##### -mgr titre monseigneur ##### diff --git a/fr_orpheo/data/mcf.mcd b/fr_orpheo/data/mcf.mcd deleted file mode 100644 index f40b392..0000000 --- a/fr_orpheo/data/mcf.mcd +++ /dev/null @@ -1,14 +0,0 @@ -0 FILENAME -1 FORM -2 POS -3 LEMMA -4 GOV -5 LABEL -6 TIME1 -7 TIME2 -8 SPKR -9 NBLOCUTEURS -10 MILIEU -11 TYPE -12 SECTEUR -13 EOS diff --git a/fr_orpheo/data/prepareOrfeoData.py b/fr_orpheo/data/prepareOrfeoData.py index 658fc01..1438732 100755 --- a/fr_orpheo/data/prepareOrfeoData.py +++ b/fr_orpheo/data/prepareOrfeoData.py @@ -3,6 +3,9 @@ import sys import os +def mcd() : + return "# global.columns = FILE ID FORM LEMMA POS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER NBLOCS TYPE MILIEU" + def printUsageAndExit() : print("USAGE : %s rawMcfDirectory metaDataDirectory"%(sys.argv[0]), file=sys.stderr) exit(1) @@ -55,8 +58,9 @@ def treatDirectory(mcfs, metadatas) : metas[name] = {} metas[name][corresp] = target - output = open("train.conll", "w") + output = open("train.conllu", "w") + print(mcd(), file=output) for mcf in trains : featsForFile = list.copy(features) name = mcf.split(".")[0] @@ -77,14 +81,16 @@ def treatDirectory(mcfs, metadatas) : for line in open(mcfs+mcf, "r") : clean = line.strip() if len(line) <= 2 : + print(file=output) continue completeLine = clean for feat in featsForFile : completeLine += "\t" + feat print(completeLine,file=output) - output = open("test.conll", "w") + output = open("test.conllu", "w") + print(mcd(), file=output) for mcf in tests : featsForFile = list.copy(features) name = mcf.split(".")[0] @@ -105,6 +111,7 @@ def treatDirectory(mcfs, metadatas) : for line in open(mcfs+mcf, "r") : clean = line.strip() if len(line) <= 2 : + print(file=output) continue completeLine = clean for feat in featsForFile : diff --git a/fr_orpheo/data/purge_sfplm.pl b/fr_orpheo/data/purge_sfplm.pl deleted file mode 100755 index 92e2f65..0000000 --- a/fr_orpheo/data/purge_sfplm.pl +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/perl - -my $dir= shift; - -my $statut; -my $reste; - - -my @fichiers =( -"$dir/ADJ.sfplm", -"$dir/ADV.sfplm", -"$dir/CLN.sfplm", -"$dir/CSU.sfplm", -"$dir/INT.sfplm", -"$dir/PCT.sfplm", -"$dir/PRO.sfplm", -"$dir/VNF.sfplm", -"$dir/VPR.sfplm", -"$dir/ADN.sfplm", -"$dir/CLI.sfplm", -"$dir/CLS.sfplm", -"$dir/COO.sfplm", -"$dir/DET.sfplm", -"$dir/NOM.sfplm", -"$dir/PRE.sfplm", -"$dir/PRQ.sfplm", -"$dir/VPP.sfplm", -"$dir/VRB.sfplm" -); - - -foreach $fichier_sfplm (@fichiers){ - $fichier_fplm = $fichier_sfplm; -# $fichier_fplm =~ s/sfplm/fplm/; - print STDERR "processing $fichier_sfplm\n"; -# print "fichier sfplm = $fichier_sfplm\n"; -# print "fichier fplm = $fichier_fplm\n"; -# open(FPLM,">$fichier_fplm"); - open(SFPLM,"<$fichier_sfplm"); - while(<SFPLM>){ - s/ //g; - s/ +$//; - s/\t+$//; - s/\t +/\t/g; - s/ +\t/\t/g; - s/\t+/\t/g; - /([^\t]*)\t(.*)/; - $statut = $1; - $reste = $2; - if(($statut eq "N") ||($statut eq "A")){ -# print FPLM "$reste\n"; -# if((!$reste=~ /\tde\t/) -# && (!$reste=~ /\tdu\t/) -# && (!$reste=~ /\td\'\t/) -# && (!$reste=~ /\tdes\t/)){ - print "$reste\n"; -# } - } - } -# close FPLM; - close SFPLM; -} diff --git a/fr_orpheo/data/rmBlankLines.py b/fr_orpheo/data/rmBlankLines.py new file mode 100755 index 0000000..641bfe1 --- /dev/null +++ b/fr_orpheo/data/rmBlankLines.py @@ -0,0 +1,16 @@ +#! /usr/bin/env python3 + +import sys + +for filename in sys.argv[1:] : + lines = [] + for line in open(filename, "r") : + line = line.strip() + if len(line) == 0 and len(lines) > 0 and len(lines[-1]) == 0 : + continue + lines.append(line) + + with open(filename, "w") as out : + for line in lines : + print(line, file=out) + diff --git a/fr_orpheo/data/wpmlgfs.mcd b/fr_orpheo/data/wpmlgfs.mcd deleted file mode 100644 index c39ec0f..0000000 --- a/fr_orpheo/data/wpmlgfs.mcd +++ /dev/null @@ -1,13 +0,0 @@ -0 FILENAME -1 FORM -2 SPKRCHANGE -3 SILENCE -4 POS -5 LEMMA -6 GOV -7 LABEL -8 NBLOCUTEURS -9 MILIEU -10 TYPE -11 SECTEUR -12 EOS -- GitLab