#!/usr/bin/perl
#
# datcha2mcf.pl - rewrite the part-of-speech column of a tab-separated
# token file.
#
# Reads lines from the files named on the command line (or from STDIN when
# none are given), splits each line on tabs into ten columns, replaces the
# fourth column (the part-of-speech tag) with its entry in %tilt2datcha_pos
# and prints the ten columns joined by tabs.
#
# Sample input line (columns are tab-separated in real input):
#   Bonjour Bonjour OK INT bonjour tchat1 TC [00:11:09] Bonjour IV

use strict;
use warnings;

# Every output line has exactly this many columns; shorter input lines are
# padded with empty columns and extra input columns are dropped.
my $COLUMN_COUNT = 10;

# Zero-based index of the column holding the part-of-speech tag.
my $POS_COLUMN = 3;

# Parts of speech: input tag => output tag.
my %tilt2datcha_pos = (
    # Tags that are renamed or merged into another tag.
    'ABR'        => 'NOM',
    'ANONYM'     => 'NAM',
    'NUM'        => 'DET',
    'DATE'       => 'NOM',
    'DET:ART'    => 'DET',
    'DET:ART:AM' => 'DET',
    'DET:IND'    => 'DET',
    'DET:POS'    => 'DET',
    'PRO:DEM'    => 'PRO',
    'PRO:IND'    => 'PRO',
    # Disabled alternative from the original script: 'PRO:PER' => 'PRO:PER'.
    'PRO:PER'    => 'PRO',
    'PRO:POS'    => 'PRO',
    'PRO:REL'    => 'PRO:REL',
    'PUN:cit'    => 'PUN',
    'SENT'       => 'PUN',
    'SYM'        => 'NOM',
    'URL'        => 'NOM',
    'HEURE'      => 'NOM',
    'XXX'        => 'NOM',

    # Tags that are kept unchanged.
    'ADJ'        => 'ADJ',
    'ADV'        => 'ADV',
    'ANG'        => 'ANG',
    'DET'        => 'DET',
    'INT'        => 'INT',
    'KON'        => 'KON',
    'NAM'        => 'NAM',
    'NOM'        => 'NOM',
    'PRF'        => 'PRF',
    'PRO'        => 'PRO',
    'PRP'        => 'PRP',
    'PUN'        => 'PUN',
    'VER'        => 'VER',
    'VER:impe'   => 'VER:impe',
    'VER:infi'   => 'VER:infi',
    'VER:pper'   => 'VER:pper',
    'VER:ppre'   => 'VER:ppre',
);

# Simplified (coarse) parts of speech: input tag => output tag.
# It differs from %tilt2datcha_pos only for PRO:REL and the VER:* subtypes.
# NOTE(review): this table is not written to the output; the original script
# only looked it up with a variable that was never assigned. It is kept so
# the mapping is not lost -- confirm whether a coarse column is wanted.
my %tilt2datcha_cpos = (
    'ABR'        => 'NOM',
    'ANONYM'     => 'NAM',
    'NUM'        => 'DET',
    'DATE'       => 'NOM',
    'DET:ART'    => 'DET',
    'DET:ART:AM' => 'DET',
    'DET:IND'    => 'DET',
    'DET:POS'    => 'DET',
    'PRO:DEM'    => 'PRO',
    'PRO:IND'    => 'PRO',
    'PRO:PER'    => 'PRO',
    'PRO:POS'    => 'PRO',
    'PRO:REL'    => 'PRO',
    'PUN:cit'    => 'PUN',
    'SENT'       => 'PUN',
    'SYM'        => 'NOM',
    'URL'        => 'NOM',
    'HEURE'      => 'NOM',
    'XXX'        => 'NOM',

    'ADJ'        => 'ADJ',
    'ADV'        => 'ADV',
    'ANG'        => 'ANG',
    'DET'        => 'DET',
    'INT'        => 'INT',
    'KON'        => 'KON',
    'NAM'        => 'NAM',
    'NOM'        => 'NOM',
    'PRF'        => 'PRF',
    'PRO'        => 'PRO',
    'PRP'        => 'PRP',
    'PUN'        => 'PUN',
    'VER'        => 'VER',
    'VER:impe'   => 'VER',
    'VER:infi'   => 'VER',
    'VER:pper'   => 'VER',
    'VER:ppre'   => 'VER',
);

# convert_line($line) -> $converted
#
# Converts one input line and returns the output line, always terminated by
# a newline and always made of $COLUMN_COUNT tab-separated columns.
#
#   $line  one raw input line, with or without its trailing newline
#
# The part-of-speech column is replaced by its %tilt2datcha_pos entry. A tag
# with no entry yields an empty column, as the original script did; a
# warning is sent to STDERR when such a tag is non-empty.
#
# NOTE(review): the original script also ran s/ /_/g on two variables named
# $form and $lemma that were never assigned, so the substitutions had no
# effect. They are left out here; if spaces in some columns must become
# underscores, confirm which columns before adding that back.
sub convert_line {
    my ($line) = @_;

    # chomp, not chop: a final line without a newline must keep its last
    # character.
    chomp $line;

    # Limit -1 keeps trailing empty columns; the resize then truncates or
    # pads the list to exactly $COLUMN_COUNT entries.
    my @fields = split /\t/, $line, -1;
    $#fields = $COLUMN_COUNT - 1;
    @fields = map { defined $_ ? $_ : '' } @fields;

    my $tag    = $fields[$POS_COLUMN];
    my $mapped = $tilt2datcha_pos{$tag};
    if (!defined $mapped) {
        # No trailing newline in the message, so Perl appends the input
        # line number to it.
        warn "unknown POS tag '$tag'" if length $tag;
        $mapped = '';
    }
    $fields[$POS_COLUMN] = $mapped;

    return join("\t", @fields) . "\n";
}

# main()
#
# Filters every line read through <> (command-line files, else STDIN) to
# STDOUT using convert_line.
sub main {
    while (my $line = <>) {
        print convert_line($line);
    }
    return;
}

# Run the filter only when executed as a script, so the file can also be
# loaded with require/do without consuming input.
main() unless caller;

1;