Commit c00fe2d1 authored by Franck Dary

Initial commit

UD_ROOT=~/Documents/ud/ud-treebanks-v2.5/UD_French-GSD/
*\.ts
*fplm
fP
ambiguities.txt
include ../config
SCRIPTS=../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
MCD=conllu.mcd
TRAIN_FILES=$(shell find $(UD_ROOT) -type f -name '*train*.conllu')
DEV_FILES=$(shell find $(UD_ROOT) -type f -name '*dev*.conllu')
TEST_FILES=$(shell find $(UD_ROOT) -type f -name '*test*.conllu')
# This part is for lemmatizer rules and exceptions computation
THRESHOLD=10
FPLM_FILENAME=fplm
RULES_FILENAME=lemmatizer_rules.ts
EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME)
	rm col_*\.txt
	rm all_no_test.conllu
all_no_test.conllu:
	cat $(TRAIN_FILES) > $@
tokenizer.ts: all_no_test.conllu $(MCD)
	echo "Default : IGNORECHAR" > $@
	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
	echo "ENDWORD" >> $@
	echo "ADDCHARTOWORD" >> $@
segmenter.ts:
	echo "EOS b.0" > $@
	echo "REWRITE b.0 EOS _" >> $@
columns: all_no_test.conllu $(MCD)
	for number in 1 2 3 4 5 6 7 8 9 10 ; do \
		cat all_no_test.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \
	done
	./getTransitionSets.py $(MCD) col_*\.txt
texts:
	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
$(FPLM_FILENAME): all_no_test.conllu $(MCD)
	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
$(RULES_FILENAME): $(FPLM_FILENAME)
	macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD)
	cat tmp.txt | sed s/^/RULE\ LEMMA\ ON\ FORM\ /g | sed s/RULE\ LEMMA\ ON\ FORM\ @@$$/Default\ :\ \ RULE\ LEMMA\ ON\ FORM\ @@/g > $@
	rm tmp.txt
	echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts
clean:
	- rm *\.txt
	- rm *\.conll*
	- rm *\.ts
	- rm $(RULES_FILENAME)
	- rm $(EXCEPTIONS_FPLM_FILENAME)
	- rm $(FPLM_FILENAME)
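For clarity, here is a rough Python equivalent (illustration only, not part of the commit) of what the "columns" target above does with its shell loop: write the sorted unique values of each of the ten CoNLL-U columns to col_<n>.txt, skipping comment lines. Filenames mirror the Makefile; the exact sort order of the shell version may differ.

# Rough Python sketch of the "columns" target above (assumed input all_no_test.conllu).
values = [set() for _ in range(10)]
for line in open("all_no_test.conllu", encoding="utf-8"):
    if line.startswith("#") or not line.strip():
        continue
    for i, cell in enumerate(line.rstrip("\n").split("\t")[:10]):
        values[i].add(cell)
for i, vals in enumerate(values, start=1):
    with open("col_%d.txt" % i, "w", encoding="utf-8") as out:
        out.write("\n".join(sorted(vals)) + "\n")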
ID
FORM
LEMMA
UPOS
XPOS
FEATS
HEAD
DEPREL
DEPS
MISC
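The ten names above are the standard CoNLL-U columns, in the order the scripts in this commit expect. As a quick illustration (a toy token line, not taken from UD_French-GSD), this is how one CoNLL-U line maps onto those columns:

# Toy illustration: one CoNLL-U token line split into the ten columns listed in conllu.mcd above.
columns = ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]
line = "1\tChat\tchat\tNOUN\t_\tGender=Masc|Number=Sing\t0\troot\t_\t_"
print(dict(zip(columns, line.split("\t"))))
# {'ID': '1', 'FORM': 'Chat', 'LEMMA': 'chat', 'UPOS': 'NOUN', ...}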
#! /usr/bin/python3
# Convert each CoNLL-U file given on the command line into a raw .txt file next
# to it, by running the CoNLL-U-to-text converter passed as first argument.
import sys
import os
import subprocess
def printUsageAndExit() :
    print("USAGE : %s conll_2_text.pl file1.conllu file2.conllu..."%sys.argv[0], file=sys.stderr)
    exit(1)
if __name__ == "__main__" :
    if len(sys.argv) < 3 :
        printUsageAndExit()
    for pathToFile in sys.argv[2:] :
        splited = os.path.splitext(pathToFile)
        target = splited[0] + ".txt"
        targetFile = open(target, "w")
        command = sys.argv[1] + " " + pathToFile
        p = subprocess.Popen(command, stdout=targetFile, stderr=sys.stderr, shell=True)
        p.wait()
#! /usr/bin/python3
# For each col_N.txt file (the unique values of corpus column N), write the
# matching transition-set file: tagger.ts for UPOS, taggerx.ts for XPOS,
# morpho_whole.ts / morpho_parts.ts for FEATS, parser_legacy.ts / parser.ts for DEPREL.
import sys
sys.path.insert(1, '../../scripts')
from readMCD import readMCD
def printUsageAndExit() :
    print("USAGE : %s mcd column_1.txt columns_2.txt..."%sys.argv[0], file=sys.stderr)
    exit(1)
if __name__ == "__main__" :
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) < 3 :
        printUsageAndExit()
    conllMCD, conllMCDr = readMCD(sys.argv[1])
    for colFile in sys.argv[2:] :
        numCol = int(colFile.split('.')[0].split('_')[-1]) -1
        if not numCol in conllMCD :
            continue
        nameCol = conllMCD[numCol]
        if nameCol == "UPOS" :
            output = open("tagger.ts", 'w', encoding='utf-8')
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                print("WRITE b.0 UPOS " + striped, file=output)
            output.close()
        elif nameCol == "XPOS" :
            output = open("taggerx.ts", 'w', encoding='utf-8')
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                print("WRITE b.0 XPOS " + striped, file=output)
            output.close()
        elif nameCol == "FEATS" :
            output = open("morpho_whole.ts", 'w', encoding='utf-8')
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                print("WRITE b.0 FEATS " + striped, file=output)
            output.close()
            output = open("morpho_parts.ts", 'w', encoding='utf-8')
            allParts = set()
            allPartsList = []
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                parts = striped.split('|')
                for part in parts :
                    allParts.add(part)
            for part in allParts :
                allPartsList.append(part)
            allPartsList.sort()
            for part in allPartsList :
                print("ADD b.0 FEATS " + part, file=output)
            print("Default : NOTHING", file=output)
            output.close()
        elif nameCol == "DEPREL" :
            output = open("parser_legacy.ts", 'w', encoding='utf-8')
            print("REDUCE", file=output)
            labels = set()
            labelsList = []
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 or striped == "root" or striped == "_" :
                    continue
                label = striped.split(':')[0]
                if label not in labels :
                    labels.add(striped)
                    labelsList.append(striped)
            labelsList.sort()
            for label in labelsList :
                print("LEFT " + label, file=output)
                print("RIGHT " + label, file=output)
            print("EOS s.0", file=output)
            print("Default : SHIFT", file=output)
            output.close()
            output = open("parser.ts", 'w', encoding='utf-8')
            print("REDUCE", file=output)
            labels = set()
            labelsList = []
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 or striped == "root" or striped == "_" :
                    continue
                label = striped.split(':')[0]
                if label not in labels :
                    labels.add(striped)
                    labelsList.append(striped)
            labelsList.sort()
            for label in labelsList :
                print("LEFT " + label, file=output)
                print("RIGHT " + label, file=output)
            print("Default : SHIFT", file=output)
            output.close()
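To make the output format concrete, here is a minimal sketch (toy tag list and a hypothetical file name, not produced by the commit) of what the UPOS branch above writes:

# Minimal sketch with toy data: mirrors the UPOS branch of getTransitionSets.py.
tags = ["ADJ", "DET", "NOUN", "VERB"]
with open("tagger_example.ts", "w", encoding="utf-8") as output:
    for tag in tags:
        # one WRITE action per part-of-speech value found in the corpus
        print("WRITE b.0 UPOS " + tag, file=output)
# tagger_example.ts now contains lines such as "WRITE b.0 UPOS NOUN".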
Name : Tagger Machine
Strategy : sequential
Classifier : tagger MLP(500,RELU,0.3) data/tagger.ts
__pycache__
#! /usr/bin/python3
# Extract an fplm lexicon (FORM, UPOS, LEMMA, FEATS, tab-separated) from a
# CoNLL-U file; within each group of near-identical entries (see
# sameLineWithoutLemma) only the most frequent one is printed.
import sys
from readMCD import readMCD
def printUsageAndExit() :
    print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr)
    exit(1)
def sameLineWithoutLemma(l1, l2) :
    l1s = l1.split('\t')
    l2s = l2.split('\t')
    return (l1s[:-3],l1s[-1]) == (l2s[:-3],l2s[-1])
if __name__ == "__main__" :
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) != 3 :
        printUsageAndExit()
    conllMCD, conllMCDr = readMCD(sys.argv[2])
    entriesCount = {}
    entriesList = []
    for line in open(sys.argv[1], "r", encoding="utf8") :
        if len(line.strip()) < 3 :
            continue
        if line.strip()[0] == '#' :
            continue
        columns = line.strip().split('\t')
        if len(columns[int(conllMCDr["ID"])].split('-')) > 1 :
            continue
        entry = ""
        for col in ["FORM", "UPOS", "LEMMA", "FEATS"] :
            entry = entry + columns[int(conllMCDr[col])] + '\t'
        entry = entry[:-1]
        if entry not in entriesCount :
            entriesCount[entry] = 1
        else :
            entriesCount[entry] = 1+entriesCount[entry]
    for entry in entriesCount :
        entriesList.append(entry)
    entriesList.sort()
    i = 0
    while i < len(entriesList) :
        maxCount = 0
        maxIndex = 0
        j = i
        while j < len(entriesList) and sameLineWithoutLemma(entriesList[i], entriesList[j]) :
            if entriesCount[entriesList[j]] > maxCount :
                maxCount = entriesCount[entriesList[j]]
                maxIndex = j
            j = j+1
        print("%s"%(entriesList[maxIndex]))
        i = j
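The selection step above can be summarised with a small sketch (toy entries and hypothetical counts, not taken from the treebank): among the entries that sameLineWithoutLemma groups together, only the most frequent one is kept.

# Toy sketch of the deduplication above: entries follow the
# FORM<TAB>UPOS<TAB>LEMMA<TAB>FEATS layout built in the main loop.
from collections import Counter

entriesCount = Counter({
    "suis\tAUX\têtre\t_": 42,      # hypothetical counts
    "suis\tVERB\tsuivre\t_": 3,
})
# within one group, the entry seen most often wins
best = max(entriesCount, key=entriesCount.get)
print(best)   # -> "suis\tAUX\têtre\t_"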
#! /usr/bin/python3
# Compute SPLITWORD rules for the tokenizer: for every multi-word token (an ID
# range such as 2-3), record how its surface form splits into the forms of the
# words it covers.
import sys
from readMCD import readMCD
rules = {}
prefix = "SPLITWORD "
def printUsageAndExit() :
    print("USAGE : %s file.conllu conllu.mcd"%sys.argv[0], file=sys.stderr)
    exit(1)
def computeRules(sentence) :
    wordById = {}
    for word in sentence :
        splited = word[0].split("-")
        if len(splited) > 1 :
            continue
        wordById[word[0]] = word[1]
    for word in sentence :
        splited = word[0].split("-")
        if len(splited) > 1 :
            rule = ""
            for id in range(int(splited[0]),int(splited[-1])+1) :
                rule += "@" + wordById[str(id)]
            if word[1] in rules :
                if rule in rules[word[1]] :
                    rules[word[1]][rule] += 1
                else :
                    rules[word[1]][rule] = 1
            else :
                rules[word[1]] = {}
                rules[word[1]][rule] = 1
def main() :
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) != 3 :
        printUsageAndExit()
    conllMCD, conllMCDr = readMCD(sys.argv[2])
    idId = int(conllMCDr["ID"])
    idForm = int(conllMCDr["FORM"])
    sentence = []
    for line in open(sys.argv[1], "r", encoding="utf8") :
        if len(line.strip()) < 2 or line[0] == '#' :
            if len(sentence) > 0 :
                computeRules(sentence)
            sentence = []
            continue
        splited = line.strip().split('\t')
        sentence += [[splited[idId], splited[idForm]]]
    for word in rules :
        if len(rules[word]) > 1 :
            print("WARNING : Ambiguity detected in \'%s\'"%(word+" "+str(rules[word])), file=sys.stderr)
        toPrint = []
        for rule in rules[word] :
            toPrint.append([len(rule.split('@')), prefix+word+rule])
        toPrint.sort(reverse=True)
        for rule in toPrint :
            print(rule[1])
if __name__ == "__main__" :
    main()
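For reference, a minimal sketch (toy sentence, using the French contraction "du" = "de" + "le") of the rule format that computeRules() above emits for one multi-word token:

# Toy sketch of one SPLITWORD rule, built the same way as in computeRules() above.
wordById = {"2": "de", "3": "le"}   # forms of the words covered by the MWT
splited = "2-3".split("-")          # ID range of the multi-word token "du"
rule = ""
for id in range(int(splited[0]), int(splited[-1]) + 1):
    rule += "@" + wordById[str(id)]
print("SPLITWORD " + "du" + rule)   # -> SPLITWORD du@de@le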
#!/usr/bin/env perl
# Extracts raw text from CoNLL-U file. Uses newdoc and newpar tags when available.
# Copyright © 2017 Dan Zeman <zeman@ufal.mff.cuni.cz>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use Getopt::Long;
# Language code 'zh' or 'ja' will trigger Chinese-like text formatting.
my $language = 'en';
GetOptions
(
'language=s' => \$language
);
my $chinese = $language =~ m/^(zh|ja|lzh|yue)(_|$)/;
my $text = ''; # from the text attribute of the sentence
my $ftext = ''; # from the word forms of the tokens
my $newpar = 0;
my $newdoc = 0;
my $buffer = '';
my $start = 1;
my $mwtlast;
while(<>)
{
if(m/^\#\s*text\s*=\s*(.+)/)
{
$text = $1;
}
elsif(m/^\#\s*newpar(\s|$)/i)
{
$newpar = 1;
}
elsif(m/^\#\s*newdoc(\s|$)/i)
{
$newdoc = 1;
}
elsif(m/^\d+-(\d+)\t/)
{
$mwtlast = $1;
my @f = split(/\t/, $_);
# Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.)
# The first token of the new paragraph has "NewPar=Yes" in the MISC column.
# Multi-word tokens have this in the token-introducing line.
if($f[9] =~ m/NewPar=Yes/i)
{
# Empty line between documents and paragraphs. (There may have been
# a paragraph break before the first part of this sentence as well!)
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
$buffer .= $ftext;
# Line breaks at word boundaries after at most 80 characters.
$buffer = print_lines_from_buffer($buffer, 80, $chinese);
print("$buffer\n\n");
$buffer = '';
# Start is only true until we write the first sentence of the input stream.
$start = 0;
$newdoc = 0;
$newpar = 0;
$text = '';
$ftext = '';
}
$ftext .= $f[1];
$ftext .= ' ' unless($f[9] =~ m/SpaceAfter=No/);
}
elsif(m/^(\d+)\t/ && !(defined($mwtlast) && $1<=$mwtlast))
{
$mwtlast = undef;
my @f = split(/\t/, $_);
# Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.)
# The first token of the new paragraph has "NewPar=Yes" in the MISC column.
# Multi-word tokens have this in the token-introducing line.
if($f[9] =~ m/NewPar=Yes/i)
{
# Empty line between documents and paragraphs. (There may have been
# a paragraph break before the first part of this sentence as well!)
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
$buffer .= $ftext;
# Line breaks at word boundaries after at most 80 characters.
$buffer = print_lines_from_buffer($buffer, 80, $chinese);
print("$buffer\n\n");
$buffer = '';
# Start is only true until we write the first sentence of the input stream.
$start = 0;
$newdoc = 0;
$newpar = 0;
$text = '';
$ftext = '';
}
$ftext .= $f[1];
$ftext .= ' ' unless($f[9] =~ m/SpaceAfter=No/);
}
elsif(m/^\s*$/)
{
# In a valid CoNLL-U file, $text should be equal to $ftext except for the
# space after the last token. However, if there have been intra-sentential
# paragraph breaks, $ftext contains only the part after the last such
# break, and $text is empty. Hence we currently use $ftext everywhere
# and ignore $text, even though we note it when seeing the text attribute.
# $text .= ' ' unless($chinese);
# Empty line between documents and paragraphs.
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
$buffer .= $ftext;
# Line breaks at word boundaries after at most 80 characters.
$buffer = print_lines_from_buffer($buffer, 80, $chinese);
# Start is only true until we write the first sentence of the input stream.
$start = 0;
$newdoc = 0;
$newpar = 0;
$text = '';
$ftext = '';
$mwtlast = undef;
}
}
# There may be unflushed buffer contents after the last sentence, less than 80 characters
# (otherwise we would have already dealt with it), so just flush it.
if($buffer ne '')
{
print("$buffer\n");
}
#------------------------------------------------------------------------------
# Checks whether we have to print an extra line to separate paragraphs. Does it
# if necessary. Returns the updated buffer.
#------------------------------------------------------------------------------
sub print_new_paragraph_if_needed
{
my $start = shift;
my $newdoc = shift;
my $newpar = shift;
my $buffer = shift;
if(!$start && ($newdoc || $newpar))
{
if($buffer ne '')
{
print("$buffer\n");
$buffer = '';
}
print("\n");
}
return $buffer;
}
#------------------------------------------------------------------------------
# Prints as many complete lines of text as there are in the buffer. Returns the
# remaining contents of the buffer.
#------------------------------------------------------------------------------
sub print_lines_from_buffer
{
my $buffer = shift;
# Maximum number of characters allowed on one line, not counting the line
# break character(s), which also replace any number of trailing spaces.
# Exception: If there is a word longer than the limit, it will be printed
# on one line.
# Note that this algorithm is not suitable for Chinese and Japanese.
my $limit = shift;
# We need a different algorithm for Chinese and Japanese.
my $chinese = shift;
if($chinese)
{
return print_chinese_lines_from_buffer($buffer, $limit);
}
if(length($buffer) >= $limit)
{
my @cbuffer = split(//, $buffer);
# There may be more than one new line waiting in the buffer.
while(scalar(@cbuffer) >= $limit)
{
###!!! We could make it simpler if we ignored multi-space sequences
###!!! between words. It sounds OK to ignore them because at the
###!!! line break we do not respect original spacing anyway.
my $i;
my $ilastspace;
for($i = 0; $i<=$#cbuffer; $i++)
{
if($i>$limit && defined($ilastspace))
{
last;
}
if($cbuffer[$i] =~ m/\s/)
{
$ilastspace = $i;
}
}
if(defined($ilastspace) && $ilastspace>0)
{
my @out = @cbuffer[0..($ilastspace-1)];
splice(@cbuffer, 0, $ilastspace+1);
print(join('', @out), "\n");
}
else
{
print(join('', @cbuffer), "\n");
splice(@cbuffer);
}
}
$buffer = join('', @cbuffer);
}
return $buffer;
}
#------------------------------------------------------------------------------
# Prints as many complete lines of text as there are in the buffer. Returns the
# remaining contents of the buffer. Assumes that there are no spaces between
# words and lines can be broken between any two characters, as is the custom in
# Chinese and Japanese.
#------------------------------------------------------------------------------
sub print_chinese_lines_from_buffer
{
my $buffer = shift;
# Maximum number of characters allowed on one line, not counting the line
# break character(s).
my $limit = shift;
# We cannot simply print the first $limit characters from the buffer,
# followed by a line break. There could be embedded Latin words or
# numbers and we do not want to insert a line break in the middle of
# a foreign word.
my @cbuffer = split(//, $buffer);
while(scalar(@cbuffer) >= $limit)
{
my $nprint = 0;
for(my $i = 0; $i <= $#cbuffer; $i++)
{
if($i > $limit && $nprint > 0)
{
last;
}
unless($i < $#cbuffer && $cbuffer[$i] =~ m/[\p{Latin}0-9]/ && $cbuffer[$i+1] =~ m/[\p{Latin}0-9]/)
{
$nprint = $i+1;
}
}
my @out = @cbuffer[0..($nprint-1)];
splice(@cbuffer, 0, $nprint);
print(join('', @out), "\n");
}
$buffer = join('', @cbuffer);
return $buffer;
}
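To summarise the word-boundary wrapping that print_lines_from_buffer implements in the Perl script above, here is an approximate Python rendering (illustration only, not part of the commit, non-Chinese case):

# Approximate sketch: print complete lines of roughly `limit` characters,
# breaking at the last space seen once the scan passes the limit, and return
# the leftover buffer. A single word longer than the limit is flushed whole.
def print_lines_from_buffer(buffer, limit=80):
    while len(buffer) >= limit:
        ilastspace = None
        for i, c in enumerate(buffer):
            if i > limit and ilastspace is not None:
                break
            if c.isspace():
                ilastspace = i
        if ilastspace:                       # break at a word boundary
            print(buffer[:ilastspace])
            buffer = buffer[ilastspace + 1:]
        else:                                # overlong word: flush it whole
            print(buffer)
            buffer = ""
    return buffer

leftover = print_lines_from_buffer("some raw sentence text " * 10)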
def readMCD(mcdFilename) :
    mcd = {}
    for line in open(mcdFilename, "r", encoding="utf8") :
        clean = line.strip()
        if len(line) == 2 or line[0] == '#' :
            continue
        mcd[len(mcd)] = clean
    return mcd, {v: k for k, v in mcd.items()}
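A short usage sketch (hypothetical working directory) showing what readMCD returns when given the conllu.mcd column list shown earlier in this commit:

# Usage sketch, assuming conllu.mcd is the ten-line column list above.
from readMCD import readMCD

conllMCD, conllMCDr = readMCD("conllu.mcd")
print(conllMCD[0])        # -> ID   (column index -> column name)
print(conllMCDr["UPOS"])  # -> 3    (column name  -> column index)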