Commit c00fe2d1 authored by Franck Dary

Initial commit

UD_ROOT=~/Documents/ud/ud-treebanks-v2.5/UD_French-GSD/
*\.ts
*fplm
fP
ambiguities.txt
include ../config
SCRIPTS=../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
MCD=conllu.mcd
TRAIN_FILES=$(shell find $(UD_ROOT) -type f -name '*train*.conllu')
DEV_FILES=$(shell find $(UD_ROOT) -type f -name '*dev*.conllu')
TEST_FILES=$(shell find $(UD_ROOT) -type f -name '*test*.conllu')
# This part is for lemmatizer rules and exceptions computation
THRESHOLD=10
FPLM_FILENAME=fplm
RULES_FILENAME=lemmatizer_rules.ts
EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME)
	rm col_*\.txt
	rm all_no_test.conllu
all_no_test.conllu:
	cat $(TRAIN_FILES) > $@
tokenizer.ts: all_no_test.conllu $(MCD)
	echo "Default : IGNORECHAR" > $@
	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
	echo "ENDWORD" >> $@
	echo "ADDCHARTOWORD" >> $@
segmenter.ts:
	echo "EOS b.0" > $@
	echo "REWRITE b.0 EOS _" >> $@
columns: all_no_test.conllu $(MCD)
	for number in 1 2 3 4 5 6 7 8 9 10 ; do \
		cat all_no_test.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \
	done
	./getTransitionSets.py $(MCD) col_*\.txt
texts:
	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
$(FPLM_FILENAME): all_no_test.conllu $(MCD)
	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
$(RULES_FILENAME): $(FPLM_FILENAME)
	macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD)
	cat tmp.txt | sed s/^/RULE\ LEMMA\ ON\ FORM\ /g | sed s/RULE\ LEMMA\ ON\ FORM\ @@$$/Default\ :\ \ RULE\ LEMMA\ ON\ FORM\ @@/g > $@
	rm tmp.txt
	echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts
clean:
	- rm *\.txt
	- rm *\.conll*
	- rm *\.ts
	- rm $(RULES_FILENAME)
	- rm $(EXCEPTIONS_FPLM_FILENAME)
	- rm $(FPLM_FILENAME)
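For clarity, here is a rough Python equivalent (illustration only, not part of the commit) of what the "columns" target above does with its shell loop: write the sorted unique values of each of the ten CoNLL-U columns to col_<n>.txt, skipping comment lines. Filenames mirror the Makefile; the exact sort order of the shell version may differ.

# Rough Python sketch of the "columns" target above (assumed input all_no_test.conllu).
values = [set() for _ in range(10)]
for line in open("all_no_test.conllu", encoding="utf-8"):
    if line.startswith("#") or not line.strip():
        continue
    for i, cell in enumerate(line.rstrip("\n").split("\t")[:10]):
        values[i].add(cell)
for i, vals in enumerate(values, start=1):
    with open("col_%d.txt" % i, "w", encoding="utf-8") as out:
        out.write("\n".join(sorted(vals)) + "\n")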
ID
FORM
LEMMA
UPOS
XPOS
FEATS
HEAD
DEPREL
DEPS
MISC
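The ten names above are the standard CoNLL-U columns, in the order the scripts in this commit expect. As a quick illustration (a toy token line, not taken from UD_French-GSD), this is how one CoNLL-U line maps onto those columns:

# Toy illustration: one CoNLL-U token line split into the ten columns listed in conllu.mcd above.
columns = ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]
line = "1\tChat\tchat\tNOUN\t_\tGender=Masc|Number=Sing\t0\troot\t_\t_"
print(dict(zip(columns, line.split("\t"))))
# {'ID': '1', 'FORM': 'Chat', 'LEMMA': 'chat', 'UPOS': 'NOUN', ...}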
#! /usr/bin/python3
# Convert each CoNLL-U file given on the command line into a raw .txt file next
# to it, by running the CoNLL-U-to-text converter passed as first argument.
import sys
import os
import subprocess
def printUsageAndExit() :
    print("USAGE : %s conll_2_text.pl file1.conllu file2.conllu..."%sys.argv[0], file=sys.stderr)
    exit(1)
if __name__ == "__main__" :
    if len(sys.argv) < 3 :
        printUsageAndExit()
    for pathToFile in sys.argv[2:] :
        splited = os.path.splitext(pathToFile)
        target = splited[0] + ".txt"
        targetFile = open(target, "w")
        command = sys.argv[1] + " " + pathToFile
        p = subprocess.Popen(command, stdout=targetFile, stderr=sys.stderr, shell=True)
        p.wait()
#! /usr/bin/python3
# For each col_N.txt file (the unique values of corpus column N), write the
# matching transition-set file: tagger.ts for UPOS, taggerx.ts for XPOS,
# morpho_whole.ts / morpho_parts.ts for FEATS, parser_legacy.ts / parser.ts for DEPREL.
import sys
sys.path.insert(1, '../../scripts')
from readMCD import readMCD
def printUsageAndExit() :
    print("USAGE : %s mcd column_1.txt columns_2.txt..."%sys.argv[0], file=sys.stderr)
    exit(1)
if __name__ == "__main__" :
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) < 3 :
        printUsageAndExit()
    conllMCD, conllMCDr = readMCD(sys.argv[1])
    for colFile in sys.argv[2:] :
        numCol = int(colFile.split('.')[0].split('_')[-1]) -1
        if not numCol in conllMCD :
            continue
        nameCol = conllMCD[numCol]
        if nameCol == "UPOS" :
            output = open("tagger.ts", 'w', encoding='utf-8')
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                print("WRITE b.0 UPOS " + striped, file=output)
            output.close()
        elif nameCol == "XPOS" :
            output = open("taggerx.ts", 'w', encoding='utf-8')
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                print("WRITE b.0 XPOS " + striped, file=output)
            output.close()
        elif nameCol == "FEATS" :
            output = open("morpho_whole.ts", 'w', encoding='utf-8')
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                print("WRITE b.0 FEATS " + striped, file=output)
            output.close()
            output = open("morpho_parts.ts", 'w', encoding='utf-8')
            allParts = set()
            allPartsList = []
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 :
                    continue
                parts = striped.split('|')
                for part in parts :
                    allParts.add(part)
            for part in allParts :
                allPartsList.append(part)
            allPartsList.sort()
            for part in allPartsList :
                print("ADD b.0 FEATS " + part, file=output)
            print("Default : NOTHING", file=output)
            output.close()
        elif nameCol == "DEPREL" :
            output = open("parser_legacy.ts", 'w', encoding='utf-8')
            print("REDUCE", file=output)
            labels = set()
            labelsList = []
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 or striped == "root" or striped == "_" :
                    continue
                label = striped.split(':')[0]
                if label not in labels :
                    labels.add(striped)
                    labelsList.append(striped)
            labelsList.sort()
            for label in labelsList :
                print("LEFT " + label, file=output)
                print("RIGHT " + label, file=output)
            print("EOS s.0", file=output)
            print("Default : SHIFT", file=output)
            output.close()
            output = open("parser.ts", 'w', encoding='utf-8')
            print("REDUCE", file=output)
            labels = set()
            labelsList = []
            for line in open(colFile, "r", encoding='utf-8') :
                striped = line.strip()
                if len(striped) == 0 or striped == "root" or striped == "_" :
                    continue
                label = striped.split(':')[0]
                if label not in labels :
                    labels.add(striped)
                    labelsList.append(striped)
            labelsList.sort()
            for label in labelsList :
                print("LEFT " + label, file=output)
                print("RIGHT " + label, file=output)
            print("Default : SHIFT", file=output)
            output.close()
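To make the output format concrete, here is a minimal sketch (toy tag list and a hypothetical file name, not produced by the commit) of what the UPOS branch above writes:

# Minimal sketch with toy data: mirrors the UPOS branch of getTransitionSets.py.
tags = ["ADJ", "DET", "NOUN", "VERB"]
with open("tagger_example.ts", "w", encoding="utf-8") as output:
    for tag in tags:
        # one WRITE action per part-of-speech value found in the corpus
        print("WRITE b.0 UPOS " + tag, file=output)
# tagger_example.ts now contains lines such as "WRITE b.0 UPOS NOUN".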
Name : Tagger Machine
Strategy : sequential
Classifier : tagger MLP(500,RELU,0.3) data/tagger.ts
__pycache__
#! /usr/bin/python3
# Extract an fplm lexicon (FORM, UPOS, LEMMA, FEATS, tab-separated) from a
# CoNLL-U file; within each group of near-identical entries (see
# sameLineWithoutLemma) only the most frequent one is printed.
import sys
from readMCD import readMCD
def printUsageAndExit() :
    print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr)
    exit(1)
def sameLineWithoutLemma(l1, l2) :
    l1s = l1.split('\t')
    l2s = l2.split('\t')
    return (l1s[:-3],l1s[-1]) == (l2s[:-3],l2s[-1])
if __name__ == "__main__" :
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) != 3 :
        printUsageAndExit()
    conllMCD, conllMCDr = readMCD(sys.argv[2])
    entriesCount = {}
    entriesList = []
    for line in open(sys.argv[1], "r", encoding="utf8") :
        if len(line.strip()) < 3 :
            continue
        if line.strip()[0] == '#' :
            continue
        columns = line.strip().split('\t')
        if len(columns[int(conllMCDr["ID"])].split('-')) > 1 :
            continue
        entry = ""
        for col in ["FORM", "UPOS", "LEMMA", "FEATS"] :
            entry = entry + columns[int(conllMCDr[col])] + '\t'
        entry = entry[:-1]
        if entry not in entriesCount :
            entriesCount[entry] = 1
        else :
            entriesCount[entry] = 1+entriesCount[entry]
    for entry in entriesCount :
        entriesList.append(entry)
    entriesList.sort()
    i = 0
    while i < len(entriesList) :
        maxCount = 0
        maxIndex = 0
        j = i
        while j < len(entriesList) and sameLineWithoutLemma(entriesList[i], entriesList[j]) :
            if entriesCount[entriesList[j]] > maxCount :
                maxCount = entriesCount[entriesList[j]]
                maxIndex = j
            j = j+1
        print("%s"%(entriesList[maxIndex]))
        i = j
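The selection step above can be summarised with a small sketch (toy entries and hypothetical counts, not taken from the treebank): among the entries that sameLineWithoutLemma groups together, only the most frequent one is kept.

# Toy sketch of the deduplication above: entries follow the
# FORM<TAB>UPOS<TAB>LEMMA<TAB>FEATS layout built in the main loop.
from collections import Counter

entriesCount = Counter({
    "suis\tAUX\têtre\t_": 42,      # hypothetical counts
    "suis\tVERB\tsuivre\t_": 3,
})
# within one group, the entry seen most often wins
best = max(entriesCount, key=entriesCount.get)
print(best)   # -> "suis\tAUX\têtre\t_"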
#! /usr/bin/python3
# Compute SPLITWORD rules for the tokenizer: for every multi-word token (an ID
# range such as 2-3), record how its surface form splits into the forms of the
# words it covers.
import sys
from readMCD import readMCD
rules = {}
prefix = "SPLITWORD "
def printUsageAndExit() :
    print("USAGE : %s file.conllu conllu.mcd"%sys.argv[0], file=sys.stderr)
    exit(1)
def computeRules(sentence) :
    wordById = {}
    for word in sentence :
        splited = word[0].split("-")
        if len(splited) > 1 :
            continue
        wordById[word[0]] = word[1]
    for word in sentence :
        splited = word[0].split("-")
        if len(splited) > 1 :
            rule = ""
            for id in range(int(splited[0]),int(splited[-1])+1) :
                rule += "@" + wordById[str(id)]
            if word[1] in rules :
                if rule in rules[word[1]] :
                    rules[word[1]][rule] += 1
                else :
                    rules[word[1]][rule] = 1
            else :
                rules[word[1]] = {}
                rules[word[1]][rule] = 1
def main() :
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) != 3 :
        printUsageAndExit()
    conllMCD, conllMCDr = readMCD(sys.argv[2])
    idId = int(conllMCDr["ID"])
    idForm = int(conllMCDr["FORM"])
    sentence = []
    for line in open(sys.argv[1], "r", encoding="utf8") :
        if len(line.strip()) < 2 or line[0] == '#' :
            if len(sentence) > 0 :
                computeRules(sentence)
            sentence = []
            continue
        splited = line.strip().split('\t')
        sentence += [[splited[idId], splited[idForm]]]
    for word in rules :
        if len(rules[word]) > 1 :
            print("WARNING : Ambiguity detected in \'%s\'"%(word+" "+str(rules[word])), file=sys.stderr)
        toPrint = []
        for rule in rules[word] :
            toPrint.append([len(rule.split('@')), prefix+word+rule])
        toPrint.sort(reverse=True)
        for rule in toPrint :
            print(rule[1])
if __name__ == "__main__" :
    main()
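For reference, a minimal sketch (toy sentence, using the French contraction "du" = "de" + "le") of the rule format that computeRules() above emits for one multi-word token:

# Toy sketch of one SPLITWORD rule, built the same way as in computeRules() above.
wordById = {"2": "de", "3": "le"}   # forms of the words covered by the MWT
splited = "2-3".split("-")          # ID range of the multi-word token "du"
rule = ""
for id in range(int(splited[0]), int(splited[-1]) + 1):
    rule += "@" + wordById[str(id)]
print("SPLITWORD " + "du" + rule)   # -> SPLITWORD du@de@le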
#!/usr/bin/env perl
# Extracts raw text from CoNLL-U file. Uses newdoc and newpar tags when available.
# Copyright © 2017 Dan Zeman <zeman@ufal.mff.cuni.cz>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use Getopt::Long;
# Language code 'zh' or 'ja' will trigger Chinese-like text formatting.
my $language = 'en';
GetOptions
(
'language=s' => \$language
);
my $chinese = $language =~ m/^(zh|ja|lzh|yue)(_|$)/;
my $text = ''; # from the text attribute of the sentence
my $ftext = ''; # from the word forms of the tokens
my $newpar = 0;
my $newdoc = 0;
my $buffer = '';
my $start = 1;
my $mwtlast;
while(<>)
{
if(m/^\#\s*text\s*=\s*(.+)/)
{
$text = $1;
}
elsif(m/^\#\s*newpar(\s|$)/i)
{
$newpar = 1;
}
elsif(m/^\#\s*newdoc(\s|$)/i)
{
$newdoc = 1;
}
elsif(m/^\d+-(\d+)\t/)
{
$mwtlast = $1;
my @f = split(/\t/, $_);
# Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.)
# The first token of the new paragraph has "NewPar=Yes" in the MISC column.
# Multi-word tokens have this in the token-introducing line.
if($f[9] =~ m/NewPar=Yes/i)
{
# Empty line between documents and paragraphs. (There may have been
# a paragraph break before the first part of this sentence as well!)
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
$buffer .= $ftext;
# Line breaks at word boundaries after at most 80 characters.
$buffer = print_lines_from_buffer($buffer, 80, $chinese);
print("$buffer\n\n");
$buffer = '';
# Start is only true until we write the first sentence of the input stream.
$start = 0;
$newdoc = 0;
$newpar = 0;
$text = '';
$ftext = '';
}
$ftext .= $f[1];
$ftext .= ' ' unless($f[9] =~ m/SpaceAfter=No/);
}
elsif(m/^(\d+)\t/ && !(defined($mwtlast) && $1<=$mwtlast))
{
$mwtlast = undef;
my @f = split(/\t/, $_);
# Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.)
# The first token of the new paragraph has "NewPar=Yes" in the MISC column.
# Multi-word tokens have this in the token-introducing line.
if($f[9] =~ m/NewPar=Yes/i)
{
# Empty line between documents and paragraphs. (There may have been
# a paragraph break before the first part of this sentence as well!)
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
$buffer .= $ftext;
# Line breaks at word boundaries after at most 80 characters.
$buffer = print_lines_from_buffer($buffer, 80, $chinese);
print("$buffer\n\n");
$buffer = '';
# Start is only true until we write the first sentence of the input stream.
$start = 0;
$newdoc = 0;
$newpar = 0;
$text = '';
$ftext = '';
}
$ftext .= $f[1];
$ftext .= ' ' unless($f[9] =~ m/SpaceAfter=No/);
}
elsif(m/^\s*$/)
{
# In a valid CoNLL-U file, $text should be equal to $ftext except for the
# space after the last token. However, if there have been intra-sentential
# paragraph breaks, $ftext contains only the part after the last such
# break, and $text is empty. Hence we currently use $ftext everywhere
# and ignore $text, even though we note it when seeing the text attribute.
# $text .= ' ' unless($chinese);
# Empty line between documents and paragraphs.
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer);
$buffer .= $ftext;
# Line breaks at word boundaries after at most 80 characters.
$buffer = print_lines_from_buffer($buffer, 80, $chinese);
# Start is only true until we write the first sentence of the input stream.
$start = 0;
$newdoc = 0;
$newpar = 0;
$text = '';
$ftext = '';
$mwtlast = undef;
}
}
# There may be unflushed buffer contents after the last sentence, less than 80 characters
# (otherwise we would have already dealt with it), so just flush it.
if($buffer ne '')
{
print("$buffer\n");
}
#------------------------------------------------------------------------------
# Checks whether we have to print an extra line to separate paragraphs. Does it
# if necessary. Returns the updated buffer.
#------------------------------------------------------------------------------
sub print_new_paragraph_if_needed
{
my $start = shift;
my $newdoc = shift;
my $newpar = shift;
my $buffer = shift;
if(!$start && ($newdoc || $newpar))
{
if($buffer ne '')
{
print("$buffer\n");
$buffer = '';
}
print("\n");
}
return $buffer;
}
#------------------------------------------------------------------------------
# Prints as many complete lines of text as there are in the buffer. Returns the
# remaining contents of the buffer.
#------------------------------------------------------------------------------
sub print_lines_from_buffer
{
my $buffer = shift;
# Maximum number of characters allowed on one line, not counting the line
# break character(s), which also replace any number of trailing spaces.
# Exception: If there is a word longer than the limit, it will be printed
# on one line.
# Note that this algorithm is not suitable for Chinese and Japanese.
my $limit = shift;
# We need a different algorithm for Chinese and Japanese.
my $chinese = shift;
if($chinese)
{
return print_chinese_lines_from_buffer($buffer, $limit);
}
if(length($buffer) >= $limit)
{
my @cbuffer = split(//, $buffer);
# There may be more than one new line waiting in the buffer.
while(scalar(@cbuffer) >= $limit)
{
###!!! We could make it simpler if we ignored multi-space sequences
###!!! between words. It sounds OK to ignore them because at the
###!!! line break we do not respect original spacing anyway.
my $i;
my $ilastspace;
for($i = 0; $i<=$#cbuffer; $i++)
{
if($i>$limit && defined($ilastspace))
{
last;
}
if($cbuffer[$i] =~ m/\s/)
{
$ilastspace = $i;
}
}
if(defined($ilastspace) && $ilastspace>0)
{
my @out = @cbuffer[0..($ilastspace-1)];
splice(@cbuffer, 0, $ilastspace+1);
print(join('', @out), "\n");
}
else
{
print(join('', @cbuffer), "\n");
splice(@cbuffer);
}
}
$buffer = join('', @cbuffer);
}
return $buffer;
}
#------------------------------------------------------------------------------
# Prints as many complete lines of text as there are in the buffer. Returns the
# remaining contents of the buffer. Assumes that there are no spaces between
# words and lines can be broken between any two characters, as is the custom in
# Chinese and Japanese.
#------------------------------------------------------------------------------
sub print_chinese_lines_from_buffer
{
my $buffer = shift;
# Maximum number of characters allowed on one line, not counting the line
# break character(s).
my $limit = shift;
# We cannot simply print the first $limit characters from the buffer,
# followed by a line break. There could be embedded Latin words or
# numbers and we do not want to insert a line break in the middle of
# a foreign word.
my @cbuffer = split(//, $buffer);
while(scalar(@cbuffer) >= $limit)
{
my $nprint = 0;
for(my $i = 0; $i <= $#cbuffer; $i++)
{
if($i > $limit && $nprint > 0)
{
last;
}
unless($i < $#cbuffer && $cbuffer[$i] =~ m/[\p{Latin}0-9]/ && $cbuffer[$i+1] =~ m/[\p{Latin}0-9]/)
{
$nprint = $i+1;
}
}
my @out = @cbuffer[0..($nprint-1)];
splice(@cbuffer, 0, $nprint);
print(join('', @out), "\n");
}
$buffer = join('', @cbuffer);
return $buffer;
}
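To summarise the word-boundary wrapping that print_lines_from_buffer implements in the Perl script above, here is an approximate Python rendering (illustration only, not part of the commit, non-Chinese case):

# Approximate sketch: print complete lines of roughly `limit` characters,
# breaking at the last space seen once the scan passes the limit, and return
# the leftover buffer. A single word longer than the limit is flushed whole.
def print_lines_from_buffer(buffer, limit=80):
    while len(buffer) >= limit:
        ilastspace = None
        for i, c in enumerate(buffer):
            if i > limit and ilastspace is not None:
                break
            if c.isspace():
                ilastspace = i
        if ilastspace:                       # break at a word boundary
            print(buffer[:ilastspace])
            buffer = buffer[ilastspace + 1:]
        else:                                # overlong word: flush it whole
            print(buffer)
            buffer = ""
    return buffer

leftover = print_lines_from_buffer("some raw sentence text " * 10)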
def readMCD(mcdFilename) :
    mcd = {}
    for line in open(mcdFilename, "r", encoding="utf8") :
        clean = line.strip()
        if len(line) == 2 or line[0] == '#' :
            continue
        mcd[len(mcd)] = clean
    return mcd, {v: k for k, v in mcd.items()}
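A short usage sketch (hypothetical working directory) showing what readMCD returns when given the conllu.mcd column list shown earlier in this commit:

# Usage sketch, assuming conllu.mcd is the ten-line column list above.
from readMCD import readMCD

conllMCD, conllMCDr = readMCD("conllu.mcd")
print(conllMCD[0])        # -> ID   (column index -> column name)
print(conllMCDr["UPOS"])  # -> 3    (column name  -> column index)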