From 24005f0f2ffb823b73439b91ca18b1776c2a47c3 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Wed, 21 Apr 2021 13:34:01 +0200 Subject: [PATCH] Added default MCD in filterEmbeddings script and added script to compute word coverage --- scripts/filterEmbeddings.py | 10 ++++-- scripts/getWordsCoverage.py | 68 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100755 scripts/getWordsCoverage.py diff --git a/scripts/filterEmbeddings.py b/scripts/filterEmbeddings.py index 83c138a..7dd11e5 100755 --- a/scripts/filterEmbeddings.py +++ b/scripts/filterEmbeddings.py @@ -1,6 +1,7 @@ #! /usr/bin/env python3 import sys +from readMCD import readMCD ################################################################################ def printUsageAndExit() : @@ -14,19 +15,22 @@ if __name__ == "__main__" : if len(sys.argv) < 3 : printUsageAndExit() + baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC" + vocab = {} for filename in sys.argv[2:] : - formIndex = None + col2index, index2col = readMCD(baseMCD) for line in open(filename, "r") : line = line.strip() if len(line) == 0 : continue if "# global.columns =" in line : - formIndex = line.split('=')[-1].strip().split(' ').index("FORM") + col2index, index2col = readMCD(line.split('=')[-1].strip()) + continue if line[0] == '#' : continue - word = line.split('\t')[formIndex] + word = line.split('\t')[col2index["FORM"]] vocab[word] = True print("Vocabulary size = %d words"%len(vocab), file=sys.stderr) diff --git a/scripts/getWordsCoverage.py b/scripts/getWordsCoverage.py new file mode 100755 index 0000000..74f5d22 --- /dev/null +++ b/scripts/getWordsCoverage.py @@ -0,0 +1,68 @@ +#! 
#! /usr/bin/env python3
"""Print how much of the dev/test vocabulary of a corpus is seen in train.

Usage: getWordsCoverage.py UD_Directory

The directory is searched for files whose name contains ".conllu" together
with "train", "dev" or "test". Every word form of the train corpus is
collected, then for the dev and test corpora (when found) one line is
printed on stdout: the file name, a tab, and the percentage of its tokens
whose form also occurs in train.
"""

import sys
import os
from readMCD import readMCD

################################################################################
def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s UD_Directory"%sys.argv[0], file=sys.stderr)
    sys.exit(1)
################################################################################

################################################################################
def _findCorpora(root):
    """Return the paths of the (train, dev, test) corpora found under root.

    A slot is None when no matching file exists. When several files match the
    same slot, the last one visited by os.walk wins.
    """
    found = {"train": None, "dev": None, "test": None}
    for dirPath, _, fileNames in os.walk(root):
        for fileName in fileNames:
            if ".conllu" not in fileName:
                continue
            # First match wins, in this order: train, then dev, then test.
            for split in ("train", "dev", "test"):
                if split in fileName:
                    # Keep the directory part: os.walk also descends into
                    # subdirectories, so the bare name is not enough to
                    # reopen the file later.
                    found[split] = os.path.join(dirPath, fileName)
                    break
    return found["train"], found["dev"], found["test"]
################################################################################

################################################################################
def _readForms(path, baseMCD):
    """Yield the FORM column of every token line of the corpus at path.

    Columns are laid out as described by baseMCD until a
    "# global.columns =" comment line redefines them. Empty lines and other
    comment lines are skipped.
    """
    col2index, _ = readMCD(baseMCD)
    with open(path, "r", encoding="utf-8") as corpus:
        for line in corpus:
            line = line.strip()
            if "# global.columns =" in line:
                col2index, _ = readMCD(line.split('=')[-1].strip())
                continue
            if len(line) == 0 or line[0] == '#':
                continue
            yield line.split('\t')[col2index["FORM"]]
################################################################################

################################################################################
if __name__ == "__main__":
    if len(sys.argv) != 2:
        printUsageAndExit()

    # Column layout assumed until a file declares its own one.
    baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

    train, dev, test = _findCorpora(sys.argv[1])

    if train is None:
        print("ERROR : train corpus not found", file=sys.stderr)
        sys.exit(1)

    vocab = set(_readForms(train, baseMCD))

    for corpus in [dev, test]:
        # A missing dev or test corpus is not an error: just report the other.
        if corpus is None:
            continue
        nbWords = 0
        nbIn = 0
        for form in _readForms(corpus, baseMCD):
            nbWords += 1
            if form in vocab:
                nbIn += 1
        # Avoid dividing by zero on a corpus without any token line.
        if nbWords == 0:
            print("WARNING : no word found in %s"%corpus, file=sys.stderr)
            continue
        print("%s\t%.2f"%(os.path.basename(corpus), 100.0*nbIn/nbWords))
################################################################################