Skip to content
Snippets Groups Projects
Commit 24005f0f authored by Franck Dary
Browse files

Added default MCD in filterEmbeddings script and added script to compute word coverage

parent 1d1cf159
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3 #! /usr/bin/env python3
import sys import sys
from readMCD import readMCD
################################################################################ ################################################################################
def printUsageAndExit() : def printUsageAndExit() :
...@@ -14,19 +15,22 @@ if __name__ == "__main__" : ...@@ -14,19 +15,22 @@ if __name__ == "__main__" :
if len(sys.argv) < 3 : if len(sys.argv) < 3 :
printUsageAndExit() printUsageAndExit()
baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
vocab = {} vocab = {}
for filename in sys.argv[2:] : for filename in sys.argv[2:] :
formIndex = None col2index, index2col = readMCD(baseMCD)
for line in open(filename, "r") : for line in open(filename, "r") :
line = line.strip() line = line.strip()
if len(line) == 0 : if len(line) == 0 :
continue continue
if "# global.columns =" in line : if "# global.columns =" in line :
formIndex = line.split('=')[-1].strip().split(' ').index("FORM") col2index, index2col = readMCD(line.split('=')[-1].strip())
continue
if line[0] == '#' : if line[0] == '#' :
continue continue
word = line.split('\t')[formIndex] word = line.split('\t')[col2index["FORM"]]
vocab[word] = True vocab[word] = True
print("Vocabulary size = %d words"%len(vocab), file=sys.stderr) print("Vocabulary size = %d words"%len(vocab), file=sys.stderr)
......
#! /usr/bin/env python3
import sys
import os
from readMCD import readMCD
################################################################################
def printUsageAndExit() :
    """Print the command-line usage on stderr and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    print("USAGE : %s UD_Directory"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # `site` module for interactive use and is absent under `python -S`.
    sys.exit(1)
################################################################################
################################################################################
def _findCorpora(root) :
    """Locate the train / dev / test CoNLL-U files anywhere below `root`.

    A file is selected when its name contains ".conllu" and one of the split
    names; the split names are tried in the order train, dev, test and the
    first one that matches wins. When several files match the same split, the
    last one visited by os.walk is kept.

    Args:
        root: directory to search recursively.

    Returns:
        dict mapping "train", "dev" and "test" to the full path of the
        matching file, or to None when no file was found for that split.
    """
    corpora = {"train" : None, "dev" : None, "test" : None}
    for dirpath, _, filenames in os.walk(root) :
        for filename in filenames :
            if ".conllu" not in filename :
                continue
            for split in ("train", "dev", "test") :
                if split in filename :
                    # Keep the directory os.walk found the file in, so that
                    # corpora stored in sub-directories can be opened later.
                    corpora[split] = os.path.join(dirpath, filename)
                    break
    return corpora
################################################################################

################################################################################
def _readForms(path, baseMCD) :
    """Yield the FORM column of every token line of the CoNLL-U file `path`.

    Columns are located with `baseMCD` until a "# global.columns =" comment is
    met, after which the layout declared by that comment is used instead.
    Empty lines and other comment lines are skipped.

    Args:
        path: path of the CoNLL-U file to read.
        baseMCD: space-separated column names assumed by default.

    Yields:
        The FORM field of each token line, in file order.
    """
    # readMCD is unpacked as (col2index, index2col); only the name -> index
    # mapping is needed here.
    col2index, _ = readMCD(baseMCD)
    # NOTE(review): CoNLL-U is expected to be UTF-8 - decode explicitly
    # instead of relying on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as corpus :
        for line in corpus :
            line = line.strip()
            if "# global.columns =" in line :
                col2index, _ = readMCD(line.split('=')[-1].strip())
                continue
            if len(line) == 0 or line[0] == '#' :
                continue
            yield line.split('\t')[col2index["FORM"]]
################################################################################

################################################################################
def _countCovered(forms, vocab) :
    """Count how many of `forms` belong to `vocab`.

    Args:
        forms: iterable of word forms (occurrences, not types).
        vocab: container of known forms supporting the `in` operator.

    Returns:
        Tuple (nbIn, nbWords): number of forms found in `vocab`, and total
        number of forms seen.
    """
    nbWords = 0
    nbIn = 0
    for form in forms :
        nbWords += 1
        if form in vocab :
            nbIn += 1
    return nbIn, nbWords
################################################################################

################################################################################
if __name__ == "__main__" :
    if len(sys.argv) != 2 :
        printUsageAndExit()

    baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

    corpora = _findCorpora(sys.argv[1])

    if corpora["train"] is None :
        print("ERROR : train corpus not found", file=sys.stderr)
        sys.exit(1)

    # The vocabulary is the set of distinct forms seen in the train corpus.
    vocab = set(_readForms(corpora["train"], baseMCD))

    # Report, for each available evaluation corpus, the percentage of its word
    # occurrences whose form appears in the train vocabulary.
    for split in ("dev", "test") :
        path = corpora[split]
        if path is None :
            continue
        nbIn, nbWords = _countCovered(_readForms(path, baseMCD), vocab)
        if nbWords == 0 :
            # Coverage is undefined for an empty corpus; avoid dividing by 0.
            print("WARNING : no word found in %s"%path, file=sys.stderr)
            continue
        print("%s\t%.2f"%(os.path.basename(path), 100.0*nbIn/nbWords))
################################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment