Skip to content
Snippets Groups Projects
Commit 24005f0f authored by Franck Dary's avatar Franck Dary
Browse files

Added default MCD in filterEmbeddings script and added script to compute word coverage

parent 1d1cf159
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
import sys
from readMCD import readMCD
################################################################################
def printUsageAndExit() :
......@@ -14,19 +15,22 @@ if __name__ == "__main__" :
if len(sys.argv) < 3 :
printUsageAndExit()
baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
vocab = {}
for filename in sys.argv[2:] :
formIndex = None
col2index, index2col = readMCD(baseMCD)
for line in open(filename, "r") :
line = line.strip()
if len(line) == 0 :
continue
if "# global.columns =" in line :
formIndex = line.split('=')[-1].strip().split(' ').index("FORM")
col2index, index2col = readMCD(line.split('=')[-1].strip())
continue
if line[0] == '#' :
continue
word = line.split('\t')[formIndex]
word = line.split('\t')[col2index["FORM"]]
vocab[word] = True
print("Vocabulary size = %d words"%len(vocab), file=sys.stderr)
......
#! /usr/bin/env python3
import sys
import os
from readMCD import readMCD
################################################################################
def printUsageAndExit() :
    """Print the command-line usage on stderr and terminate with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print("USAGE : %s UD_Directory"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    sys.exit(1)
################################################################################
################################################################################
def findCorpora(root) :
    """Locate the train / dev / test CoNLL-U files below a directory.

    The directory is walked recursively. A file is assigned to a split when
    its name contains ".conllu" and the split name; "train" is tested first,
    then "dev", then "test". If several files match the same split, the last
    one visited wins.

    Args:
        root: directory to search.

    Returns:
        A dict with keys "train", "dev" and "test"; each value is the full
        path of the matching file, or None when no file matched.
    """
    corpora = {"train" : None, "dev" : None, "test" : None}
    for dirpath, _, filenames in os.walk(root) :
        for filename in filenames :
            if ".conllu" not in filename :
                continue
            for split in corpora :
                if split in filename :
                    # Keep the directory part: os.walk descends into
                    # sub-directories, so the bare name is not enough to
                    # reopen the file later.
                    corpora[split] = os.path.join(dirpath, filename)
                    break
    return corpora


def readForms(path, defaultMCD) :
    """Yield the FORM column of every token line of a CoNLL-U file.

    Blank lines and comment lines (starting with '#') are skipped. A
    "# global.columns =" comment replaces the column layout for the lines
    that follow it.

    Args:
        path: path of the CoNLL-U file to read.
        defaultMCD: space-separated column names used until the file
            declares its own layout.

    Yields:
        The FORM field of each token line, as a string.
    """
    # readMCD is a project helper; it is used here as returning a pair whose
    # first element maps a column name to its index.
    col2index, _ = readMCD(defaultMCD)
    with open(path, "r", encoding="utf-8") as corpus :
        for line in corpus :
            line = line.strip()
            if "# global.columns =" in line :
                col2index, _ = readMCD(line.split('=')[-1].strip())
                continue
            if len(line) == 0 or line[0] == '#' :
                continue
            yield line.split('\t')[col2index["FORM"]]


def coverage(vocab, forms) :
    """Return the percentage of forms that belong to vocab.

    Args:
        vocab: container of known forms (membership is tested with `in`).
        forms: iterable of forms to check; every occurrence is counted.

    Returns:
        The percentage as a float in [0, 100], or None when forms is empty
        (no percentage can be computed).
    """
    nbWords = 0
    nbIn = 0
    for form in forms :
        nbWords += 1
        if form in vocab :
            nbIn += 1
    if nbWords == 0 :
        return None
    return 100.0*nbIn/nbWords


if __name__ == "__main__" :
    if len(sys.argv) != 2 :
        printUsageAndExit()

    baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

    corpora = findCorpora(sys.argv[1])
    if corpora["train"] is None :
        print("ERROR : train corpus not found", file=sys.stderr)
        sys.exit(1)

    # Vocabulary = set of distinct forms seen in the training corpus.
    vocab = set(readForms(corpora["train"], baseMCD))

    for split in ["dev", "test"] :
        path = corpora[split]
        if path is None :
            continue
        percent = coverage(vocab, readForms(path, baseMCD))
        if percent is None :
            # An empty corpus would otherwise cause a division by zero.
            print("WARNING : no word found in %s"%path, file=sys.stderr)
            continue
        print("%s\t%.2f"%(os.path.basename(path), percent))
################################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment