#! /usr/bin/env python3
"""Report how much of the dev/test vocabulary is covered by the train corpus.

The script looks for ``.conllu`` files whose names contain ``train``, ``dev``
or ``test`` under the directory given on the command line, collects every
FORM value of the train file, and prints, for the dev and test files, the
percentage of their FORM occurrences that also appear in the train file.
"""

import sys
import os

from readMCD import readMCD

################################################################################
def printUsageAndExit() :
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s UD_Directory"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is missing when the
    # interpreter is started with -S.
    sys.exit(1)
################################################################################

################################################################################
def _findCorpora(root) :
    """Locate the train, dev and test corpora under ``root``.

    Walks ``root`` recursively and returns a ``(train, dev, test)`` tuple of
    paths; an entry is None when no matching file exists. A file matches when
    its name contains ".conllu" and the split name; "train" takes precedence
    over "dev", which takes precedence over "test". When several files match
    the same split, the last one visited wins.
    """
    found = {"train" : None, "dev" : None, "test" : None}

    for dirPath, _, fileNames in os.walk(root) :
        for fileName in fileNames :
            if ".conllu" not in fileName :
                continue
            for split in ("train", "dev", "test") :
                if split in fileName :
                    # Keep the directory: os.walk descends into
                    # sub-directories, so the bare name is not enough to
                    # reopen the file later.
                    found[split] = os.path.join(dirPath, fileName)
                    break

    return found["train"], found["dev"], found["test"]
################################################################################

################################################################################
def _readForms(path, baseMCD) :
    """Yield the FORM column of every token line of the file at ``path``.

    ``baseMCD`` is the column description used until the file overrides it
    with a "# global.columns =" line. Empty lines and other comment lines are
    skipped.
    """
    # NOTE(review): readMCD is assumed to return (col2index, index2col) with
    # col2index mapping a column name to its position - confirm in readMCD.
    col2index, _ = readMCD(baseMCD)

    with open(path, "r", encoding="utf-8") as corpus :
        for line in corpus :
            line = line.strip()
            if "# global.columns =" in line :
                col2index, _ = readMCD(line.split('=')[-1].strip())
                continue
            if len(line) == 0 or line[0] == '#' :
                continue
            yield line.split('\t')[col2index["FORM"]]
################################################################################

################################################################################
def main() :
    """Print one "<file name>\\t<coverage %>" line per dev/test corpus found.

    Exits with status 1 when the argument count is wrong or when no train
    corpus is found.
    """
    if len(sys.argv) != 2 :
        printUsageAndExit()

    # Column layout assumed when a file has no "# global.columns =" line.
    baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

    train, dev, test = _findCorpora(sys.argv[1])

    if train is None :
        print("ERROR : train corpus not found", file=sys.stderr)
        sys.exit(1)

    vocab = set(_readForms(train, baseMCD))

    for path in [dev, test] :
        if path is None :
            continue

        nbWords = 0
        nbIn = 0
        for form in _readForms(path, baseMCD) :
            nbWords += 1
            if form in vocab :
                nbIn += 1

        # A file without any token line has no defined coverage; report it
        # instead of dividing by zero.
        if nbWords == 0 :
            print("WARNING : no word found in %s"%path, file=sys.stderr)
            continue

        print("%s\t%.2f"%(os.path.basename(path), 100.0*nbIn/nbWords))
################################################################################

################################################################################
if __name__ == "__main__" :
    main()
################################################################################