"get_train_val.py" did not exist on "2afd11a99af6e2d0af26bc72b3893d9c88c51585"
Select Git revision
-
Mohyeddine2 authoredMohyeddine2 authored
conlluAddMissingColumns.py 1.56 KiB
#! /usr/bin/python3
import sys
def printUsageAndExit():
    """Print the command-line usage on stderr and stop the program.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print("USAGE : %s file.conllu mcd" % sys.argv[0], file=sys.stderr)
    # sys.exit is used instead of the bare exit() builtin: exit() is injected
    # by the site module for interactive sessions and is not available when
    # the interpreter is started without it (e.g. `python -S`).
    sys.exit(1)
def readMCD(mcdFilename):
    """Read a column-description (MCD) file into a dictionary.

    Every meaningful line must hold exactly two whitespace-separated fields:
    a column index followed by a column name. Blank lines and lines whose
    first non-blank character is '#' are skipped.

    Args:
        mcdFilename: path of the MCD file, read as UTF-8.

    Returns:
        A dict mapping each column index (kept as a string) to its name.

    Raises:
        SystemExit: with status 1, after printing a message on stderr, when
            a line does not contain exactly two fields.
    """
    mcd = {}
    # The context manager guarantees the file is closed, including on the
    # early-exit error path below.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            clean = line.strip()
            # Test the stripped text so that whitespace-only lines and
            # indented comments are ignored instead of being reported as
            # invalid entries.
            if not clean or clean.startswith('#'):
                continue
            # Split on any run of whitespace so tabs or repeated spaces
            # between the two fields are accepted.
            fields = clean.split()
            if len(fields) != 2:
                print("ERROR : invalid mcd line \'%s\'. Aborting" % clean, file=sys.stderr)
                sys.exit(1)
            index, name = fields
            mcd[index] = name
    return mcd
def fillMissingColumns(columns, conllMCD, conllMCDr):
    """Return the columns of one token line with every empty cell filled in.

    The line is first padded with empty cells so that every column index
    declared in the MCD exists. Each empty cell is then replaced:

    * in the column the MCD names "GOV", by a value derived from the token's
      ID cell: "0" when the ID is "1", "_" when the ID contains a '-',
      and "1" otherwise;
    * in any other column (including columns the MCD does not declare),
      by "_".

    Non-empty cells are returned unchanged.

    Args:
        columns: the tab-separated cells of the line.
        conllMCD: dict mapping column index (string) to column name.
        conllMCDr: the reverse dict, column name to column index (string).

    Returns:
        A new list of cells; the input list is not modified.

    Raises:
        KeyError: if a GOV cell has to be inferred but the MCD declares no
            "ID" column.
    """
    padded = list(columns)
    for col in conllMCD:
        missing = int(col) + 1 - len(padded)
        if missing > 0:
            padded.extend([""] * missing)

    completed = []
    for index, cell in enumerate(padded):
        if cell:
            completed.append(cell)
        # .get() rather than [] : an empty cell may sit at an index the MCD
        # does not describe, which must yield "_" instead of a KeyError.
        elif conllMCD.get(str(index)) == "GOV":
            tokenId = padded[int(conllMCDr["ID"])]
            if tokenId == "1":
                completed.append("0")
            elif '-' in tokenId:
                completed.append("_")
            else:
                completed.append("1")
        else:
            completed.append("_")
    return completed


if __name__ == "__main__":
    if len(sys.argv) != 3:
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    conllMCDr = {v: k for k, v in conllMCD.items()}

    lastWasEmpty = False
    # Read the input explicitly as UTF-8 (as readMCD does for the MCD file)
    # rather than relying on the locale's default encoding, and make sure the
    # handle is closed.
    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for line in conlluFile:
            stripped = line.strip()
            # Lines shorter than two characters once stripped are treated as
            # separators and echoed as they are.
            lastWasEmpty = len(stripped) < 2
            if lastWasEmpty or line[0] == '#':
                print(stripped)
                continue
            print("\t".join(fillMissingColumns(stripped.split('\t'), conllMCD, conllMCDr)))

    # Make sure the output ends with an empty line.
    if not lastWasEmpty:
        print("")