diff --git a/scripts/concatW2V.py b/scripts/concatW2V.py
index 024459e032f090b5006bc585bfb22a90f3c89bce..8100d988d95e2fd9e84535d735e0d827aaa72df1 100755
--- a/scripts/concatW2V.py
+++ b/scripts/concatW2V.py
@@ -2,17 +2,32 @@
 
 import sys
 
-hadFirst = False
+# First pass : check that all files agree on the embedding size and count the vectors.
+nbLines = 0
+embSize = None
 for filename in sys.argv[1:] :
-  prefix = filename.split('/')[-1].split('.')[0]
-  for line in open(filename, "r") :
-    line = line.strip()
-    splited = line.split()
-    if len(splited) == 2 :
-      if hadFirst :
-        continue
-      hadFirst = True
-      print(line)
-    else :
-      print(prefix+"_"+line)
+  with open(filename, "r") as f :
+    for line in f :
+      splited = line.split()
+      if len(splited) == 2 :
+        # A two-field line is treated as a header whose second field is the embedding size.
+        if embSize is None :
+          embSize = int(splited[1])
+        elif embSize != int(splited[1]) :
+          print("ERROR : incompatibles embedings sizes %d and %d"%(embSize, int(splited[1])), file=sys.stderr)
+          sys.exit(1)
+      elif len(splited) > 2 :
+        # Count exactly the lines the second pass prints, so the emitted header is accurate.
+        nbLines += 1
+
+# Second pass : print the merged header, then every vector prefixed with its file's base name.
+print(nbLines, embSize)
+for filename in sys.argv[1:] :
+  prefix = filename.split('/')[-1].split('.')[0]
+  with open(filename, "r") as f :
+    for line in f :
+      line = line.strip()
+      splited = line.split()
+      if len(splited) > 2 :
+        print(prefix+"_"+line)
 
diff --git a/scripts/conlluCopyColumn.py b/scripts/conlluCopyColumn.py
new file mode 100755
index 0000000000000000000000000000000000000000..8cf162c788cab6bbfde49e8475d74056bb046605
--- /dev/null
+++ b/scripts/conlluCopyColumn.py
@@ -0,0 +1,41 @@
+#! /usr/bin/env python3
+
+# Append a copy of column fromColumn as a new last column named toColumn, rewriting each file in place.
+
+import sys
+from readMCD import readMCD
+
+if len(sys.argv) < 4 :
+  print("USAGE : %s fromColumn toColumn file1.conllu file2.conllu..."%sys.argv[0], file=sys.stderr)
+  sys.exit(1)
+
+fromCol = sys.argv[1]
+toCol = sys.argv[2]
+
+# Column layout from the most recent "# global.columns =" line; None until one has been read.
+conllMCD = None
+
+for filename in sys.argv[3:] :
+  lines = []
+  with open(filename, "r") as f :
+    for line in f :
+      line = line.strip()
+      if "# global.columns =" in line :
+        # Declare the new column first, so the parsed layout also covers it.
+        line = line + " " + toCol
+        # presumably readMCD returns (name -> index, index -> name) mappings -- confirm in readMCD.py
+        conllMCD, conllMCDr = readMCD(line.split('=')[-1].strip())
+      if len(line) == 0 or line[0] == '#' :
+        lines.append(line)
+        continue
+      if conllMCD is None :
+        # Fail with a clear message (and before the file is rewritten) instead of a NameError.
+        print("ERROR : no '# global.columns =' line before the first token line of %s"%filename, file=sys.stderr)
+        sys.exit(1)
+      splited = line.split('\t')
+      fromValue = splited[conllMCD[fromCol]]
+      splited.append(fromValue)
+      lines.append("\t".join(splited))
+  with open(filename, "w") as out :
+    print("\n".join(lines), file=out)
+