diff --git a/scripts/concatW2V.py b/scripts/concatW2V.py
new file mode 100755
--- /dev/null
+++ b/scripts/concatW2V.py
@@ -0,0 +1,36 @@
+#! /usr/bin/env python3
+"""Concatenate word2vec text files on stdout, prefixing words by file name.
+
+Usage: concatW2V.py file1 file2...
+
+Every vector line is printed as PREFIX_line, PREFIX being the basename of its
+file up to the first dot.  Lines made of exactly two fields are taken to be
+headers: only the first one met is printed.
+"""
+
+import sys
+
+
+def main(filenames):
+    """Print the concatenation of the embeddings files `filenames`."""
+    had_header = False
+    for filename in filenames:
+        prefix = filename.split('/')[-1].split('.')[0]
+        with open(filename, "r") as embeddings:
+            for line in embeddings:
+                line = line.strip()
+                if not line:
+                    # A blank line would be printed as a bogus "PREFIX_" word.
+                    continue
+                if len(line.split()) == 2:
+                    # NOTE(review): if the header states a vocabulary size,
+                    # the one kept only describes the first file - confirm.
+                    if not had_header:
+                        had_header = True
+                        print(line)
+                    continue
+                print(prefix + "_" + line)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/scripts/conlluPrefixFormByFilename.py b/scripts/conlluPrefixFormByFilename.py
new file mode 100755
--- /dev/null
+++ b/scripts/conlluPrefixFormByFilename.py
@@ -0,0 +1,41 @@
+#! /usr/bin/env python3
+"""Prefix, in place, one column of CoNLL-U files with the name of each file.
+
+Usage: conlluPrefixFormByFilename.py FORMindex filename1 filename2...
+
+In every file, the column number FORMindex (0-based, tab-separated) of each
+token line becomes PREFIX_value, PREFIX being the basename of the file up to
+its first dot.  Empty lines and lines starting with '#' are not prefixed.
+"""
+
+import sys
+
+
+def main(argv):
+    """Rewrite the files `argv[2:]` in place and return the exit status."""
+    if len(argv) < 3:
+        print("USAGE : %s FORMindex filename1 filename2..." % argv[0],
+              file=sys.stderr)
+        # Stop here: going on would fail on the missing arguments.
+        return 1
+
+    form_index = int(argv[1])
+
+    for filename in argv[2:]:
+        prefix = filename.split('/')[-1].split('.')[0]
+        # Load the whole file first, as it is overwritten just below.
+        with open(filename, "r") as conllu:
+            lines = [line.strip() for line in conllu]
+        with open(filename, "w") as out:
+            for line in lines:
+                if not line or line[0] == "#":
+                    print(line, file=out)
+                    continue
+                columns = line.split('\t')
+                columns[form_index] = prefix + "_" + columns[form_index]
+                print("\t".join(columns), file=out)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/scripts/mcf2conllu.py b/scripts/mcf2conllu.py index a536a2e4519d12807112cf32b48be01dc2852a13..514e92473f91956e1a93371992c803c820b29745 100755 --- a/scripts/mcf2conllu.py +++ b/scripts/mcf2conllu.py @@ -42,7 +42,7 @@ if __name__ == "__main__" : splited[args.head] = int(splited[args.head]) sentence.append(splited) - eos = int(splited[args.eos]) + eos = 0 if splited[args.eos] == "_" else int(splited[args.eos]) if eos == 1 : sentenceID += 1 print("# sent_id = %d"%sentenceID)
diff --git a/scripts/sentences2Conllu.py b/scripts/sentences2Conllu.py
new file mode 100755
--- /dev/null
+++ b/scripts/sentences2Conllu.py
@@ -0,0 +1,44 @@
+#! /usr/bin/env python3
+"""Convert raw sentences (one per line) to a CoNLL-U skeleton on stdout.
+
+Usage: sentences2Conllu.py filename
+
+Each line is split on whitespace.  Only ID and FORM are filled in; the first
+word gets HEAD 0 and DEPREL root, every other word HEAD 1 and no DEPREL.
+"""
+
+import sys
+from readMCD import readMCD
+
+
+def main(argv):
+    """Convert the file `argv[1]` and return the exit status."""
+    if len(argv) < 2:
+        print("USAGE : %s filename" % argv[0], file=sys.stderr)
+        return 1
+
+    col2index, _index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
+
+    print("# global.columns = %s" % (" ".join(col2index.keys())))
+
+    with open(argv[1], "r") as sentences:
+        for line in sentences:
+            line = line.strip()
+            words = line.split()
+            if not words:
+                # Skip blank lines: they would give a sentence without token.
+                continue
+            sentence = [["_" for _ in col2index] for _ in words]
+            for i, word in enumerate(words):
+                sentence[i][col2index["ID"]] = str(i + 1)
+                sentence[i][col2index["FORM"]] = word
+                sentence[i][col2index["HEAD"]] = "0" if i == 0 else "1"
+                sentence[i][col2index["DEPREL"]] = "root" if i == 0 else "_"
+            print("# text = %s" % line)
+            print("\n".join("\t".join(token) for token in sentence))
+            print("")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))