From 7b65bf7f683b1107d2fac61593204b9df746b20b Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 8 Sep 2021 08:38:46 +0200
Subject: [PATCH] Improved diverse scripts

---
Review notes (free text in this section is not part of the commit message):

* This patch had lost its line breaks and indentation; both were rebuilt
  from the syntax of the code. In the scripts/mcf2conllu.py hunk the
  leading whitespace of every line and the position of the blank context
  line are guesses: check them against the real file before applying.
* conlluPrefixFormByFilename.py: exit after printing the usage message
  (execution used to continue into sys.argv[1]), parse FORMindex once,
  drop the unused baseMCD variable, close the input file.
* concatW2V.py: close every input file.
* sentences2Conllu.py: check the argument count, close the input file,
  skip blank input lines instead of printing an empty sentence.
* The "index" lines of the three new files were removed because their
  blob ids no longer match the contents.

 scripts/concatW2V.py                  | 27 +++++++++++++++++++++++++++
 scripts/conlluPrefixFormByFilename.py | 31 +++++++++++++++++++++++++++++++
 scripts/mcf2conllu.py                 |  2 +-
 scripts/sentences2Conllu.py           | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100755 scripts/concatW2V.py
 create mode 100755 scripts/conlluPrefixFormByFilename.py
 create mode 100755 scripts/sentences2Conllu.py

diff --git a/scripts/concatW2V.py b/scripts/concatW2V.py
new file mode 100755
--- /dev/null
+++ b/scripts/concatW2V.py
@@ -0,0 +1,27 @@
+#! /usr/bin/env python3
+"""Concatenate word2vec text files given as arguments and print the result.
+
+Every embedding line is prefixed with "<name>_", where <name> is the file's
+basename up to its first dot. Lines made of exactly two fields (presumably
+the word2vec "count dimension" header) are printed only once, for the first
+file that has one.
+
+NOTE: that header is copied verbatim, so its count is not updated.
+"""
+
+import sys
+
+hadFirst = False
+for filename in sys.argv[1:]:
+    prefix = filename.split('/')[-1].split('.')[0]
+    # "with" closes each input file as soon as it has been read.
+    with open(filename, "r") as w2v:
+        for line in w2v:
+            line = line.strip()
+            if len(line.split()) == 2:
+                # Header line: keep the first one met, drop the others.
+                if not hadFirst:
+                    hadFirst = True
+                    print(line)
+            else:
+                print(prefix+"_"+line)
diff --git a/scripts/conlluPrefixFormByFilename.py b/scripts/conlluPrefixFormByFilename.py
new file mode 100755
--- /dev/null
+++ b/scripts/conlluPrefixFormByFilename.py
@@ -0,0 +1,31 @@
+#! /usr/bin/env python3
+"""Prefix one column of each given file, in place, with the file's name.
+
+The prefix is the file's basename up to its first dot, followed by "_".
+Empty lines and lines starting with "#" are left without prefix.
+"""
+
+import sys
+from readMCD import readMCD
+
+if len(sys.argv) < 3:
+    print("USAGE : %s FORMindex filename1 filename2..."%sys.argv[0])
+    # Stop here: without these arguments the code below cannot work.
+    sys.exit(1)
+
+# The column index is the same for every file, so parse it only once.
+formIndex = int(sys.argv[1])
+
+for filename in sys.argv[2:]:
+    prefix = filename.split('/')[-1].split('.')[0]
+    # Load the whole file before reopening the same path for writing.
+    with open(filename, "r") as conllu:
+        lines = [line.strip() for line in conllu]
+    with open(filename, "w") as out:
+        for line in lines:
+            if len(line) == 0 or line[0] == "#":
+                print(line, file=out)
+                continue
+            splited = line.split('\t')
+            splited[formIndex] = prefix+"_"+splited[formIndex]
+            print("\t".join(splited), file=out)
diff --git a/scripts/mcf2conllu.py b/scripts/mcf2conllu.py
index a536a2e..514e924 100755
--- a/scripts/mcf2conllu.py
+++ b/scripts/mcf2conllu.py
@@ -42,7 +42,7 @@ if __name__ == "__main__" :
         splited[args.head] = int(splited[args.head])
         sentence.append(splited)
 
-        eos = int(splited[args.eos])
+        eos = 0 if splited[args.eos] == "_" else int(splited[args.eos])
         if eos == 1 :
             sentenceID += 1
             print("# sent_id = %d"%sentenceID)
diff --git a/scripts/sentences2Conllu.py b/scripts/sentences2Conllu.py
new file mode 100755
--- /dev/null
+++ b/scripts/sentences2Conllu.py
@@ -0,0 +1,34 @@
+#! /usr/bin/env python3
+"""Convert a file with one sentence per line to CoNLL-U, on stdout.
+
+Words are split on whitespace. Each one gets its ID and FORM; the first word
+is the root (HEAD 0, DEPREL root), every other word is attached to it
+(HEAD 1). All remaining fields are "_".
+"""
+
+import sys
+from readMCD import readMCD
+
+if len(sys.argv) < 2:
+    print("USAGE : %s filename"%sys.argv[0])
+    sys.exit(1)
+
+col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
+
+print("# global.columns = %s"%(" ".join(col2index.keys())))
+
+with open(sys.argv[1], "r") as sentences:
+    for line in sentences:
+        line = line.strip()
+        words = line.split()
+        # A blank input line would otherwise produce an empty sentence block.
+        if not words:
+            continue
+        sentence = [["_" for _ in col2index] for _ in words]
+        for i, form in enumerate(words):
+            sentence[i][col2index["ID"]] = str(i+1)
+            sentence[i][col2index["FORM"]] = form
+            sentence[i][col2index["HEAD"]] = "0" if i == 0 else "1"
+            sentence[i][col2index["DEPREL"]] = "root" if i == 0 else "_"
+        print("# text = %s"%line)
+        print("\n".join(["\t".join(word) for word in sentence]))
+        print("")
-- 
GitLab