#! /usr/bin/env python3
"""Convert a tab-separated MCF file into CoNLL-U Plus style output on stdout.

Each non-empty, non-comment input line is one token.  Two columns are
mandatory: one holding the governor index *relative* to the token, and one
holding the end-of-sentence flag (a value of 1 closes the current sentence).
On output the relative governor index is turned into an absolute 1-based
index, and the end-of-sentence column is recycled to hold the token ID.
"""

import argparse
import sys


################################################################################
def _build_columns(nb_columns, head, eos, form=None, upos=None):
    """Return the column names advertised in the ``global.columns`` header.

    Columns default to their 1-based position; the ones designated on the
    command line are renamed HEAD, ID, FORM and UPOS (applied in that order).
    """
    columns = [str(k) for k in range(1, nb_columns + 1)]
    columns[head] = "HEAD"
    columns[eos] = "ID"
    if form is not None:
        columns[form] = "FORM"
    if upos is not None:
        columns[upos] = "UPOS"
    return columns


def _format_sentence(sentence, sentence_id, head, eos):
    """Yield the output lines of one sentence, followed by a blank separator.

    ``sentence`` is a list of token rows (lists of fields) whose ``head``
    field has already been converted to int.  Rows are modified in place.
    """
    yield "# sent_id = %d" % sentence_id
    for position, word in enumerate(sentence, start=1):
        word[eos] = position  # Recycling EOS column into ID column
        # A relative head of 0 is left untouched (presumably the root marker
        # -- TODO confirm against the MCF producer).
        if word[head] != 0:
            word[head] += position  # Transforming relative head into absolute
        yield "\t".join(map(str, word))
    yield ""


def _convert(lines, head, eos, form=None, upos=None):
    """Yield output lines for the MCF ``lines`` (any iterable of str).

    Raises ValueError if a head or end-of-sentence field is not an integer,
    and IndexError if a row has fewer columns than ``head``/``eos`` require.
    """
    header_done = False
    sentence = []
    sentence_id = 0
    for line in lines:
        # NOTE(review): strip() also removes leading/trailing tabs, so an
        # empty first or last field shifts the columns; kept as-is to
        # preserve the existing output for current inputs.
        line = line.strip()
        if not line or line[0] == "#":
            continue
        fields = line.split("\t")

        if not header_done:
            # The number of columns is taken from the first token line.
            columns = _build_columns(len(fields), head, eos, form, upos)
            yield "# global.columns = %s" % " ".join(columns)
            header_done = True

        fields[head] = int(fields[head])
        sentence.append(fields)
        if int(fields[eos]) == 1:
            sentence_id += 1
            yield from _format_sentence(sentence, sentence_id, head, eos)
            sentence = []

    # Do not silently drop tokens that follow the last end-of-sentence marker.
    if sentence:
        sentence_id += 1
        yield from _format_sentence(sentence, sentence_id, head, eos)


def main(argv=None):
    """Parse the command line (``argv`` defaults to ``sys.argv[1:]``) and convert."""
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str,
      help="Input mcf file")
    parser.add_argument("head", type=int,
      help="Index of the column containing governor relative index.")
    parser.add_argument("eos", type=int,
      help="Index of the column containing end of sentence info.")
    parser.add_argument("--form", type=int, default=None,
      help="Index of the column containing FORM.")
    parser.add_argument("--upos", type=int, default=None,
      help="Index of the column containing UPOS.")

    args = parser.parse_args(argv)

    # The context manager guarantees the input file is closed, even on error.
    with open(args.input, "r") as mcf:
        for out_line in _convert(mcf, args.head, args.eos, args.form, args.upos):
            print(out_line)


if __name__ == "__main__":
    main()
################################################################################