From cb3187f580a34bbd1c36c3736d31cce8c50f2314 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 1 Jul 2021 14:53:16 +0200
Subject: [PATCH] Added script mcf2conllu

---
 scripts/mcf2conllu.py | 58 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100755 scripts/mcf2conllu.py

diff --git a/scripts/mcf2conllu.py b/scripts/mcf2conllu.py
new file mode 100755
index 0000000..a536a2e
--- /dev/null
+++ b/scripts/mcf2conllu.py
@@ -0,0 +1,58 @@
+#! /usr/bin/env python3
+
+import argparse
+import sys
+
+################################################################################
+if __name__ == "__main__" :
+  # Prints on stdout the conllu version of an mcf file (one word per line, tab
+  # separated columns) : EOS column becomes ID, relative head becomes absolute.
+  parser = argparse.ArgumentParser()
+  parser.add_argument("input", type=str, help="Input mcf file")
+  parser.add_argument("head", type=int,
+    help="Index of the column containing governor relative index.")
+  parser.add_argument("eos", type=int,
+    help="Index of the column containing end of sentence info.")
+  parser.add_argument("--form", type=int, default=None,
+    help="Index of the column containing FORM.")
+  parser.add_argument("--upos", type=int, default=None,
+    help="Index of the column containing UPOS.")
+
+  args = parser.parse_args()
+  mcd = []
+  sentence = []
+  sentenceID = 0
+  with open(args.input, "r") as mcfFile : # Input is closed when leaving the block
+    for line in mcfFile :
+      line = line.strip()
+      if len(line) == 0 or line[0] == '#' : # Skipping empty lines and comments
+        continue
+      columns = line.split('\t')
+
+      if len(mcd) == 0 : # First word : printing columns description once
+        mcd = [str(k) for k in range(1,len(columns)+1)]
+        mcd[args.head] = "HEAD"
+        mcd[args.eos] = "ID"
+        if args.form is not None :
+          mcd[args.form] = "FORM"
+        if args.upos is not None :
+          mcd[args.upos] = "UPOS"
+        print("# global.columns = %s"%" ".join(mcd))
+
+      columns[args.head] = int(columns[args.head])
+      sentence.append(columns)
+      if int(columns[args.eos]) != 1 : # Sentence is not complete yet
+        continue
+      sentenceID += 1
+      print("# sent_id = %d"%sentenceID)
+      for i, word in enumerate(sentence) :
+        word[args.eos] = i+1 # Recycling EOS column into ID column
+        if word[args.head] != 0 :
+          word[args.head] += i+1 # Transforming relative head into absolute
+        print('\t'.join(map(str,word)))
+      print("")
+      sentence = []
+  if len(sentence) > 0 : # Words after the last EOS mark are never printed
+    print("WARNING : %d trailing words without EOS were ignored"%len(sentence), file=sys.stderr)
+################################################################################
+
-- 
GitLab