From 7b65bf7f683b1107d2fac61593204b9df746b20b Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 8 Sep 2021 08:38:46 +0200
Subject: [PATCH] Improved diverse scripts

---
 scripts/concatW2V.py                  | 18 ++++++++++++++++++
 scripts/conlluPrefixFormByFilename.py | 25 +++++++++++++++++++++++++
 scripts/mcf2conllu.py                 |  2 +-
 scripts/sentences2Conllu.py           | 23 +++++++++++++++++++++++
 4 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100755 scripts/concatW2V.py
 create mode 100755 scripts/conlluPrefixFormByFilename.py
 create mode 100755 scripts/sentences2Conllu.py

diff --git a/scripts/concatW2V.py b/scripts/concatW2V.py
new file mode 100755
index 0000000..024459e
--- /dev/null
+++ b/scripts/concatW2V.py
@@ -0,0 +1,28 @@
+#! /usr/bin/env python3
+"""Concatenate embedding text files (presumably word2vec format) to stdout.
+
+Usage: concatW2V.py filename1 filename2...
+
+Every line is printed prefixed with "<name>_", where <name> is the basename of
+the input file up to its first dot. The exception is lines made of exactly two
+whitespace-separated fields (presumably the "<count> <dim>" header): the first
+one seen is printed unchanged and all later ones are dropped.
+"""
+
+import sys
+
+hadFirst = False
+for filename in sys.argv[1:] :
+  # Basename up to its first dot, e.g. "dir/fr.w2v" gives "fr".
+  prefix = filename.split('/')[-1].split('.')[0]
+  # The context manager closes each input instead of leaking its handle.
+  with open(filename, "r") as inputFile :
+    for line in inputFile :
+      line = line.strip()
+      if len(line.split()) != 2 :
+        print(prefix+"_"+line)
+      elif not hadFirst :
+        # NOTE(review): printed as is, so any count it holds describes the
+        # first file only, not the concatenation - confirm consumers cope.
+        hadFirst = True
+        print(line)
diff --git a/scripts/conlluPrefixFormByFilename.py b/scripts/conlluPrefixFormByFilename.py
new file mode 100755
index 0000000..133c8ad
--- /dev/null
+++ b/scripts/conlluPrefixFormByFilename.py
@@ -0,0 +1,36 @@
+#! /usr/bin/env python3
+"""Prefix one column of tab-separated files, in place, with the file's name.
+
+Usage: conlluPrefixFormByFilename.py FORMindex filename1 filename2...
+
+In every line that is neither empty nor a comment (starting with "#"), the
+column at index FORMindex becomes "<name>_<old value>", where <name> is the
+file's basename up to its first dot. Each file is overwritten with the result,
+every line being stripped of its leading and trailing whitespace.
+"""
+
+import sys
+from readMCD import readMCD
+
+if len(sys.argv) < 3 :
+  print("USAGE : %s FORMindex filename1 filename2..."%sys.argv[0], file=sys.stderr)
+  # Stop here: carrying on would crash or silently do nothing.
+  sys.exit(1)
+
+# The column index is the same for every file, so parse it only once.
+formIndex = int(sys.argv[1])
+
+for filename in sys.argv[2:] :
+  prefix = filename.split('/')[-1].split('.')[0]
+  # Load and close the file before reopening it for writing.
+  with open(filename, "r") as inputFile :
+    lines = [line.strip() for line in inputFile]
+  with open(filename, "w") as out :
+    for line in lines :
+      # Blank lines and comments have no column to prefix.
+      if len(line) == 0 or line[0] == "#" :
+        print(line, file=out)
+        continue
+      splited = line.split('\t')
+      splited[formIndex] = prefix+"_"+splited[formIndex]
+      print("\t".join(splited), file=out)
diff --git a/scripts/mcf2conllu.py b/scripts/mcf2conllu.py
index a536a2e..514e924 100755
--- a/scripts/mcf2conllu.py
+++ b/scripts/mcf2conllu.py
@@ -42,7 +42,7 @@ if __name__ == "__main__" :
 
     splited[args.head] = int(splited[args.head])
     sentence.append(splited)
-    eos = int(splited[args.eos])
+    eos = 0 if splited[args.eos] == "_" else int(splited[args.eos])  # "_" in the EOS column counts as 0; int("_") would raise ValueError
     if eos == 1 :
       sentenceID += 1
       print("# sent_id = %d"%sentenceID)
diff --git a/scripts/sentences2Conllu.py b/scripts/sentences2Conllu.py
new file mode 100755
index 0000000..a736541
--- /dev/null
+++ b/scripts/sentences2Conllu.py
@@ -0,0 +1,38 @@
+#! /usr/bin/env python3
+"""Print a file of one sentence per line as CoNLL-U-style sentences.
+
+Usage: sentences2Conllu.py filename
+
+Each non-blank line is split on whitespace into words. The first word gets
+HEAD "0" and DEPREL "root", every other word gets HEAD "1"; all columns other
+than ID, FORM, HEAD and DEPREL are left as "_".
+"""
+
+import sys
+from readMCD import readMCD
+
+if len(sys.argv) < 2 :
+  print("USAGE : %s filename"%sys.argv[0], file=sys.stderr)
+  sys.exit(1)
+
+col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
+
+print("# global.columns = %s"%(" ".join(col2index.keys())))
+
+# The context manager closes the input instead of leaking its handle.
+with open(sys.argv[1], "r") as inputFile :
+  for line in inputFile :
+    line = line.strip()
+    words = line.split()
+    # A blank line holds no word: skip it rather than print an empty sentence.
+    if not words :
+      continue
+    sentence = [["_" for _ in col2index] for _ in words]
+    for i, form in enumerate(words) :
+      sentence[i][col2index["ID"]] = str(i+1)
+      sentence[i][col2index["FORM"]] = form
+      sentence[i][col2index["HEAD"]] = "0" if i == 0 else "1"
+      sentence[i][col2index["DEPREL"]] = "root" if i == 0 else "_"
+    print("# text = %s"%line)
+    print("\n".join(["\t".join(word) for word in sentence]))
+    print("")
-- 
GitLab