From 423c26c3393fa8d208edf62db5e8c30c0ccc94d2 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Fri, 20 May 2022 15:42:33 +0200
Subject: [PATCH] Added script to print most ambiguous vectors from w2v
 generated from lexicon

---
 findMostAmbiguousFromW2v.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100755 findMostAmbiguousFromW2v.py

diff --git a/findMostAmbiguousFromW2v.py b/findMostAmbiguousFromW2v.py
new file mode 100755
index 0000000..122217f
--- /dev/null
+++ b/findMostAmbiguousFromW2v.py
@@ -0,0 +1,27 @@
+#! /usr/bin/env python3
+
+# Given a .w2v file produced by macaon_data/script/lefff2W2V.py, list the most ambiguous words: those whose highest value is below 1.0, sorted by that value in ascending order.
+
+import sys
+
+def formatProba(f) :
+  """Return f with two decimals, wrapped in ANSI bold escapes when f is strictly positive."""
+  plain = "%.2f"%f
+  return ("\033[1m"+plain+"\033[0m") if f > 0.0 else plain
+
+if __name__ == "__main__" :
+  wordList = []
+  # Context manager closes the input file even if a line fails to parse.
+  with open(sys.argv[1], "r") as w2vFile :
+    for line in w2vFile :
+      splited = line.split()
+      # Two tokens is presumably the w2v header; fewer means no vector. Skip both.
+      if len(splited) <= 2 :
+        continue
+      word = splited[0]
+      vec = [float(c) for c in splited[1:]]
+      wordList.append((max(vec), " ".join([formatProba(f) for f in vec]+[word])))
+  # Kept inside the guard: at module level these raised NameError on import.
+  wordList.sort()
+  print("\n".join([e[1] for e in wordList if e[0] < 1.0]))
+
-- 
GitLab