#! /usr/bin/env python3

# Given a .w2v file, that has been produced by macaon_data/script/lefff2W2V.py, list the most ambiguous words.
#
# Provenance: introduced by commit 423c26c3393fa8d208edf62db5e8c30c0ccc94d2
# (Franck Dary, 2022-05-20), "Added script to print most ambiguous vectors
# from w2v generated from lexicon".

import sys


def formatProba(f):
    """Format *f* with two decimals, in bold when it is strictly positive.

    Bold is produced with the ANSI escape sequences ``\\033[1m`` (bold on)
    and ``\\033[0m`` (reset), so non-zero components stand out on a terminal.
    """
    if f > 0.0:
        return "\033[1m%.2f\033[0m" % f
    return "%.2f" % f


def rankAmbiguous(lines):
    """Return the display lines for every word whose largest value is < 1.0.

    Each input line is expected to look like ``word v1 v2 ... vn``.  The
    result is sorted by ascending maximum value (ties broken by the rendered
    text), so the words with the flattest vectors come first.

    Skipped lines:
      * lines with exactly two tokens (this is how the original script
        ignores the ``count dimension`` header of a .w2v file; note that it
        also skips any word carrying a single value);
      * blank lines and lines holding a word but no values, which would
        otherwise crash on ``splited[0]`` / ``max([])``.

    Raises:
        ValueError: if a value token cannot be parsed as a float.
    """
    wordList = []

    for line in lines:
        splited = line.strip().split()
        # Header line (or a 1-dimensional vector): ignored, as before.
        if len(splited) == 2:
            continue
        # Nothing to rank without at least one value after the word.
        if len(splited) < 2:
            continue
        word = splited[0]
        vec = [float(c) for c in splited[1:]]
        maxProba = max(vec)
        rendered = " ".join([formatProba(f) for f in vec] + [word])
        wordList.append((maxProba, rendered))

    # Tuple sort: primary key is the maximum value, secondary the text.
    wordList.sort()
    # A maximum of 1.0 or more is treated as unambiguous and left out.
    return [text for maxProba, text in wordList if maxProba < 1.0]


def main(argv=None):
    """Command-line entry point: print the ranking for the file in argv[1].

    Returns the process exit status (0 on success, 1 on a usage error).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("usage: %s file.w2v" % argv[0], file=sys.stderr)
        return 1

    # The context manager guarantees the file is closed, even on a parse error.
    with open(argv[1], "r") as w2vFile:
        ranked = rankAmbiguous(w2vFile)

    print("\n".join(ranked))
    return 0


if __name__ == "__main__":
    sys.exit(main())