#! /usr/bin/env python3

# Given a .w2v file, that has been produced by macaon_data/script/lefff2W2V.py, list the most ambiguous words.

import sys


def formatProba(f):
    """Format a value with two decimals, in bold when it is strictly positive.

    Args:
        f: The number to format.

    Returns:
        The "%.2f" rendering of ``f``, wrapped in ANSI bold escape codes
        when ``f > 0.0``.
    """
    if f > 0.0:
        return "\033[1m%.2f\033[0m" % f
    return "%.2f" % f


def rankAmbiguousWords(lines):
    """Build the display lines of the ambiguous words found in ``lines``.

    Each usable line is expected to hold a word followed by
    whitespace-separated numeric values. A word is reported only when the
    largest of its values is below 1.0.

    Args:
        lines: An iterable of text lines (for instance an open file).

    Returns:
        A list of strings "<formatted values> <word>", ordered by ascending
        largest value (ties are broken by the formatted string itself).

    Raises:
        ValueError: If a value cannot be parsed as a float, or if a line
            holds a word without any value.
    """
    wordList = []
    for line in lines:
        fields = line.split()
        # Blank lines carry no data; indexing them would raise IndexError.
        if not fields:
            continue
        # Two-token lines are skipped (presumably the header of the .w2v
        # file, giving the number of entries and the dimension).
        if len(fields) == 2:
            continue
        word = fields[0]
        vec = [float(c) for c in fields[1:]]
        rendered = " ".join([formatProba(f) for f in vec] + [word])
        wordList.append((max(vec), rendered))

    wordList.sort()
    return [rendered for maxProba, rendered in wordList if maxProba < 1.0]


def main():
    """Read the .w2v file named on the command line and print the ranking."""
    if len(sys.argv) < 2:
        sys.exit("usage: %s file.w2v" % sys.argv[0])

    # The context manager guarantees the file is closed, even on a parse error.
    with open(sys.argv[1], "r") as w2vFile:
        print("\n".join(rankAmbiguousWords(w2vFile)))


if __name__ == "__main__":
    main()