#! /usr/bin/env python3
"""Drop over-long words from a w2v text embeddings file.

The input is read as text: a header line holding two integers (number of
words, embedding size), then one line per word made of the word followed by
its whitespace-separated values.  The cleaned file is written to stdout with
a recomputed header; every diagnostic goes to stderr.
"""

import sys


class _FormatError(Exception):
    """Raised when the input file cannot be interpreted at all."""


def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE: %s w2vFilename wordSizeLimit" % sys.argv[0], file=sys.stderr)
    sys.exit(1)


def _findDeletedLines(filename, wordSizeLimit):
    """Scan the file once and decide which lines must not be copied.

    Args:
        filename: path of the embeddings file to scan.
        wordSizeLimit: maximum accepted word length, in characters.

    Returns:
        A tuple ``(deleted, embSize, lineCount)`` where ``deleted`` is the set
        of 1-based line numbers to drop (it always contains the header line),
        ``embSize`` is the embedding size read from the header and
        ``lineCount`` is the total number of lines in the file.

    Raises:
        _FormatError: if the file is empty or its header is not two integers.
    """
    deleted = set()
    embSize = None
    lineCount = 0

    with open(filename, "r") as stream:
        for lineCount, line in enumerate(stream, start=1):
            fields = line.split()

            if lineCount == 1:
                try:
                    int(fields[0])  # word count: validated only, recomputed later
                    embSize = int(fields[1])
                except (IndexError, ValueError):
                    raise _FormatError(
                        "ERROR: in line 1 expected a header of two integers"
                    ) from None
                # The header is re-emitted with the new word count, so the
                # original one must not be copied.
                deleted.add(lineCount)
                continue

            if not fields:
                # A blank line carries no word; dropping it avoids indexing
                # into an empty list below.
                print("deleting line %d: empty line" % lineCount, file=sys.stderr)
                deleted.add(lineCount)
                continue

            word = fields[0]
            numValues = len(fields) - 1
            if numValues != embSize:
                # Reported but deliberately kept in the output (and therefore
                # counted in the new header).
                print(
                    "ERROR: in line %d expected %d values got %d instead"
                    % (lineCount, embSize, numValues),
                    file=sys.stderr,
                )
                continue

            wordLen = len(word)
            if wordLen > wordSizeLimit:
                print(
                    "deleting line %d: word length = %d (%s...)"
                    % (lineCount, wordLen, word[:10]),
                    file=sys.stderr,
                )
                deleted.add(lineCount)

    if embSize is None:
        raise _FormatError("ERROR: %s is empty" % filename)

    return deleted, embSize, lineCount


def _copyKeptLines(filename, deleted):
    """Second pass: write every line whose number is not in ``deleted``."""
    with open(filename, "r") as stream:
        for lineNumber, line in enumerate(stream, start=1):
            if lineNumber not in deleted:
                print(line, end="")


def main(argv=None):
    """Run the cleaner.

    Args:
        argv: full argument vector ``[program, w2vFilename, wordSizeLimit]``;
            defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        printUsageAndExit()

    filename = argv[1]
    try:
        # Parsed once, up front, so a bad limit is reported before any work.
        wordSizeLimit = int(argv[2])
    except ValueError:
        printUsageAndExit()

    try:
        deleted, embSize, lineCount = _findDeletedLines(filename, wordSizeLimit)
    except _FormatError as error:
        print(error, file=sys.stderr)
        sys.exit(1)

    # ``deleted`` already includes the header line, so this is exactly the
    # number of lines the second pass is going to write.
    newNb = lineCount - len(deleted)
    print(newNb, embSize)
    _copyKeptLines(filename, deleted)


if __name__ == "__main__":
    main()