#! /usr/bin/env python3
"""Filter a word2vec text file, dropping entries whose word is too long.

The input is read twice so that it never has to be held in memory: a first
pass decides which lines must be dropped, a second pass writes the corrected
"count size" header followed by every kept line to stdout. Diagnostics are
written to stderr.
"""

import sys


def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE: %s w2vFilename wordSizeLimit"%sys.argv[0], file=sys.stderr)
    sys.exit(1)


def scanEmbeddings(lines, wordSizeLimit):
    """Decide which lines of a word2vec text stream must be dropped.

    Args:
        lines: iterable of text lines; the first one is the header holding
            the number of words and the embedding size.
        wordSizeLimit: longest word (in characters) that is kept.

    Returns:
        A tuple ``(embSize, deleted, nbKept)`` where ``embSize`` is the
        embedding size read from the header (``None`` when there is no line
        at all), ``deleted`` is the set of 1-based line numbers to drop
        (the header line is always part of it, since a new header is
        printed instead) and ``nbKept`` is the number of lines that remain.

    Raises:
        ValueError: if the header does not hold two integers.
    """
    embSize = None
    deleted = set()
    lineNumber = 0

    for lineNumber, line in enumerate(lines, start=1):
        fields = line.split()

        if lineNumber == 1:
            if len(fields) < 2:
                raise ValueError("line 1 is not a 'nbWords embSize' header")
            int(fields[0])  # validated only; the count is recomputed below
            embSize = int(fields[1])
            deleted.add(lineNumber)
            continue

        if not fields:
            # A blank line (typically a trailing one) is not an entry:
            # drop it instead of crashing on fields[0].
            deleted.add(lineNumber)
            continue

        word = fields[0]
        numValues = len(fields) - 1
        if numValues != embSize:
            # Reported but kept in the output and counted.
            print("ERROR: in line %d expected %d values got %d instead"%(lineNumber, embSize, numValues), file=sys.stderr)
            continue

        wordLen = len(word)
        if wordLen > wordSizeLimit:
            print("deleting line %d: word length = %d (%s...)"%(lineNumber, wordLen,word[:10]), file=sys.stderr)
            deleted.add(lineNumber)

    # 'deleted' already contains the header line, so it must not be
    # subtracted a second time.
    nbKept = lineNumber - len(deleted)
    return embSize, deleted, nbKept


def main():
    """Run the filter on the file and length limit given in sys.argv."""
    if len(sys.argv) != 3:
        printUsageAndExit()

    filename = sys.argv[1]
    try:
        wordSizeLimit = int(sys.argv[2])
    except ValueError:
        printUsageAndExit()

    try:
        with open(filename, "r") as w2vFile:
            embSize, deleted, nbKept = scanEmbeddings(w2vFile, wordSizeLimit)
    except ValueError as error:
        print("ERROR: %s"%error, file=sys.stderr)
        sys.exit(1)

    if embSize is None:
        print("ERROR: %s is empty"%filename, file=sys.stderr)
        sys.exit(1)

    print(nbKept, embSize)

    with open(filename, "r") as w2vFile:
        for lineNumber, line in enumerate(w2vFile, start=1):
            if lineNumber not in deleted:
                print(line, end="")


if __name__ == "__main__":
    main()