Something went wrong on our end
-
Franck Dary authored
cleanW2V.py 1.13 KiB
#! /usr/bin/env python3
import sys
def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE: %s w2vFilename wordSizeLimit" % sys.argv[0], file=sys.stderr)
    # Use sys.exit rather than the bare `exit` helper: the latter is injected
    # by the `site` module for interactive sessions and is not defined when
    # the interpreter runs with -S (or in some frozen/embedded builds).
    sys.exit(1)
def _findDeletedLines(lines, wordSizeLimit):
    """Scan the lines of a word2vec text file and pick the ones to drop.

    The first line is the header "<nbWords> <embSize>". Every following line
    is expected to hold a word followed by embSize values.

    Returns a tuple (embSize, deleted, nbLines):
      - embSize: embedding size read from the header, or None if `lines`
        yielded nothing;
      - deleted: set of 1-based line numbers that must not be copied to the
        output. The header line is always in it, because a recomputed header
        is printed in its place;
      - nbLines: total number of lines read.

    Raises ValueError or IndexError if the header line is malformed.
    """
    embSize = None
    deleted = set()
    nbLines = 0
    for nbLines, line in enumerate(lines, 1):
        # split() without arguments already ignores surrounding whitespace.
        splited = line.split()
        if embSize is None:
            # The declared word count is only validated; the real count is
            # recomputed from the lines that are actually kept.
            int(splited[0])
            embSize = int(splited[1])
            deleted.add(nbLines)
            continue
        if not splited:
            # A blank line carries no word: drop it instead of crashing.
            print("deleting line %d: empty line" % nbLines, file=sys.stderr)
            deleted.add(nbLines)
            continue
        numValues = len(splited) - 1
        if numValues != embSize:
            # Malformed lines are only reported: they stay in the output and
            # are therefore still counted in the new header.
            print("ERROR: in line %d expected %d values got %d instead"%(nbLines, embSize, numValues), file=sys.stderr)
            continue
        wordLen = len(splited[0])
        if wordLen > wordSizeLimit:
            print("deleting line %d: word length = %d (%s...)"%(nbLines, wordLen,splited[0][:10]), file=sys.stderr)
            deleted.add(nbLines)
    return embSize, deleted, nbLines


def main(argv):
    """Copy a word2vec text file to stdout without the overlong words.

    argv follows the sys.argv layout: program name, path of the word2vec
    text file, maximum word length (in characters) to keep. The header
    written to stdout is recomputed so that its word count matches the
    number of lines that follow it.
    """
    if len(argv) != 3:
        printUsageAndExit()
    # Parse the limit once, up front, so a bad value gives the usage message
    # instead of a traceback in the middle of the scan.
    try:
        wordSizeLimit = int(argv[2])
    except ValueError:
        printUsageAndExit()

    # First pass: decide which lines to drop.
    with open(argv[1], "r") as w2vFile:
        embSize, deleted, nbLines = _findDeletedLines(w2vFile, wordSizeLimit)

    if embSize is None:
        print("ERROR: %s is empty" % argv[1], file=sys.stderr)
        sys.exit(1)

    # `deleted` already contains the header line, so the number of lines that
    # will be printed is exactly nbLines - len(deleted).
    print(nbLines - len(deleted), embSize)

    # Second pass: copy the surviving lines verbatim.
    with open(argv[1], "r") as w2vFile:
        for lineNumber, line in enumerate(w2vFile, 1):
            if lineNumber not in deleted:
                print(line, end="")


if __name__ == "__main__":
    main(sys.argv)