Skip to content
Snippets Groups Projects
Commit 6b1f4c28 authored by Franck Dary
Browse files

Added script to clean w2v file of long words

parent 258180d3
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
import sys
def printUsageAndExit():
    """Print the command-line usage on stderr and terminate with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print("USAGE: %s w2vFilename wordSizeLimit"%sys.argv[0], file=sys.stderr)
    # sys.exit is used rather than the bare exit() builtin: the latter is
    # injected by the `site` module for interactive sessions and is not
    # guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)
def scanW2vLines(lines, wordSizeLimit):
    """Decide which lines of a textual w2v file must be dropped.

    The first line is expected to be the header "nbWords embSize"; every
    following line is expected to be a word followed by embSize values.

    Args:
        lines: iterable of the file's lines (e.g. an open text file).
        wordSizeLimit: words strictly longer than this are dropped.

    Returns:
        A tuple (newNb, embSize, deleted) where newNb is the number of
        lines that will remain after the header, embSize is the embedding
        size read from the header, and deleted is the set of 1-based line
        numbers that must not be copied to the output (the original header
        is always part of it, since a new header is written instead).

    Raises:
        ValueError: if the header is missing or is not two integers.
    """
    embSize = None
    deleted = set()
    lineNumber = 0
    for lineNumber, line in enumerate(lines, start=1):
        # split() with no argument already ignores surrounding whitespace.
        splited = line.split()
        if embSize is None:
            try:
                int(splited[0])  # nbWords: only validated, it is recomputed
                embSize = int(splited[1])
            except (IndexError, ValueError):
                raise ValueError("line 1 is not a valid 'nbWords embSize' header") from None
            # The old header is replaced by a recomputed one.
            deleted.add(lineNumber)
            continue
        if not splited:
            # A blank line holds no word: drop it so that it is not counted.
            deleted.add(lineNumber)
            continue
        numValues = len(splited) - 1
        if numValues != embSize:
            # Malformed lines are only reported; they are kept in the output.
            print("ERROR: in line %d expected %d values got %d instead"%(lineNumber, embSize, numValues), file=sys.stderr)
            continue
        wordLen = len(splited[0])
        if wordLen > wordSizeLimit:
            print("deleting line %d: word length = %d (%s...)"%(lineNumber, wordLen,splited[0][:10]), file=sys.stderr)
            deleted.add(lineNumber)
    if embSize is None:
        raise ValueError("empty file: missing 'nbWords embSize' header")
    # The header is already a member of `deleted`, so it must not be
    # subtracted a second time when counting the remaining words.
    return lineNumber - len(deleted), embSize, deleted


if __name__ == "__main__":
    if len(sys.argv) != 3:
        printUsageAndExit()
    # Parse the limit once, up front, so that a bad value is reported as a
    # usage error instead of failing in the middle of the scan.
    try:
        wordSizeLimit = int(sys.argv[2])
    except ValueError:
        printUsageAndExit()
    # First pass: find out which lines to drop and how many words remain.
    try:
        with open(sys.argv[1], "r") as w2vFile:
            newNb, embSize, deleted = scanW2vLines(w2vFile, wordSizeLimit)
    except ValueError as error:
        print("ERROR: %s"%error, file=sys.stderr)
        sys.exit(1)
    print(newNb, embSize)
    # Second pass: copy every surviving line unchanged to stdout.
    with open(sys.argv[1], "r") as w2vFile:
        for lineNumber, line in enumerate(w2vFile, start=1):
            if lineNumber not in deleted:
                print(line, end="")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment