Skip to content
Snippets Groups Projects
cleanW2V.py 1.13 KiB
#! /usr/bin/env python3

import sys

def printUsageAndExit() :
  """Print the command-line usage on stderr and terminate with status 1.

  Raises:
    SystemExit: always, with exit code 1.
  """
  print("USAGE: %s w2vFilename wordSizeLimit"%sys.argv[0], file=sys.stderr)
  # sys.exit rather than the bare `exit` helper: the latter is injected by the
  # `site` module for interactive use and is absent under `python -S`.
  sys.exit(1)

def cleanW2V(w2vFilename, wordSizeLimit, out=None, err=None) :
  """Copy a word2vec text file, dropping entries whose word is too long.

  The file is read twice so that it never has to be held in memory: the first
  pass decides which line numbers to drop, the second pass writes a corrected
  header followed by every line that was kept.

  Lines whose number of values differs from the embedding size announced in
  the header are reported on ``err`` but are still copied and counted.
  Blank lines are reported and dropped.

  Args:
    w2vFilename: path of the input file; its first line must be
      "<number of words> <embedding size>".
    wordSizeLimit: maximum word length (in characters) that is kept.
    out: text stream receiving the cleaned file (default: sys.stdout).
    err: text stream receiving diagnostics (default: sys.stderr).

  Returns:
    The number of entries written after the header.

  Raises:
    ValueError: if the file is empty or its header fields are not integers.
    IndexError: if the header line has fewer than two fields.
  """
  # Resolved at call time so that redirected sys.stdout/sys.stderr are honoured.
  if out is None :
    out = sys.stdout
  if err is None :
    err = sys.stderr

  embSize = None
  nbLines = 0
  deleted = set()

  # First pass: collect the (1-based) numbers of the lines to drop.
  with open(w2vFilename, "r") as w2vFile :
    for lineNumber, line in enumerate(w2vFile, start=1) :
      nbLines = lineNumber
      splited = line.split()
      if embSize is None :
        # Header line. The announced word count is only validated as an
        # integer; the real count is recomputed once filtering is done.
        int(splited[0])
        embSize = int(splited[1])
        # The old header is never copied: a corrected one is written instead.
        deleted.add(lineNumber)
        continue
      if not splited :
        print("deleting line %d: empty line"%lineNumber, file=err)
        deleted.add(lineNumber)
        continue
      wordLen = len(splited[0])
      numValues = len(splited)-1
      if numValues != embSize :
        print("ERROR: in line %d expected %d values got %d instead"%(lineNumber, embSize, numValues), file=err)
        continue
      if wordLen > wordSizeLimit :
        print("deleting line %d: word length = %d (%s...)"%(lineNumber, wordLen,splited[0][:10]), file=err)
        deleted.add(lineNumber)

  if embSize is None :
    raise ValueError("%s is empty: no word2vec header found"%w2vFilename)

  # `deleted` already contains the header line, so it must not be subtracted
  # a second time.
  newNb = nbLines - len(deleted)
  print(newNb, embSize, file=out)

  # Second pass: copy the surviving lines verbatim.
  with open(w2vFilename, "r") as w2vFile :
    for lineNumber, line in enumerate(w2vFile, start=1) :
      if lineNumber not in deleted :
        print(line, end="", file=out)

  return newNb

if __name__ == "__main__" :
  if len(sys.argv) != 3 :
    printUsageAndExit()

  # Parse the limit once, up front, so a bad value is reported before any work.
  try :
    wordSizeLimit = int(sys.argv[2])
  except ValueError :
    printUsageAndExit()

  try :
    cleanW2V(sys.argv[1], wordSizeLimit)
  except ValueError as error :
    print("ERROR: %s"%error, file=sys.stderr)
    sys.exit(1)