#! /usr/bin/env python3

import sys
import os
from readMCD import readMCD

################################################################################
def printUsageAndExit() :
  """Print the command-line usage on stderr and stop the program.

  The program name shown in the message is taken from sys.argv[0].

  Raises:
    SystemExit: always, with exit status 1.
  """
  print("USAGE : %s UD_Directory"%sys.argv[0], file=sys.stderr)
  # sys.exit instead of the builtin exit : the builtin is injected by the
  # site module for interactive sessions and is absent under `python -S`.
  sys.exit(1)
################################################################################

################################################################################
def findCorpora(directory) :
  """Locate the train, dev and test .conllu files below a directory.

  The directory is walked recursively. A file is selected when its name
  contains ".conllu" and one of "train", "dev" or "test" (tested in that
  order, first match wins). When several files match the same split, the
  last one visited is kept.

  Args:
    directory: root directory to search.

  Returns:
    A (train, dev, test) tuple of paths; an entry is None when no matching
    file was found for that split.
  """
  corpora = {"train" : None, "dev" : None, "test" : None}
  for root, _, names in os.walk(directory) :
    for name in names :
      if ".conllu" not in name :
        continue
      for split in ("train", "dev", "test") :
        if split in name :
          # Keep the directory part : the file may live in a sub-directory
          # of the one given on the command line.
          corpora[split] = os.path.join(root, name)
          break
  return corpora["train"], corpora["dev"], corpora["test"]

def readForms(path, defaultMCD) :
  """Yield the FORM column of every token line of a corpus file.

  Empty lines and lines starting with '#' are skipped. A line containing
  "# global.columns =" replaces the current column layout with the one it
  declares (text after the last '=').

  Args:
    path: path of the corpus file, read as UTF-8.
    defaultMCD: space separated column names used until the file declares
      its own layout; passed as is to readMCD.

  Yields:
    The FORM value of each token line, in file order.
  """
  # readMCD is unpacked as a (col2index, index2col) pair; only the first
  # mapping is needed here.
  col2index, _ = readMCD(defaultMCD)
  with open(path, "r", encoding="utf-8") as corpus :
    for line in corpus :
      line = line.strip()
      if "# global.columns =" in line :
        col2index, _ = readMCD(line.split('=')[-1].strip())
        continue
      if len(line) == 0 or line[0] == '#' :
        continue
      yield line.split('\t')[col2index["FORM"]]

if __name__ == "__main__" :
  # Print, for the dev and test corpora of a UD directory, the percentage of
  # word occurrences whose form also appears in the train corpus.
  if len(sys.argv) != 2 :
    printUsageAndExit()

  # Column layout assumed for files without a "# global.columns =" line.
  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

  train, dev, test = findCorpora(sys.argv[1])

  if train is None :
    print("ERROR : train corpus not found", file=sys.stderr)
    sys.exit(1)

  vocab = set(readForms(train, baseMCD))

  for path in [dev, test] :
    if path is None :
      continue
    nbWords = 0
    nbIn = 0
    for form in readForms(path, baseMCD) :
      nbWords += 1
      if form in vocab :
        nbIn += 1
    if nbWords == 0 :
      # Nothing to report, and the ratio below would divide by zero.
      print("ERROR : no word found in %s"%path, file=sys.stderr)
      continue
    # Report the bare file name, without its directory.
    print("%s\t%.2f"%(os.path.basename(path), 100.0*nbIn/nbWords))
################################################################################