Skip to content
Snippets Groups Projects
Select Git revision
  • a185a2e924ca9d3506f6379476c8e2132e88489b
  • master default protected
  • debugging
3 results

oargen.py

Blame
  • conllu2splits.py 1.93 KiB
    #! /usr/bin/python3
    
    import sys
    
    # Split rules filled in by computeRules: maps the form of an entry whose ID
    # contains "-" to a dict of "@part@part..." rule strings and the number of
    # times each rule string was seen.
    rules = {}
    # Text written in front of every rule that main prints on stdout.
    prefix = "SPLITWORD "
    
    def printUsageAndExit() :
      """Print the command-line usage on stderr and exit with status 1.

      Raises:
        SystemExit: always, with exit code 1.
      """
      print("USAGE : %s file.conllu conllu.mcd"%sys.argv[0], file=sys.stderr)
      # sys.exit rather than the bare exit() builtin: the latter is injected by
      # the site module and is missing under `python -S` or in frozen builds.
      sys.exit(1)
    
    def readMCD(mcdFilename) :
      """Read a two-column description file into a dictionary.

      Every meaningful line must hold exactly two fields separated by a single
      space; the first field becomes the key and the second the value. Blank
      lines and lines whose first non-blank character is '#' are ignored.

      Args:
        mcdFilename: path of the UTF-8 encoded file to read.

      Returns:
        A dict mapping the first field of each line to its second field.

      Raises:
        SystemExit: with code 1 when a line does not hold exactly two fields.
      """
      mcd = {}
      # The context manager guarantees the handle is closed, even on abort.
      with open(mcdFilename, "r", encoding="utf8") as mcdFile :
        for line in mcdFile :
          # Work on the stripped line so that indentation, trailing blanks and
          # the end-of-line character never influence the checks below.
          clean = line.strip()
          if not clean or clean[0] == '#' :
            continue
          splited = clean.split(' ')
          if len(splited) != 2 :
            print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
            sys.exit(1)
          mcd[splited[0].strip()] = splited[1].strip()

      return mcd
    
    def computeRules(sentence) :
      """Record, in the global `rules`, how each range entry of a sentence splits.

      An entry whose ID contains "-" (e.g. "3-5") is a range covering every
      integer ID from its first to its last bound. For each such entry the forms
      of the covered words are concatenated as "@form@form..." and the counter
      rules[<entry form>][<that string>] is incremented.

      Args:
        sentence: sequence of [id, form] pairs (strings) for one sentence.

      Raises:
        KeyError: when a range refers to an ID that is absent from the sentence.
      """
      # Forms of the plain (non-range) entries, indexed by their ID string.
      wordById = {}
      for word in sentence :
        if "-" not in word[0] :
          wordById[word[0]] = word[1]

      for word in sentence :
        bounds = word[0].split("-")
        if len(bounds) < 2 :
          continue

        # Expand "a-b" to a, a+1, ..., b so that the words in the middle of a
        # range spanning more than two words are not dropped from the rule.
        # Anything that is not a well-formed increasing integer range falls
        # back to using the pieces of the ID as they are written.
        partIds = bounds
        if len(bounds) == 2 and bounds[0].isdecimal() and bounds[1].isdecimal() :
          first, last = int(bounds[0]), int(bounds[1])
          if first <= last :
            partIds = [str(i) for i in range(first, last + 1)]

        rule = "".join("@" + wordById[partId] for partId in partIds)

        counts = rules.setdefault(word[1], {})
        counts[rule] = counts.get(rule, 0) + 1
    
    def main() :
      """Extract split rules from a CoNLL-U file and print them on stdout.

      Command line: <script> file.conllu conllu.mcd. The second file is read
      with readMCD and must associate a column index with "ID" and another with
      "FORM". Sentences are runs of lines delimited by blank or '#' lines; each
      one is handed to computeRules. One line `prefix + form + rule` is then
      printed per form, with a warning on stderr when a form has several rules.
      """

      # Re-wrap file descriptor 1 so that stdout is written as UTF-8.
      sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)

      if len(sys.argv) != 3 :
        printUsageAndExit()

      conllMCD = readMCD(sys.argv[2])
      # Reverse the mapping to look up a column index from its name.
      conllMCDr = {v: k for k, v in conllMCD.items()}

      idId = int(conllMCDr["ID"])
      idForm = int(conllMCDr["FORM"])

      sentence = []

      with open(sys.argv[1], "r", encoding="utf8") as conlluFile :
        for line in conlluFile :
          # A blank or comment line closes the sentence being accumulated.
          if len(line.strip()) < 2 or line[0] == '#' :
            if sentence :
              computeRules(sentence)
            sentence = []
            continue

          splited = line.strip().split('\t')
          sentence.append([splited[idId], splited[idForm]])

      # The file may end without a trailing blank line: do not lose the
      # sentence that was still being accumulated.
      if sentence :
        computeRules(sentence)

      for word in rules :
        if len(rules[word]) > 1 :
          print("WARNING : Abiguity detected in \'%s\'"%(word+" "+str(rules[word])), file=sys.stderr)
        # NOTE(review): only the first rule seen is emitted, although counts
        # are available; confirm whether the most frequent one is wanted.
        for rule in rules[word] :
          print(prefix+word+rule)
          break
    
    # Run the conversion only when this file is executed as a script.
    if __name__ == "__main__" :
      main()