Commit 7ea2bf6d authored by Carlos Ramisch

Update accuracy to evaluate NER with P/R/F

parent d0769a46
#!/usr/bin/env python3
import sys
import argparse
from collections import defaultdict, Counter
import pdb
from conllulib import CoNLLUReader, Util
@@ -23,10 +24,10 @@ parser.add_argument('-g', "--gold", metavar="FILENAME.conllu", required=True,\
parser.add_argument('-t', "--train", metavar="FILENAME.conllu", required=False,\
dest="train_filename", type=argparse.FileType('r', encoding='UTF-8'), \
help="""Training corpus in CoNLL-U, from which tagger was learnt.""")
parser.add_argument('-c', "--tagcolumn", metavar="NAME", dest="name_tag",
required=False, type=str, default="upos", help="""Column name of tags, \
as defined in header. Use lowercase""")
parser.add_argument('-f', "--featcolumn", metavar="NAME", dest="name_feat",
required=False, type=str, default="form", help="""Column name of input
feature, as defined in header. Use lowercase.""")
parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter",
@@ -34,13 +35,20 @@ parser.add_argument('-u', "--upos-filter", metavar="NAME", dest="upos_filter",
help="""Only calculate accuracy for words with UPOS in this list. \ help="""Only calculate accuracy for words with UPOS in this list. \
Empty list = no filter.""") Empty list = no filter.""")
######################################################################## ################################################################################
def process_args(parser): def process_args(parser):
"""
Show (in debug mode) and process all command line options. Checks tag and feat
columns appear in corpora. Create training corpus vocabulary if option present
for OOV status check. Input is an instance of `argparse.ArgumentParser`,
returns list of `args`, `gold_corpus` and `pred_corpus` as `CoNLLUReader`,
`train_vocab` dictionary.
"""
args = parser.parse_args()
Util.DEBUG_FLAG = args.DEBUG_FLAG
args.name_tag = args.name_tag.lower()
args.name_feat = args.name_feat.lower()
Util.debug("Command-line arguments and defaults:")
for (k,v) in vars(args).items():
Util.debug(" * {}: {}",k,v)
@@ -49,37 +57,21 @@ def process_args(parser):
train_vocab = None
if args.train_filename:
train_corpus = CoNLLUReader(args.train_filename)
ignoreme, train_vocab = train_corpus.to_int_and_vocab({args.name_feat:[]})
if args.name_tag not in gold_corpus.header or \
args.name_feat not in gold_corpus.header:
Util.error("-c and -f names must be valid conllu column among:\n{}",
gold_corpus.header)
return args, gold_corpus, pred_corpus, train_vocab
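# Sketch of the data layout (my reading, not from the original file):
# `train_vocab[args.name_feat]` appears to map each form seen in the training
# corpus to an integer id; the evaluation loop below only consults its keys()
# to decide whether a gold token is out-of-vocabulary (OOV).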
################################################################################
def tp_count_feats(tok_pred, tok_gold, prf):
"""
Increment the number of true positives, trues and positives for morphological
feature evaluation. Compares all features of `tok_pred` with those of
`tok_gold`. The result is a modification of the `prf` dict; the function does
not return anything.
"""
pred_feats = tok_pred['feats'] if tok_pred['feats'] else {}
gold_feats = tok_gold['feats'] if tok_gold['feats'] else {}
for key in pred_feats.keys():
@@ -93,34 +85,113 @@ if __name__ == "__main__":
t_inc = int(gold_feats.get(key,None) != None)
prf[key]['t'] = prf[key]['t'] + t_inc
prf['micro-average']['t'] = prf['micro-average']['t'] + t_inc
################################################################################
def parseme_cat_in(ent, ent_list):
"""
Verify if `ent` is present in `ent_list` by comparing both span AND category.
Default cuptlib implementation ignores category
"""
for ent_cand in ent_list:
if ent.span == ent_cand.span and ent.cat == ent_cand.cat :
return True
return False
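# For illustration: given one gold entity with span (3,4) and category LOC, a
# predicted entity with the same span but category PERS is accepted by the
# default membership test (`e_pred in ents_gold.values()`) used for the
# 'Exact-nocat' counts below, but rejected by parseme_cat_in, which also
# compares categories.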
################################################################################
def tp_count_parseme(s_pred, s_gold, name_tag, prf):
try :
import parseme.cupt as cupt
except ImportError:
print("""Please install cuptlib before running this script\n\n git clone \
https://gitlab.com/parseme/cuptlib.git\n cd cuptlib\n pip install .""")
sys.exit(-1)
ents_pred = cupt.retrieve_mwes(s_pred, column_name=name_tag)
ents_gold = cupt.retrieve_mwes(s_gold, column_name=name_tag)
prf['Exact-nocat']['p'] += len(ents_pred)
prf['Exact-nocat']['t'] += len(ents_gold)
for e_pred in ents_pred.values() :
if e_pred in ents_gold.values() :
#pdb.set_trace()
prf['Exact-nocat']['tp'] += 1
if parseme_cat_in(e_pred, ents_gold.values()) :
prf['Exact-'+e_pred.cat]['tp'] += 1
prf['Exact-'+e_pred.cat]['p'] += 1
for e_pred in ents_gold.values() :
prf['Exact-'+e_pred.cat]['t'] += 1
# Token-based evaluation - categories always ignored here
span_pred = sum([list(ep.int_span()) for ep in ents_pred.values()], start=[])
span_gold = sum([list(eg.int_span()) for eg in ents_gold.values()], start=[])
prf['Token-nocat']['p'] += len(span_pred)
prf['Token-nocat']['t'] += len(span_gold)
for e_pred in span_pred :
if e_pred in span_gold :
prf['Token-nocat']['tp'] += 1
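# Worked example (hypothetical sentence): if gold contains one MWE over tokens
# {2,3} and the prediction one MWE over {2,3,4}, both with the same category,
# then Exact-nocat gets p+=1, t+=1, tp+=0 (the spans differ), while
# Token-nocat, assuming int_span() yields the individual token positions, gets
# p+=3, t+=2, tp+=2.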
################################################################################
def print_results(pred_corpus_name, args, acc, prf):
"""
Calculate and print accuracies, precision, recall, f-score, etc.
"""
print("Predictions file: {}".format(pred_corpus_name))
if args.upos_filter :
print("Results concern only some UPOS: {}".format(" ".join(args.upos_filter)))
accuracy = (acc['correct_tokens'] / acc['total_tokens']) * 100
print("Accuracy on all {}: {:0.2f} ({:5}/{:5})".format(args.name_tag, accuracy,
acc['correct_tokens'], acc['total_tokens']))
if args.train_filename :
accuracy_oov = (acc['correct_oov'] / acc['total_oov']) * 100
print("Accuracy on OOV {}: {:0.2f} ({:5}/{:5})".format(args.name_tag, accuracy_oov,
acc['correct_oov'], acc['total_oov']))
if prf:
print("\nPrecision, recall, and F-score for {}:".format(args.name_tag))
macro = {"precis":0.0, "recall":0.0}
for key in sorted(prf): # max prevents zero-division in P and R
precis = (prf[key]['tp'] / max(1, prf[key]['p'])) * 100
recall = (prf[key]['tp'] / max(1, prf[key]['t'])) * 100
fscore = ((2 * precis * recall) / max(1, precis + recall))
if key != 'micro-average':
macro['precis'] = macro['precis'] + precis
macro['recall'] = macro['recall'] + recall
else:
print()
templ = "{:13}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}"
print(templ.format(key, precis, prf[key]['tp'], prf[key]['p'], recall,
prf[key]['tp'], prf[key]['t'], fscore))
templ = "{:13}: P={:6.2f}" + " "*15 + "/ R={:6.2f}" + " "*15 + "/ F={:6.2f}"
if len(prf) > 1 : # Calculate macro-precision
nb_scores = len(prf)-1 if "micro-average" in prf else len(prf)
ma_precis = (macro['precis'] / (nb_scores))
ma_recall = (macro['recall'] / (nb_scores))
ma_fscore = ((2*ma_precis*ma_recall)/max(1,ma_precis+ma_recall))
print(templ.format("macro-average", ma_precis, ma_recall, ma_fscore))
################################################################################
if __name__ == "__main__":
args, gold_corpus, pred_corpus, train_vocab = process_args(parser)
prf = defaultdict(lambda:{'tp':0,'t':0, 'p':0}) # used for feats, NEs and MWEs
acc = Counter() # store correct and total for all and OOV
for (s_gold,s_pred) in zip(gold_corpus.readConllu(),pred_corpus.readConllu()):
if args.name_tag.startswith("parseme"):
tp_count_parseme(s_pred, s_gold, args.name_tag, prf)
for (tok_gold, tok_pred) in zip (s_gold, s_pred):
if not args.upos_filter or tok_gold['upos'] in args.upos_filter :
if train_vocab :
train_vocab_feat = train_vocab[args.name_feat].keys()
if tok_gold[args.name_feat] not in train_vocab_feat:
acc['total_oov'] += 1
oov = True
else:
oov = False
if tok_gold[args.name_tag] == tok_pred[args.name_tag]:
acc['correct_tokens'] += 1
if train_vocab and oov :
acc['correct_oov'] += 1
acc['total_tokens'] += 1
if args.name_tag == 'feats':
tp_count_feats(tok_pred, tok_gold, prf)
print_results(pred_corpus.name(), args, acc, prf)
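# Hypothetical invocation (sketch; the flag for the predictions file is defined
# outside this excerpt and its name, like the script name below, is only an
# assumption):
#   python3 this_script.py -g gold.conllu -p pred.conllu -c parseme:ne -t train.conllu
# With -c parseme:ne, tp_count_parseme adds MWE/NE P/R/F to the report; with
# -c feats, tp_count_feats adds per-feature P/R/F; token-level accuracy on the
# chosen column is always printed, with an OOV breakdown when -t is given.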
@@ -6,6 +6,7 @@ import collections
from torch.utils.data import TensorDataset, DataLoader
import torch
import random
import numpy as np
import pdb
########################################################################
@@ -15,6 +16,7 @@ import pdb
class Util(object):
DEBUG_FLAG = False
PSEUDO_INF = 9999.0
###############################
@@ -64,6 +66,23 @@ class Util(object):
random.seed(seed)
torch.manual_seed(seed)
###############################
@staticmethod
def log_cap(number):
"""Returns the base-10 logarithm of `number`.
If `number` is negative, stops the program with an error message.
If `number` is zero, returns -9999.0, representing negative pseudo-infinity.
This is more convenient than the -np.inf returned by np.log10 because
-inf + a == -inf (the sum is unchanged) whereas -9999.0 + a != -9999.0"""
if number < 0 :
Util.error("Cannot get logarithm of negative number {}".format(number))
elif number == 0:
return -Util.PSEUDO_INF
else :
return np.log10(number)
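# Illustrative values: Util.log_cap(1000) -> 3.0, Util.log_cap(0) -> -9999.0,
# and Util.log_cap(-1) aborts with an error. Capping at -PSEUDO_INF rather than
# -inf keeps sums of log-probabilities distinguishable: -9999.0 + (-2.0) is
# smaller than -9999.0, whereas any sum involving -inf collapses to -inf.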
########################################################################
# CONLLU FUNCTIONS
########################################################################
@@ -72,6 +91,8 @@ class CoNLLUReader(object):
###############################
start_tag = "<s>"
def __init__(self, infile):
self.infile = infile
DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\
@@ -129,14 +150,14 @@ class CoNLLUReader(object):
###############################
def to_int_from_vocab(self, col_names, unk_token, vocab={}):
int_list = {}
unk_toks = {}
for col_name in col_names:
int_list[col_name] = []
unk_toks[col_name] = vocab[col_name].get(unk_token,None)
for s in self.readConllu():
for col_name in col_names:
id_getter = lambda v,t: v[col_name].get(t[col_name],unk_toks[col_name])
int_list[col_name].append([id_getter(vocab,tok) for tok in s])
return int_list
@@ -144,11 +165,126 @@ class CoNLLUReader(object):
###############################
@staticmethod
def to_int_from_vocab_sent(sent, col_names, unk_token, vocab={},
lowercase=False):
int_list = {}
for col_name in col_names:
unk_tok_id = vocab[col_name].get(unk_token, None)
low_or_not = lambda w: w.lower() if lowercase else w
id_getter = lambda v,t: v[col_name].get(low_or_not(t[col_name]),unk_tok_id)
int_list[col_name]=[id_getter(vocab,tok) for tok in sent]
return int_list
###############################
@staticmethod
def to_bio(sent, bio_style='bio', name_tag='parseme:ne'):
"""Convert the parseme-style annotations in column `name_tag` of `sent` into a
list of BIO tags (or IO tags if `bio_style` is 'io'), one tag per token.
Assumes well-formed input, where entities start with "N:CAT" and continue with
"N"."""
bio_enc = []
neindex = 0
for tok in sent :
netag = tok[name_tag]
if netag == '*' :
cur_tag = 'O'
elif netag == neindex :
cur_tag = 'I' + necat
else :
neindex, necat = netag.split(":")
necat = '-' + necat
if bio_style == 'io' :
cur_tag = 'I' + necat
else:
cur_tag = 'B' + necat
bio_enc.append(cur_tag)
return bio_enc
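# Illustrative round trip, assuming a minimal sentence whose tokens are
# dict-like and expose the 'parseme:ne' column:
#   >>> sent = [{'parseme:ne': '1:PERS'}, {'parseme:ne': '1'}, {'parseme:ne': '*'}]
#   >>> CoNLLUReader.to_bio(sent)
#   ['B-PERS', 'I-PERS', 'O']
#   >>> CoNLLUReader.from_bio(['B-PERS', 'I-PERS', 'O'])
#   ['1:PERS', '1', '*']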
###############################
@staticmethod
def from_bio(bio_enc, bio_style='bio', stop_on_error=False):
"""Converst BIO-encoded annotations into Sequoia/parseme format.
Input `bio_enc` is a list of strings, each corresponding to one BIO tag.
`bio_style` can be "bio" (default) or "io". Will try to recover encoding
errors by replacing wrong tags when `stop_on_error` equals False (default),
otherwise stops execution and shows an error message.
Only works for BIO-cat & IO-cat, with -cat appended to both B and I tags.
Requires adaptations for BIOES, and for encoding schemes without "-cat".
Examples:
>>> from_bio(["B-PERS", "I-PERS", "I-PERS", "O", "B-LOC", "I-LOC"], bio_style='bio')
['1:PERS', '1', '1', '*', '2:LOC', '2']
>>> from_bio(["B-PERS", "I-PERS", "I-PERS", "O", "B-LOC", "I-LOC"],bio_style='io')
WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
['1:PERS', '1', '1', '*', '2:LOC', '2']
>>> from_bio(["I-PERS", "B-PERS", "I-PERS", "O", "I-LOC"],bio_style='io')
WARNING: Got B tag in spite of 'io' bio_style: interpreted as I
['1:PERS', '1', '1', '*', '2:LOC']
>>> from_bio(["I-PERS", "I-PERS", "I-PERS", "O", "I-LOC"], bio_style='bio')
WARNING: Invalid I-initial tag I-PERS converted to B
WARNING: Invalid I-initial tag I-LOC converted to B
['1:PERS', '1', '1', '*', '2:LOC']
>>> from_bio(["I-PERS", "B-PERS", "I-PERS", "O", "I-LOC"], bio_style='bio')
WARNING: Invalid I-initial tag I-PERS converted to B
WARNING: Invalid I-initial tag I-LOC converted to B
['1:PERS', '2:PERS', '2', '*', '3:LOC']
>>> from_bio(["I-PERS", "B-PERS", "I-EVE", "O", "I-PERS"], bio_style='io')
['1:PERS', '2:PERS', '3:EVE', '*', '4:PERS']
>>> from_bio(["I-PERS", "B-PERS", "I-EVE", "O", "I-PERS"], bio_style='bio')
WARNING: Invalid I-initial tag I-PERS converted to B
WARNING: Invalid I-initial tag I-EVE converted to B
WARNING: Invalid I-initial tag I-PERS converted to B
['1:PERS', '2:PERS', '3:EVE', '*', '4:PERS']
"""
# TODO: warning if I-cat != previous I-cat or B-cat
result = []
neindex = 0
prev_bio_tag = 'O'
prev_cat = None
for bio_tag in bio_enc :
if bio_tag == 'O' :
seq_tag = '*'
elif bio_tag[0] in ['B', 'I'] and bio_tag[1] == '-':
necat = bio_tag.split("-")[1]
if bio_tag[0] == 'B' and bio_style == 'bio':
neindex += 1 # Beginning of an entity
seq_tag = str(neindex) + ":" + necat
elif bio_tag[0] == 'B' : # bio_style = 'io'
if stop_on_error:
Util.error("B tag not allowed with 'io'")
else:
bio_tag = bio_tag.replace("B-", "I-")
Util.warn("Got B tag in spite of 'io' bio_style: interpreted as I")
if bio_tag[0] == "I" and bio_style == "io" :
if necat != prev_cat:
neindex += 1 # Beginning of an entity
seq_tag = str(neindex) + ":" + necat
else:
seq_tag = str(neindex) # is a continuation
elif bio_tag[0] == "I" : # tag is "I" and bio_style is "bio"
if bio_style == 'bio' and prev_bio_tag != 'O' and necat == prev_cat :
seq_tag = str(neindex) # is a continuation
elif stop_on_error :
Util.error("Invalid I-initial tag in BIO format: {}".format(bio_tag))
else:
neindex += 1 # Beginning of an entity
seq_tag = str(neindex) + ":" + necat
Util.warn("Invalid I-initial tag {} converted to B".format(bio_tag))
prev_cat = necat
else:
if stop_on_error:
Util.error("Invalid BIO tag: {}".format(bio_tag))
else:
Util.warn("Invalid BIO tag {} converted to O".format(bio_tag))
result.append("*")
result.append(seq_tag)
prev_bio_tag = bio_tag
return result
################################################################################
@@ -35,7 +35,13 @@ import sys
import conllu
import re
import pdb
import subprocess
try :
import parseme.cupt as cupt
except ImportError:
print("""Please install cuptlib before running this script\n\n git clone \
https://gitlab.com/parseme/cuptlib.git\n cd cuptlib\n pip install .""")
sys.exit(-1)
#########################################
...