Commit 6d6ecda4 authored by Carlos Ramisch

Update documentation of conllulib

parent 13cc0ebe
@@ -80,14 +80,14 @@ def tp_count_feats(tok_pred, tok_gold, prf):
   for key in pred_feats.keys():
     tp_inc = int(gold_feats.get(key,None) == pred_feats[key])
     prf[key]['tp'] = prf[key]['tp'] + tp_inc
-    prf['micro-average']['tp'] = prf['micro-average']['tp'] + tp_inc
+    prf['micro-avg']['tp'] = prf['micro-avg']['tp'] + tp_inc
     p_inc = int(pred_feats.get(key,None) != None)
     prf[key]['p'] = prf[key]['p'] + p_inc
-    prf['micro-average']['p'] = prf['micro-average']['p'] + p_inc
+    prf['micro-avg']['p'] = prf['micro-avg']['p'] + p_inc
   for key in gold_feats.keys():
     t_inc = int(gold_feats.get(key,None) != None)
     prf[key]['t'] = prf[key]['t'] + t_inc
-    prf['micro-average']['t'] = prf['micro-average']['t'] + t_inc
+    prf['micro-avg']['t'] = prf['micro-avg']['t'] + t_inc
 ################################################################################
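
For context, `prf` is a nested dictionary of counters: for each feature key, and for the pooled 'micro-avg' entry, 'tp' counts correct predictions, 'p' counts predictions made, and 't' counts gold occurrences. A minimal stand-alone sketch of that structure on toy morphological features (the defaultdict setup and the example values are illustrative, not taken from this repository):

from collections import defaultdict

# Nested counters: prf[key] -> {'tp': ..., 'p': ..., 't': ...}
prf = defaultdict(lambda: {'tp': 0, 'p': 0, 't': 0})

# Toy gold/predicted morphological features for one token
gold_feats = {"Number": "Sing", "Gender": "Masc"}
pred_feats = {"Number": "Sing", "Gender": "Fem"}

for key in pred_feats:                      # predictions: count 'p' and 'tp'
  hit = int(gold_feats.get(key) == pred_feats[key])
  for k in (key, 'micro-avg'):
    prf[k]['tp'] += hit
    prf[k]['p'] += 1
for key in gold_feats:                      # gold annotations: count 't'
  for k in (key, 'micro-avg'):
    prf[k]['t'] += 1

print(dict(prf['Gender']))     # {'tp': 0, 'p': 1, 't': 1}
print(dict(prf['micro-avg']))  # {'tp': 1, 'p': 2, 't': 2}
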
@@ -128,14 +128,14 @@ https://gitlab.com/parseme/cuptlib.git\n cd cuptlib\n pip install .""")
     prf['Exact-'+e_pred.cat]['p'] += 1
   for e_pred in ents_gold.values() :
     prf['Exact-'+e_pred.cat]['t'] += 1
-  # Token-based evaluation - categories always ignored here
+  # Fuzzy (token-based) evaluation - categories always ignored here
   span_pred = sum([list(ep.int_span()) for ep in ents_pred.values()], start=[])
   span_gold = sum([list(eg.int_span()) for eg in ents_gold.values()], start=[])
-  prf['Token-nocat']['p'] += len(span_pred)
-  prf['Token-nocat']['t'] += len(span_gold)
+  prf['Fuzzy-nocat']['p'] += len(span_pred)
+  prf['Fuzzy-nocat']['t'] += len(span_gold)
   for e_pred in span_pred :
     if e_pred in span_gold :
-      prf['Token-nocat']['tp'] += 1
+      prf['Fuzzy-nocat']['tp'] += 1
 ################################################################################
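
The distinction between the 'Exact' and 'Fuzzy' counters can be illustrated on a toy span pair; the integer token positions below are made up and do not come from cuptlib:

# Predicted entity covers tokens 3-5, gold entity covers tokens 4-5.
span_pred = [3, 4, 5]
span_gold = [4, 5]

# Exact matching: the spans must be identical to count as a true positive.
exact_tp = int(span_pred == span_gold)            # 0 -> no exact-match credit

# Fuzzy (token-based) matching: every shared token counts.
fuzzy = {'tp': sum(1 for tok in span_pred if tok in span_gold),
         'p': len(span_pred), 't': len(span_gold)}
print(exact_tp, fuzzy)  # 0 {'tp': 2, 'p': 3, 't': 2} -> P=2/3, R=2/2
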
@@ -172,21 +172,21 @@ def print_results(pred_corpus_name, args, acc, prf, parsing=False):
     precis = (prf[key]['tp'] / max(1, prf[key]['p'])) * 100
     recall = (prf[key]['tp'] / max(1, prf[key]['t'])) * 100
     fscore = ((2 * precis * recall) / max(1, precis + recall))
-    if key != 'micro-average':
+    if key != 'micro-avg':
       macro['precis'] = macro['precis'] + precis
       macro['recall'] = macro['recall'] + recall
     else:
       print()
-    templ = "{:13}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}"
+    templ = "{:11}: P={:6.2f} ({:5}/{:5}) / R={:6.2f} ({:5}/{:5}) / F={:6.2f}"
     print(templ.format(key, precis, prf[key]['tp'], prf[key]['p'], recall,
                        prf[key]['tp'], prf[key]['t'], fscore))
-  templ = "{:13}: P={:6.2f}" + " "*15 + "/ R={:6.2f}" + " "*15 + "/ F={:6.2f}"
+  templ = "{:11}: P={:6.2f}" + " "*15 + "/ R={:6.2f}" + " "*15 + "/ F={:6.2f}"
   if len(prf) > 1 : # Calculate macro-precision
-    nb_scores = len(prf)-1 if "micro-average" in prf else len(prf)
+    nb_scores = len(prf)-1 if "micro-avg" in prf else len(prf)
     ma_precis = (macro['precis'] / (nb_scores))
     ma_recall = (macro['recall'] / (nb_scores))
     ma_fscore = ((2*ma_precis*ma_recall)/max(1,ma_precis+ma_recall))
-    print(templ.format("macro-average", ma_precis, ma_recall, ma_fscore))
+    print(templ.format("macro-avg", ma_precis, ma_recall, ma_fscore))
 ################################################################################
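
A worked example of the formulas above, with made-up counts; the max(1, ...) guards simply avoid division by zero when a label is never predicted or never occurs in the gold data. Micro-averaging pools the tp/p/t counts across labels before applying the formulas, whereas macro-averaging (computed at the end of the function) averages the per-label precision and recall, which is why the 'micro-avg' entry is excluded from the macro sums:

tp, p, t = 8, 10, 16                      # illustrative counts for one label
precis = (tp / max(1, p)) * 100           # 80.00
recall = (tp / max(1, t)) * 100           # 50.00
fscore = (2 * precis * recall) / max(1, precis + recall)
print("P=%.2f R=%.2f F=%.2f" % (precis, recall, fscore))  # P=80.00 R=50.00 F=61.54
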
@@ -131,29 +131,48 @@ class CoNLLUReader(object):
   ###############################
   start_tag = "<s>"
   def __init__(self, infile):
+    """
+    Initialise a CoNLL-U reader object from an open `infile` handler (read mode,
+    UTF-8 encoding). Tries to automatically get the names of all columns from
+    first line "# global.columns" meta-data.
+    """
     self.infile = infile
-    DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\
-                     "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE"
-    try:
+    try: # guess the header (names of columns) from first line
       first = self.infile.readline().strip() # First line in the file
       globalcolumns = conllu.parse(first)[0].metadata['global.columns']
       self.header = globalcolumns.lower().split(" ")
       self.infile.seek(0) # Rewind open file
-    except KeyError:
-      self.header = DEFAULT_HEADER.split(" ")
+    except KeyError: # if first line absent (wrong format), try to set a default
+      DEFAULT_HEADER = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC " +\
+                       "PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE"
+      self.header = DEFAULT_HEADER.lower().split(" ")
   ###############################
   def readConllu(self):
+    """
+    Yields sentences as `TokenList` from open CoNLL-U file given to constructor
+    """
     for sent in conllu.parse_incr(self.infile):
       yield sent
   ###############################
   @staticmethod
   def readConlluStr(conllustring):
+    """
+    Yields sentences as `TokenList` from CoNLL-U text given as a string
+    """
     for sent in conllu.parse(conllustring):
       yield sent
   ###############################
   def name(self):
+    """
+    Returns the CoNLL-U filename
+    """
     return self.infile.name
   ###############################
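
A possible way to use the reader documented above; the filename and the `from conllulib import ...` line are assumptions about how the module is deployed, not something shown in this commit:

from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

with open("corpus.conllu", "r", encoding="utf-8") as infile:   # hypothetical file
  reader = CoNLLUReader(infile)
  print(reader.name(), reader.header)          # filename and lower-cased column names
  for sent in reader.readConllu():             # each sent is a conllu TokenList
    print([tok["form"] for tok in sent])
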
@@ -177,6 +196,25 @@ class CoNLLUReader(object):
   ###############################
   def to_int_and_vocab(self, col_name_dict, extra_cols_dict={}):
+    """
+    Transforms open `self.infile` into lists of integer indices and associated
+    vocabularies. Vocabularies are created on the fly, according to the file
+    contents. Parameter `col_name_dict` is a dictionary with column names to
+    encode as keys and, as values, a list of special tokens for each
+    column, for instance:
+    col_name_dict = {"form":["<PAD>", "<UNK>"], "upos":["<PAD>"]}
+    means that 2 columns will be encoded, "form" and "upos", with the
+    corresponding special symbols in their respective vocabularies. Parameter
+    `extra_cols_dict` is similar, but instead of a list of special tokens, the
+    value is a function to be applied to each column value, for instance:
+    extra_cols_dict = {"head":int}
+    means that column "head" will also be encoded, but with no vocabulary
+    associated. Instead, column values are directly encoded with function int.
+    Returns a tuple of 2 dicts, `int_list` and `vocab`, with the same keys as
+    those in `col_name_dict` and `extra_cols_dict`, and results as values
+    (lists of integers and vocabulary dicts, respectively).
+    Useful to encode **training** corpora.
+    """
     int_list = {};
     vocab = {}
     for col_name, special_tokens in col_name_dict.items():
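
Following the docstring just added, a small training-time sketch; the toy CoNLL-U string and the io.StringIO wrapper are illustrative, and the exact integer indices depend on the vocabulary-building order (not shown in this hunk), so the commented values are only what one would typically expect with special tokens inserted first:

import io
from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

train_text = """# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
1\tLe\tle\tDET\t_\t_\t2\tdet\t_\t_
2\tchat\tchat\tNOUN\t_\t_\t3\tnsubj\t_\t_
3\tdort\tdormir\tVERB\t_\t_\t0\troot\t_\t_

"""
reader = CoNLLUReader(io.StringIO(train_text))
col_name_dict = {"form": ["<PAD>", "<UNK>"], "upos": ["<PAD>"]}
int_list, vocab = reader.to_int_and_vocab(col_name_dict, extra_cols_dict={"head": int})
print(vocab["upos"])        # expected: {'<PAD>': 0, 'DET': 1, 'NOUN': 2, 'VERB': 3}
print(int_list["form"][0])  # expected: [2, 3, 4] (one list of indices per sentence)
print(int_list["head"][0])  # expected: [2, 3, 0] via the int() conversion
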
@@ -201,6 +239,16 @@ class CoNLLUReader(object):
   ###############################
   def to_int_from_vocab(self, col_names, unk_token, vocab={}, extra_cols_dict={}):
+    """
+    Transforms open `self.infile` into lists of integer indices according to
+    provided `vocab` dictionaries (different from `to_int_and_vocab`, where
+    vocabs are also built). Values not found in `vocab` will be replaced by
+    `vocab[unk_token]`. Parameters `col_names` and `extra_cols_dict` play the
+    same roles as `col_name_dict` and `extra_cols_dict` in `to_int_and_vocab`,
+    see above. Returns a dict, `int_list`, with one key per column name in
+    `col_names` and `extra_cols_dict`, and results as values (lists of integers).
+    Useful to encode **test/dev** corpora.
+    """
     int_list = {}
     unk_toks = {}
     for col_name in col_names:
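
And a matching dev/test-time sketch, reusing a training vocabulary; the toy vocabulary below stands in for the `vocab` returned by `to_int_and_vocab`, and unseen forms are expected to map to the `<UNK>` index:

import io
from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

dev_text = """# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
1\tUn\tun\tDET\t_\t_\t2\tdet\t_\t_
2\tchien\tchien\tNOUN\t_\t_\t3\tnsubj\t_\t_
3\tdort\tdormir\tVERB\t_\t_\t0\troot\t_\t_

"""
train_vocab = {"form": {"<PAD>": 0, "<UNK>": 1, "Le": 2, "chat": 3, "dort": 4}}
dev_reader = CoNLLUReader(io.StringIO(dev_text))
dev_ints = dev_reader.to_int_from_vocab(["form"], unk_token="<UNK>", vocab=train_vocab)
print(dev_ints["form"][0])  # expected: [1, 1, 4] - "Un" and "chien" were never seen
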
@@ -221,6 +269,11 @@ class CoNLLUReader(object):
   @staticmethod
   def to_int_from_vocab_sent(sent, col_names, unk_token, vocab={},
                              lowercase=False):
+    """
+    Similar to `to_int_from_vocab` above, but applies to a single `sent`
+    represented as a `TokenList`. Extra possibility to `lowercase` sentence
+    elements before looking them up in `vocab`.
+    """
     int_list = {}
     for col_name in col_names:
       unk_tok_id = vocab[col_name].get(unk_token, None)
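
The per-sentence variant can be sketched the same way on a single `TokenList`; the sentence string, vocabulary, and expected output below are illustrative:

from conllulib import CoNLLUReader  # assumption: conllulib.py is on the Python path

one_sent = "1\tLE\tle\tDET\t_\t_\t2\tdet\t_\t_\n2\tchat\tchat\tNOUN\t_\t_\t0\troot\t_\t_\n\n"
sent = next(CoNLLUReader.readConlluStr(one_sent))
ids = CoNLLUReader.to_int_from_vocab_sent(sent, ["form"], "<UNK>",
                                          vocab={"form": {"<UNK>": 0, "le": 1, "chat": 2}},
                                          lowercase=True)
print(ids["form"])  # expected: [1, 2] - "LE" is lower-cased to "le" before lookup
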
@@ -239,6 +292,13 @@ class CoNLLUReader(object):
     belonging to the same NE get the same int + first gets ":category" suffix).
     The output has category appended to 'B' and 'I' tags. The `bio_style` can
     be 'bio' or 'io', the latter has only 'I-category' tags, no 'B's.
+    Example:
+    >>> test=\"\"\"# global.columns = ID FORM parseme:ne\n1\tLe\t1:PROD\n2\tPetit\t1\n3\tPrince\t1\n4\tde\t*\n5\tSaint-Exupéry\t2:PERS\n6\test\t*\n7\tentré\t*\n8\tà\t*\n9\tl'\t*\n10\tÉcole\t3:ORG\n11\tJules-Romains\t3\"\"\"
+    >>> for sent in CoNLLUReader.readConlluStr(test):
+    ...   print(CoNLLUReader.to_bio(sent))
+    ['B-PROD', 'I-PROD', 'I-PROD', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG']
     """
     bio_enc = []
     neindex = 0
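
For comparison with the docstring example, the 'io' style is described above as keeping only 'I-category' tags; the small sketch below infers what the same sentence would look like in that style from the documentation alone (the way `bio_style` is selected is not visible in this hunk):

# BIO tags as in the docstring example, and their IO-style counterpart
bio_tags = ['B-PROD', 'I-PROD', 'I-PROD', 'O', 'B-PERS',
            'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG']
io_tags = ['I' + t[1:] if t != 'O' else t for t in bio_tags]  # drop the B/I distinction
print(io_tags)  # ['I-PROD', 'I-PROD', 'I-PROD', 'O', 'I-PERS', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG']
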