Skip to content
Snippets Groups Projects
Commit 825360ef authored by Franck Dary's avatar Franck Dary
Browse files

Added script to add new column to conllu

parent 29b7b6c4
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
# Add a new column to a CoNLL-U file.
# The values of the new column are read from a tab-separated dataframe file.
# Rows of the two files are aligned by using other columns as a key.
import argparse
import sys
from readMCD import readMCD
def transformFloat(f):
    """Normalize a float-like value to a two-decimal string.

    If ``f`` can be converted by ``float()`` (typically a number encoded
    as a str), return it formatted with ``"%.2f"``. Otherwise return
    ``f`` unchanged.
    """
    try:
        value = float(f)
    except (TypeError, ValueError, OverflowError):
        # Not a number: leave the value untouched. Only conversion errors
        # are caught, so KeyboardInterrupt / SystemExit still propagate.
        return f
    return "%.2f" % value
if __name__ == "__main__":
    # Script entry point: read a tab-separated dataframe, build a
    # key -> value table from it, then append the matching value (or "_")
    # as a new last column to every token line of the conllu file.
    # The result is written to stdout, diagnostics to stderr.
    parser = argparse.ArgumentParser()
    parser.add_argument("dataframe",
        help="File containing the new column.")
    parser.add_argument("conllu",
        help="File containing the data.")
    parser.add_argument("targetColumn", type=int,
        help="Index of the dataframe column that will be added to the conllu.")
    parser.add_argument("--dataframeKey", nargs="+",
        help="List of columns numbers from the dataframe to act as a key to align data.")
    parser.add_argument("--conlluKey", nargs="+",
        help="List of columns numbers from the conllu file to act as a key to align data.")
    parser.add_argument("--colname", default="NEW",
        help="Name of the new column.")
    args = parser.parse_args()

    # Both options are None when omitted, so test truthiness: calling
    # len() on None would raise TypeError instead of printing the error.
    if not args.dataframeKey or not args.conlluKey:
        print("ERROR : missing keys", file=sys.stderr)
        sys.exit(1)

    # Key column indices arrive as strings; convert them once up front.
    dataframeKey = [int(i) for i in args.dataframeKey]
    conlluKey = [int(i) for i in args.conlluKey]

    # Format key : value
    newValues = {}

    # Read dataframe
    with open(args.dataframe, "r") as dataframeFile:
        for line in dataframeFile:
            if line.endswith("\n"):
                line = line[:-1]
            splited = line.split("\t")
            targetValue = splited[args.targetColumn]
            # Key parts are float-normalized and lowercased so that both
            # files produce the same key for equivalent cells.
            key = "_key_".join([transformFloat(splited[i]) for i in dataframeKey]).lower()
            # On duplicated keys, the last row of the dataframe wins.
            newValues[key] = targetValue

    # Read conllu
    foundKeys = 0
    missingKeys = 0
    output = []
    # Column layout used unless the file declares its own global.columns.
    baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    # NOTE(review): readMCD presumably returns (name -> index, index -> name)
    # mappings; confirm against the readMCD module.
    conllMCD, conllMCDr = readMCD(baseMCD)
    with open(args.conllu, "r") as conlluFile:
        for line in conlluFile:
            line = line.strip()
            if "global.columns =" in line and line[0] == "#":
                # The header is not copied to the output: an updated one,
                # including the new column, is printed at the end.
                splited = line.split("global.columns =")
                conllMCD, conllMCDr = readMCD(splited[-1].strip())
                continue
            if len(line) == 0 or line[0] == "#":
                # Blank lines and other comments are passed through as is.
                output.append(line)
                continue
            splited = line.split("\t")
            key = "_key_".join([transformFloat(splited[i]) for i in conlluKey]).lower()
            if key in newValues:
                foundKeys += 1
                splited.append(newValues[key])
            else:
                missingKeys += 1
                splited.append("_")
                print("Missing key : '%s'"%key, file=sys.stderr)
            output.append("\t".join(splited))

    # Guard against a conllu file without any token line, which would
    # otherwise raise ZeroDivisionError.
    totalKeys = missingKeys + foundKeys
    missingProportion = 100*missingKeys/totalKeys if totalKeys else 0.0
    print("Proportion of missing keys = %.2f%%"%(missingProportion), file=sys.stderr)

    columns = [conllMCDr[i] for i in range(len(conllMCD))]
    columns.append(args.colname)
    print("# global.columns = %s"%(" ".join(columns)))
    print("\n".join(output))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment