#! /usr/bin/env python3

# Add a new column to a conllu file
# From a dataframe
# Data is aligned using other columns as a key

import argparse
import sys

# Separator placed between the individual column values that make up a key.
KEY_SEPARATOR = "_key_"

# Column layout assumed until a "# global.columns =" comment says otherwise.
BASE_MCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"


def transformFloat(f):
    """Return *f* formatted as "%.2f" if it parses as a float, else *f* unchanged.

    Used to normalise numeric key parts so that e.g. "1", "1.0" and "1.000"
    all produce the same key text ("1.00").
    """
    try:
        return "%.2f" % float(f)
    except (TypeError, ValueError, OverflowError):
        # Not a number: keep the value exactly as it was given.
        return f


def _buildKey(fields, keyColumns):
    """Join the selected *fields* into a single lower-cased alignment key."""
    parts = [transformFloat(fields[i]) for i in keyColumns]
    return KEY_SEPARATOR.join(parts).lower()


def _readDataframe(path, targetColumn, keyColumns):
    """Read the tab-separated file at *path* into a {key: target value} dict.

    Each line is split on tabs; the key is built from *keyColumns* and the
    value is the content of *targetColumn*. When several lines produce the
    same key, the last one silently wins.
    """
    newValues = {}
    with open(path, "r") as dataframe:
        for line in dataframe:
            # Only drop the line terminator: other whitespace may be data.
            fields = line.rstrip("\n").split("\t")
            newValues[_buildKey(fields, keyColumns)] = fields[targetColumn]
    return newValues


def _annotateConllu(path, keyColumns, newValues):
    """Append the matching value from *newValues* to every token line of *path*.

    Returns a tuple (columns, output, foundKeys, missingKeys) where:
      - columns is the list of column names of the input file,
      - output is the list of output lines (comments and blank lines are kept
        as they are, token lines get one extra tab-separated field),
      - foundKeys / missingKeys count the token lines whose key was / was not
        present in *newValues*.

    Token lines without a match receive "_" and are reported on stderr.
    "# global.columns =" comments are consumed (not copied to the output)
    because the caller prints an updated one.
    """
    # Project-local helper, imported where it is needed so that the rest of
    # this module can be imported without it.
    # NOTE(review): the code below only relies on readMCD returning a pair
    # whose second element is indexable by column position.
    from readMCD import readMCD

    conllMCD, conllMCDr = readMCD(BASE_MCD)
    output = []
    foundKeys = 0
    missingKeys = 0

    with open(path, "r") as conllu:
        for line in conllu:
            line = line.strip()

            if line.startswith("#") and "global.columns =" in line:
                declared = line.split("global.columns =")[-1].strip()
                conllMCD, conllMCDr = readMCD(declared)
                continue

            if not line or line.startswith("#"):
                output.append(line)
                continue

            fields = line.split("\t")
            key = _buildKey(fields, keyColumns)
            if key in newValues:
                foundKeys += 1
                fields.append(newValues[key])
            else:
                missingKeys += 1
                fields.append("_")
                print("Missing key : '%s'" % key, file=sys.stderr)

            output.append("\t".join(fields))

    columns = [conllMCDr[i] for i in range(len(conllMCD))]
    return columns, output, foundKeys, missingKeys


def main(argv=None):
    """Command-line entry point.

    Parses *argv* (defaults to sys.argv[1:]), writes the augmented conllu to
    stdout and diagnostics to stderr. Exits with status 1 when either key
    list is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("dataframe",
        help="File containing the new column.")
    parser.add_argument("conllu",
        help="File containing the data.")
    parser.add_argument("targetColumn", type=int,
        help="Index of the dataframe column that will be added to the conllu.")
    parser.add_argument("--dataframeKey", nargs="+", type=int,
        help="List of columns numbers from the dataframe to act as a key to align data.")
    parser.add_argument("--conlluKey", nargs="+", type=int,
        help="List of columns numbers from the conllu file to act as a key to align data.")
    parser.add_argument("--colname", default="NEW",
        help="Name of the new column.")

    args = parser.parse_args(argv)

    # An option that was not given is None (not an empty list), so test
    # truthiness rather than len().
    if not args.dataframeKey or not args.conlluKey:
        print("ERROR : missing keys", file=sys.stderr)
        sys.exit(1)

    newValues = _readDataframe(args.dataframe, args.targetColumn, args.dataframeKey)
    columns, output, foundKeys, missingKeys = _annotateConllu(
        args.conllu, args.conlluKey, newValues)

    # A conllu file without any token line has nothing to miss.
    totalKeys = missingKeys + foundKeys
    missingProportion = 100*missingKeys/totalKeys if totalKeys else 0.0
    print("Proportion of missing keys = %.2f%%" % (missingProportion), file=sys.stderr)

    columns.append(args.colname)
    print("# global.columns = %s" % (" ".join(columns)))
    print("\n".join(output))


if __name__ == "__main__":
    main()