#! /usr/bin/env python3
"""Append a column of lexicon-derived possible POS tags to conllu files.

Take as input the lefff lexicon and/or conllu files.
For each word form, compute the set of possible POS.
Then add a column encoding these possible POS to the output conllu files,
which are rewritten in place.

Provenance: scripts/addLefff2Conllu.py, introduced by commit
29b7b6c4674d0140d6e6bca3f8be910f5464c951 (Franck Dary, 2022-05-20),
"Added script to append column containing lexicon pos information into
conllu file".
"""

import sys
import argparse

# List of UD POS tags : https://universaldependencies.org/u/pos/index.html
allPos = ["adj", "adp", "adv", "aux", "cconj", "det", "intj", "noun", "num", "part", "pron", "propn", "punct", "sconj", "sym", "verb", "x"]

# Convert lefff part of speech into UD UPOS.
lefffPOS2UD = {
    "adj" : "adj",
    "csu" : "sconj",
    "que" : "sconj", # Not only ?
    "det" : "det",
    "pres" : "intj", # Nothing matches ? INTJ or X
    "v" : "verb",
    "nc" : "noun",
    "cfi" : "noun",
    "advPref" : "x", # No meaning with UD tokenization
    "adjPref" : "x", # same
    "suffAdj" : "x", # same
    "cln" : "pron",
    "ce" : "pron",
    "clg" : "adp",
    "cll" : "pron",
    "ilimp" : "pron",
    "cla" : "pron",
    "cld" : "pron",
    "pro" : "pron",
    "caimp" : "pron",
    "pri" : "adv",
    "prel" : "pron",
    "clr" : "pron",
    "clar" : "pron",
    "cldr" : "pron",
    "adv" : "adv",
    "advm" : "adv",
    "advp" : "adv",
    "coo" : "cconj",
    "ponctw" : "punct",
    "advneg" : "adv",
    "clneg" : "adv",
    "que_restr" : "sconj",
    "np" : "propn",
    "poncts" : "punct",
    "parento" : "punct",
    "epsilon" : "punct",
    "parentf" : "punct",
    "prep" : "adp",
    "auxAvoir" : "aux",
    "auxEtre" : "aux",
}

# Column layout assumed until a "# global.columns = ..." line says otherwise.
baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
# Marker of the comment line that declares the column layout of a file.
columnsMarker = "global.columns ="


def normalizeForm(form):
    """Return the lookup key of a word form.

    The form is lowercased. In lefff there might be spaces in forms, and the
    w2v format doesn't allow them, so spaces are replaced by a dotted circle.
    """
    return form.lower().replace(" ", "◌")


def isColumnsHeader(line):
    """Tell whether the (stripped) line is a '# global.columns = ...' comment."""
    return line.startswith("#") and columnsMarker in line


def checkPos(pos):
    """Report on stderr a POS that is not listed in allPos (non fatal)."""
    if pos not in allPos:
        print("ERROR: Unknown pos '%s' (check allPos in the script)"%pos, file=sys.stderr)


def readLefff(filename, form2pos):
    """Populate form2pos from a lefff file, each (form, pos) pair counting 1.

    filename : tab separated lexicon with columns FORM POS LEMMA MORPHO.
    form2pos : dict form -> dict pos -> number of occurrences, updated in place.

    Blank lines are ignored. Lines with fewer than two columns, or with a
    lefff tag missing from lefffPOS2UD, are reported on stderr and skipped.
    """
    with open(filename, "r", encoding="utf-8") as lefffFile:
        for lineNumber, line in enumerate(lefffFile, start=1):
            line = line.strip()
            if not line:
                continue
            columns = line.split("\t")
            if len(columns) < 2:
                print("ERROR: Malformed lefff line %d in '%s' (expected FORM<tab>POS...)"%(lineNumber, filename), file=sys.stderr)
                continue
            lefffPos = columns[1]
            if lefffPos not in lefffPOS2UD:
                print("ERROR: Unknown lefff pos '%s' at line %d in '%s' (check lefffPOS2UD in the script)"%(lefffPos, lineNumber, filename), file=sys.stderr)
                continue
            pos = lefffPOS2UD[lefffPos]
            checkPos(pos)
            # A pos coming from the lexicon is recorded once, never incremented.
            form2pos.setdefault(normalizeForm(columns[0]), {}).setdefault(pos, 1)


def countConlluLines(lines, form2pos, readMCD):
    """Count into form2pos the (form, pos) occurrences found in conllu lines.

    lines    : iterable of raw lines of one conllu file.
    form2pos : dict form -> dict pos -> number of occurrences, updated in place.
    readMCD  : function turning a column description string into the pair
               (column name -> index, index -> column name).

    Multi-word tokens (ID containing '-') are skipped.
    """
    name2index, _ = readMCD(baseMCD)
    for line in lines:
        line = line.strip()
        if isColumnsHeader(line):
            name2index, _ = readMCD(line.split(columnsMarker)[-1].strip())
            continue
        if not line or line[0] == "#":
            continue
        fields = line.split("\t")
        if "-" in fields[name2index["ID"]]:
            continue
        form = normalizeForm(fields[name2index["FORM"]])
        pos = fields[name2index["UPOS"]].lower()
        checkPos(pos)
        posCounts = form2pos.setdefault(form, {})
        posCounts[pos] = posCounts.get(pos, 0) + 1


def posSetStrings(form2pos):
    """Reshape form2pos into form -> pos set as string (ex. adj|verb)."""
    return {form: "|".join(posCounts) for form, posCounts in form2pos.items()}


def columnsWith(colName, mcd, readMCD):
    """Read a column description and make sure colName is one of its columns.

    Return the pair (column name -> index, index -> column name). When
    colName is missing it is appended as the last column.
    """
    name2index, index2name = readMCD(mcd)
    if colName not in name2index:
        name2index[colName] = len(name2index)
        index2name[len(index2name)] = colName
    return name2index, index2name


def formatColumnsHeader(index2name):
    """Build the '# global.columns = ...' line for the given column layout."""
    return "# global.columns = %s"%(" ".join([index2name[i] for i in range(len(index2name))]))


def addLexiconColumn(lines, form2posStr, colName, readMCD):
    """Return the lines of a conllu file with the lexicon column filled in.

    lines       : iterable of raw lines of the conllu file.
    form2posStr : dict form -> pos set as string (ex. adj|verb).
    colName     : name of the column to add (or to replace if it exists).
    readMCD     : function turning a column description string into the pair
                  (column name -> index, index -> column name).

    Forms absent from form2posStr get the value "none". Comment and empty
    lines are kept. Each '# global.columns' line is rewritten where it
    stands, so that lines preceding it are not lost. When token lines occur
    before any such line, a header for the default layout is put first.
    """
    name2index, index2name = columnsWith(colName, baseMCD, readMCD)
    defaultHeader = formatColumnsHeader(index2name)
    needDefaultHeader = True
    sawToken = False
    newLines = []
    for line in lines:
        line = line.strip()
        if isColumnsHeader(line):
            name2index, index2name = columnsWith(colName, line.split(columnsMarker)[-1].strip(), readMCD)
            newLines.append(formatColumnsHeader(index2name))
            if not sawToken:
                # Every token line is covered by a header found in the file.
                needDefaultHeader = False
            continue
        if not line or line[0] == "#":
            newLines.append(line)
            continue
        sawToken = True
        fields = line.split("\t")
        colIndex = name2index[colName]
        missing = colIndex + 1 - len(fields)
        if missing > 0:
            # Short row: pad with the conllu placeholder up to the new column.
            fields.extend(["_"] * missing)
        fields[colIndex] = form2posStr.get(normalizeForm(fields[name2index["FORM"]]), "none")
        newLines.append("\t".join(fields))
    if needDefaultHeader:
        newLines.insert(0, defaultHeader)
    return newLines


def main():
    """Parse the command line, collect possible POS and rewrite the outputs."""
    # Imported here so that the tables and helpers above can be imported
    # without the project-local readMCD module being on the path.
    from readMCD import readMCD

    parser = argparse.ArgumentParser()
    parser.add_argument("--lefff", type=str,
        help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
    parser.add_argument("--conllu", nargs="+", type=str,
        help="Input conllu files, to find possible POS for each word.")
    parser.add_argument("--output", nargs="+", type=str,
        help="Output conllu files. Must be existing conllu files, this script adds a new column in place.")
    parser.add_argument("--colName", type=str, default="LEXICON",
        help="Name of the column that will be added by the script. If the column already exists, it will be replaced.")

    args = parser.parse_args()

    if args.lefff is None and args.conllu is None:
        print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
        sys.exit(1)
    if args.output is None:
        print("ERROR: must provide --output", file=sys.stderr)
        sys.exit(1)

    # Dict with key=FORM and value=dict associating pos with number of occurrences
    form2pos = {}

    # Read lefff and populate form2pos with number of occurrences = 1
    if args.lefff is not None:
        readLefff(args.lefff, form2pos)

    # If conllu files are provided, count number of occurrences into form2pos
    if args.conllu is not None:
        for filename in args.conllu:
            with open(filename, "r", encoding="utf-8") as conlluFile:
                countConlluLines(conlluFile, form2pos, readMCD)

    form2posStr = posSetStrings(form2pos)

    # Read all output conllu files and rewrite them in place with the new column
    for filename in args.output:
        with open(filename, "r", encoding="utf-8") as conlluFile:
            newLines = addLexiconColumn(conlluFile, form2posStr, args.colName, readMCD)
        with open(filename, "w", encoding="utf-8") as conlluFile:
            conlluFile.write("\n".join(newLines) + "\n")


if __name__ == "__main__":
    main()